Merge "kubernetes: make isolcpus allocation SMT-aware"
This commit is contained in:
commit
8161dc9764
@ -0,0 +1,240 @@
|
||||
From 6bf9795d5e0dfc705299381dc902b22d03ded063 Mon Sep 17 00:00:00 2001
|
||||
From: Tao Wang <tao.wang@windriver.com>
|
||||
Date: Tue, 25 Jan 2022 19:23:43 -0500
|
||||
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
|
||||
|
||||
Enhance isolcpus support in Kubernetes to allocate isolated SMT
|
||||
siblings to the same container when SMT/HT is enabled on the host.
|
||||
|
||||
As it stands, the device manager code in Kubernetes is not SMT-aware
|
||||
(since normally it doesn't deal with CPUs). However, StarlingX
|
||||
exposes isolated CPUs as devices and if possible we want to allocate
|
||||
all SMT siblings from a CPU core to the same container in order to
|
||||
minimize cross- container interference due to resource contention
|
||||
within the CPU core.
|
||||
|
||||
The solution is basically to take the list of isolated CPUs and
|
||||
re-order it so that the SMT siblings are next to each other. That
|
||||
way the existing resource selection code will allocate the siblings
|
||||
together. As an optimization, if it is known that an odd number
|
||||
of isolated CPUs are desired, a singleton SMT sibling will be
|
||||
inserted into the list to avoid breaking up sibling pairs.
|
||||
|
||||
Signed-off-by: Tao Wang <tao.wang@windriver.com>
|
||||
---
|
||||
pkg/kubelet/cm/devicemanager/manager.go | 153 ++++++++++++++++++++++--
|
||||
1 file changed, 146 insertions(+), 7 deletions(-)
|
||||
|
||||
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
|
||||
index 65b91393..76a0ea6e 100644
|
||||
--- a/pkg/kubelet/cm/devicemanager/manager.go
|
||||
+++ b/pkg/kubelet/cm/devicemanager/manager.go
|
||||
@@ -19,10 +19,13 @@ package devicemanager
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
+ "io/ioutil"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
+ "strconv"
|
||||
+ "strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -41,6 +44,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
|
||||
cputopology "k8s.io/kubernetes/pkg/kubelet/cm/cpumanager/topology"
|
||||
+ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager/bitmask"
|
||||
@@ -635,6 +639,80 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
|
||||
m.allocatedDevices = m.podDevices.devices()
|
||||
}
|
||||
|
||||
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
|
||||
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
|
||||
+//contain as many hyperthread sibling pairs as possible.
|
||||
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
|
||||
+ var dev_lst []string
|
||||
+ var single_lst []string
|
||||
+ sibling_lst := make([]string, 0, int(devices.Len()))
|
||||
+ _iterated_cpu := make(map[string]string)
|
||||
+
|
||||
+ get_sibling := func(cpu string, cpu_lst []string) string {
|
||||
+ if cpu_lst[0] == cpu {
|
||||
+ return cpu_lst[1]
|
||||
+ } else {
|
||||
+ return cpu_lst[0]
|
||||
+ }
|
||||
+ }
|
||||
+ for cpu_id := range devices {
|
||||
+ // If we've already found cpu_id as a sibling, skip it.
|
||||
+ if _, ok := _iterated_cpu[cpu_id]; ok {
|
||||
+ continue
|
||||
+ }
|
||||
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
|
||||
+ dat, err := ioutil.ReadFile(devPath)
|
||||
+ if err != nil {
|
||||
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
|
||||
+ }
|
||||
+ cpustring := strings.TrimSuffix(string(dat), "\n")
|
||||
+ cpu_pair_set, err := cpuset.Parse(cpustring)
|
||||
+ if err != nil {
|
||||
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
|
||||
+ }
|
||||
+ var cpu_pair_lst []string
|
||||
+ for _, v := range cpu_pair_set.ToSlice() {
|
||||
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
|
||||
+ }
|
||||
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
|
||||
+
|
||||
+ if _, ok := devices[sibling_cpu_id]; ok {
|
||||
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
|
||||
+ _iterated_cpu[sibling_cpu_id] = ""
|
||||
+ } else {
|
||||
+ single_lst = append(single_lst, cpu_id)
|
||||
+ }
|
||||
+ _iterated_cpu[cpu_id] = ""
|
||||
+ }
|
||||
+
|
||||
+ if needed%2 == 0 {
|
||||
+ dev_lst = append(sibling_lst, single_lst...)
|
||||
+ } else {
|
||||
+ if len(single_lst) > 1 {
|
||||
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
|
||||
+ dev_lst = append(single_lst[0:1], _tmp_list...)
|
||||
+ } else {
|
||||
+ if len(single_lst) == 0 {
|
||||
+ dev_lst = sibling_lst
|
||||
+ } else {
|
||||
+ dev_lst = append(single_lst, sibling_lst...)
|
||||
+ }
|
||||
+
|
||||
+ }
|
||||
+ }
|
||||
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
|
||||
+ return dev_lst, nil
|
||||
+}
|
||||
+
|
||||
+func smt_enabled() bool {
|
||||
+ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
|
||||
+ state := strings.TrimSuffix(string(dat), "\n")
|
||||
+ if state == "0" {
|
||||
+ return false
|
||||
+ }
|
||||
+ return true
|
||||
+}
|
||||
+
|
||||
// Returns list of device Ids we need to allocate with Allocate rpc call.
|
||||
// Returns empty list in case we don't need to issue the Allocate rpc call.
|
||||
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
|
||||
@@ -664,13 +742,29 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
||||
}
|
||||
devices = sets.NewString()
|
||||
// Allocates from reusableDevices list first.
|
||||
- for device := range reusableDevices {
|
||||
- devices.Insert(device)
|
||||
- needed--
|
||||
- if needed == 0 {
|
||||
- return devices, nil
|
||||
+ if resource == "windriver.com/isolcpus" && smt_enabled() {
|
||||
+ _reusableDevices, err := order_devices_by_sibling(reusableDevices, needed)
|
||||
+ if err != nil {
|
||||
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
||||
+ }
|
||||
+ // _reusableDevices is type of slice,So we need a unique loop to process here.
|
||||
+ for _, device := range _reusableDevices {
|
||||
+ devices.Insert(device)
|
||||
+ needed--
|
||||
+ if needed == 0 {
|
||||
+ return devices, nil
|
||||
+ }
|
||||
+ }
|
||||
+ } else {
|
||||
+ for device := range reusableDevices {
|
||||
+ devices.Insert(device)
|
||||
+ needed--
|
||||
+ if needed == 0 {
|
||||
+ return devices, nil
|
||||
+ }
|
||||
}
|
||||
}
|
||||
+
|
||||
// Needs to allocate additional devices.
|
||||
if m.allocatedDevices[resource] == nil {
|
||||
m.allocatedDevices[resource] = sets.NewString()
|
||||
@@ -682,13 +776,25 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
||||
if available.Len() < needed {
|
||||
return nil, fmt.Errorf("requested number of devices unavailable for %s. Requested: %d, Available: %d", resource, needed, available.Len())
|
||||
}
|
||||
- // By default, pull devices from the unsorted list of available devices.
|
||||
- allocated := available.UnsortedList()[:needed]
|
||||
+
|
||||
// If topology alignment is desired, update allocated to the set of devices
|
||||
// with the best alignment.
|
||||
+ var allocated []string
|
||||
hint := m.topologyAffinityStore.GetAffinity(podUID, contName)
|
||||
if m.deviceHasTopologyAlignment(resource) && hint.NUMANodeAffinity != nil {
|
||||
allocated = m.takeByTopology(resource, available, hint.NUMANodeAffinity, needed)
|
||||
+ } else {
|
||||
+ if resource == "windriver.com/isolcpus" && smt_enabled() {
|
||||
+ var err error
|
||||
+ allocated, err = order_devices_by_sibling(available, needed)
|
||||
+ allocated = allocated[:needed]
|
||||
+ if err != nil {
|
||||
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
||||
+ }
|
||||
+ } else {
|
||||
+ // By default, pull devices from the unsorted list of available devices.
|
||||
+ allocated = available.UnsortedList()[:needed]
|
||||
+ }
|
||||
}
|
||||
// Updates m.allocatedDevices with allocated devices to prevent them
|
||||
// from being allocated to other pods/containers, given that we are
|
||||
@@ -764,6 +870,39 @@ func (m *ManagerImpl) takeByTopology(resource string, available sets.String, aff
|
||||
}
|
||||
}
|
||||
|
||||
+ //Add specific logic to process isolcpus resource.
|
||||
+ //Try to not sabotage the original logical structure.
|
||||
+ //Sort the original three lists by sibling: fromAffinity,notFromAffinity,withoutTopology
|
||||
+ if resource == "windriver.com/isolcpus" && smt_enabled() {
|
||||
+ var err error
|
||||
+ _request_device_map := make(sets.String)
|
||||
+ for _, dev := range fromAffinity {
|
||||
+ _request_device_map[dev] = sets.Empty{}
|
||||
+ }
|
||||
+ fromAffinity, err = order_devices_by_sibling(_request_device_map, request)
|
||||
+ if err != nil {
|
||||
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
||||
+ }
|
||||
+
|
||||
+ _request_device_map = make(sets.String)
|
||||
+ for _, dev := range notFromAffinity {
|
||||
+ _request_device_map[dev] = sets.Empty{}
|
||||
+ }
|
||||
+ notFromAffinity, err = order_devices_by_sibling(_request_device_map, request)
|
||||
+ if err != nil {
|
||||
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
||||
+ }
|
||||
+
|
||||
+ _request_device_map = make(sets.String)
|
||||
+ for _, dev := range withoutTopology {
|
||||
+ _request_device_map[dev] = sets.Empty{}
|
||||
+ }
|
||||
+ withoutTopology, err = order_devices_by_sibling(_request_device_map, request)
|
||||
+ if err != nil {
|
||||
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
// Concatenate the lists above return the first 'request' devices from it..
|
||||
return append(append(fromAffinity, notFromAffinity...), withoutTopology...)[:request]
|
||||
}
|
||||
--
|
||||
2.22.5
|
||||
|
@ -62,6 +62,7 @@ Patch7: kubelet-cpumanager-introduce-concept-of-isolated-CPU.patch
|
||||
Patch8: Fix-exclusive-CPU-allocations-being-deleted-at-conta.patch
|
||||
Patch9: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
|
||||
Patch10: add-option-to-disable-isolcpu-awareness.patch
|
||||
Patch11: kubernetes-make-isolcpus-allocation-SMT-aware.patch
|
||||
|
||||
# It obsoletes cadvisor but needs its source code (literally integrated)
|
||||
Obsoletes: cadvisor
|
||||
@ -858,6 +859,7 @@ install in production.
|
||||
%patch8 -p1
|
||||
%patch9 -p1
|
||||
%patch10 -p1
|
||||
%patch11 -p1
|
||||
|
||||
#src/k8s.io/kubernetes/pkg/util/certificates
|
||||
# Patch the code to remove eliptic.P224 support
|
||||
|
@ -0,0 +1,151 @@
|
||||
From 95b7b6e1ddb25511c67a3d4018f62df1e76ee7bc Mon Sep 17 00:00:00 2001
|
||||
From: Tao Wang <tao.wang@windriver.com>
|
||||
Date: Tue, 25 Jan 2022 19:25:45 -0500
|
||||
Subject: [PATCH] kubernetes: make isolcpus allocation SMT-aware
|
||||
|
||||
Enhance isolcpus support in Kubernetes to allocate isolated SMT
|
||||
siblings to the same container when SMT/HT is enabled on the host.
|
||||
|
||||
As it stands, the device manager code in Kubernetes is not SMT-aware
|
||||
(since normally it doesn't deal with CPUs). However, StarlingX
|
||||
exposes isolated CPUs as devices and if possible we want to allocate
|
||||
all SMT siblings from a CPU core to the same container in order to
|
||||
minimize cross- container interference due to resource contention
|
||||
within the CPU core.
|
||||
|
||||
The solution is basically to take the list of isolated CPUs and
|
||||
re-order it so that the SMT siblings are next to each other. That
|
||||
way the existing resource selection code will allocate the siblings
|
||||
together. As an optimization, if it is known that an odd number
|
||||
of isolated CPUs are desired, a singleton SMT sibling will be
|
||||
inserted into the list to avoid breaking up sibling pairs.
|
||||
|
||||
Signed-off-by: Tao Wang <tao.wang@windriver.com>
|
||||
---
|
||||
pkg/kubelet/cm/devicemanager/manager.go | 84 ++++++++++++++++++++++++-
|
||||
1 file changed, 83 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/pkg/kubelet/cm/devicemanager/manager.go b/pkg/kubelet/cm/devicemanager/manager.go
|
||||
index 60de14a9..609da8ed 100644
|
||||
--- a/pkg/kubelet/cm/devicemanager/manager.go
|
||||
+++ b/pkg/kubelet/cm/devicemanager/manager.go
|
||||
@@ -19,11 +19,14 @@ package devicemanager
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
+ "io/ioutil"
|
||||
"net"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"sort"
|
||||
+ "strconv"
|
||||
+ "strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
@@ -41,6 +44,7 @@ import (
|
||||
"k8s.io/kubernetes/pkg/features"
|
||||
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/checkpointmanager/errors"
|
||||
+ "k8s.io/kubernetes/pkg/kubelet/cm/cpuset"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/devicemanager/checkpoint"
|
||||
"k8s.io/kubernetes/pkg/kubelet/cm/topologymanager"
|
||||
"k8s.io/kubernetes/pkg/kubelet/config"
|
||||
@@ -667,6 +671,75 @@ func (m *ManagerImpl) UpdateAllocatedDevices() {
|
||||
m.allocatedDevices = m.podDevices.devices()
|
||||
}
|
||||
|
||||
+//Given a list of isolated CPUs in 'devices', and the number of desired CPUs in 'needed',
|
||||
+//return an ordered list of isolated CPUs such that the first 'needed' CPUs in the list
|
||||
+//contain as many hyperthread sibling pairs as possible.
|
||||
+func order_devices_by_sibling(devices sets.String, needed int) ([]string, error) {
|
||||
+ var dev_lst []string
|
||||
+ var single_lst []string
|
||||
+ sibling_lst := make([]string, 0, int(devices.Len()))
|
||||
+ _iterated_cpu := make(map[string]string)
|
||||
+ get_sibling := func(cpu string, cpu_lst []string) string {
|
||||
+ if cpu_lst[0] == cpu {
|
||||
+ return cpu_lst[1]
|
||||
+ } else {
|
||||
+ return cpu_lst[0]
|
||||
+ }
|
||||
+ }
|
||||
+ for cpu_id := range devices {
|
||||
+ // If we've already found cpu_id as a sibling, skip it.
|
||||
+ if _, ok := _iterated_cpu[cpu_id]; ok {
|
||||
+ continue
|
||||
+ }
|
||||
+ devPath := fmt.Sprintf("/sys/devices/system/cpu/cpu%s/topology/thread_siblings_list", cpu_id)
|
||||
+ dat, err := ioutil.ReadFile(devPath)
|
||||
+ if err != nil {
|
||||
+ return dev_lst, fmt.Errorf("Can't read cpu[%s] thread_siblings_list", cpu_id)
|
||||
+ }
|
||||
+ cpustring := strings.TrimSuffix(string(dat), "\n")
|
||||
+ cpu_pair_set, err := cpuset.Parse(cpustring)
|
||||
+ if err != nil {
|
||||
+ return dev_lst, fmt.Errorf("Unable to parse thread_siblings_list[%s] string to cpuset", cpustring)
|
||||
+ }
|
||||
+ var cpu_pair_lst []string
|
||||
+ for _, v := range cpu_pair_set.ToSlice() {
|
||||
+ cpu_pair_lst = append(cpu_pair_lst, strconv.Itoa(v))
|
||||
+ }
|
||||
+ sibling_cpu_id := get_sibling(cpu_id, cpu_pair_lst)
|
||||
+ if _, ok := devices[sibling_cpu_id]; ok {
|
||||
+ sibling_lst = append(sibling_lst, cpu_id, sibling_cpu_id)
|
||||
+ _iterated_cpu[sibling_cpu_id] = ""
|
||||
+ } else {
|
||||
+ single_lst = append(single_lst, cpu_id)
|
||||
+ }
|
||||
+ _iterated_cpu[cpu_id] = ""
|
||||
+ }
|
||||
+ if needed%2 == 0 {
|
||||
+ dev_lst = append(sibling_lst, single_lst...)
|
||||
+ } else {
|
||||
+ if len(single_lst) > 1 {
|
||||
+ _tmp_list := append(sibling_lst, single_lst[1:]...)
|
||||
+ dev_lst = append(single_lst[0:1], _tmp_list...)
|
||||
+ } else {
|
||||
+ if len(single_lst) == 0 {
|
||||
+ dev_lst = sibling_lst
|
||||
+ } else {
|
||||
+ dev_lst = append(single_lst, sibling_lst...)
|
||||
+ }
|
||||
+ }
|
||||
+ }
|
||||
+ //klog.Infof("needed=%d ordered_cpu_list=%v", needed, dev_lst)
|
||||
+ return dev_lst, nil
|
||||
+}
|
||||
+func smt_enabled() bool {
|
||||
+ dat, _ := ioutil.ReadFile("/sys/devices/system/cpu/smt/active")
|
||||
+ state := strings.TrimSuffix(string(dat), "\n")
|
||||
+ if state == "0" {
|
||||
+ return false
|
||||
+ }
|
||||
+ return true
|
||||
+}
|
||||
+
|
||||
// Returns list of device Ids we need to allocate with Allocate rpc call.
|
||||
// Returns empty list in case we don't need to issue the Allocate rpc call.
|
||||
func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, required int, reusableDevices sets.String) (sets.String, error) {
|
||||
@@ -702,7 +775,16 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
|
||||
// Create a closure to help with device allocation
|
||||
// Returns 'true' once no more devices need to be allocated.
|
||||
allocateRemainingFrom := func(devices sets.String) bool {
|
||||
- for device := range devices.Difference(allocated) {
|
||||
+ availableDevices := devices.Difference(allocated).List()
|
||||
+ // If we're dealing with isolcpus and SMT is enabled, reorder to group SMT siblings together.
|
||||
+ if resource == "windriver.com/isolcpus" && len(devices) > 0 && smt_enabled() {
|
||||
+ var err error
|
||||
+ availableDevices, err = order_devices_by_sibling(devices.Difference(allocated), needed)
|
||||
+ if err != nil {
|
||||
+ klog.Errorf("error in order_devices_by_sibling: %v", err)
|
||||
+ }
|
||||
+ }
|
||||
+ for _, device := range availableDevices {
|
||||
m.allocatedDevices[resource].Insert(device)
|
||||
allocated.Insert(device)
|
||||
needed--
|
||||
--
|
||||
2.22.5
|
||||
|
@ -61,6 +61,7 @@ Patch5: kubeadm-create-platform-pods-with-zero-CPU-resources.patch
|
||||
Patch6: enable-support-for-kubernetes-to-ignore-isolcpus.patch
|
||||
Patch7: Revert-use-subpath-for-coredns-only-for-default-repo.patch
|
||||
Patch8: Change-log-level-to-Debug.patch
|
||||
Patch9: kubernetes-make-isolcpus-allocation-SMT-aware.patch
|
||||
|
||||
# It obsoletes cadvisor but needs its source code (literally integrated)
|
||||
Obsoletes: cadvisor
|
||||
@ -856,6 +857,7 @@ install in production.
|
||||
%patch6 -p1
|
||||
%patch7 -p1
|
||||
%patch8 -p1
|
||||
%patch9 -p1
|
||||
|
||||
#src/k8s.io/kubernetes/pkg/util/certificates
|
||||
# Patch the code to remove eliptic.P224 support
|
||||
|
Loading…
x
Reference in New Issue
Block a user