enable k8s huge page feature

- Record k8s labels for nodes in puppet. - Enable the k8s huge page feature for worker nodes without the openstack compute label, and disable it otherwise. - Automatically default huge pages for worker nodes with the openstack compute label; the changes are applied on unlock. - Do not allocate any huge pages by default for worker nodes when no openstack compute label is assigned and vswitch_type is none. - When the vswitch type changes, trigger an update of the grub memory config. Story: 2004763 Task: 28880 Change-Id: I7636eeb4773fa3fe32671a6bb2870c2e1074a5fa Signed-off-by: Sun Austin <austin.sun@intel.com>
2019-03-18 10:03:23 +08:00 · 2019-03-18 10:03:23 +08:00 · 9c5bf5771e
commit 9c5bf5771e
parent 560b2b6500
7 changed files with 95 additions and 30 deletions
--- a/puppet-manifests/src/modules/platform/manifests/compute.pp
+++ b/puppet-manifests/src/modules/platform/manifests/compute.pp
@ -52,7 +52,7 @@ class platform::compute::grub::params (
    $eptad = ''
  }

-  if $::is_gb_page_supported {
+  if $::is_gb_page_supported and $::platform::params::vswitch_type != 'none' {
    if $g_hugepages != undef {
      $gb_hugepages = $g_hugepages
    } else {
--- a/puppet-manifests/src/modules/platform/manifests/kubernetes.pp
+++ b/puppet-manifests/src/modules/platform/manifests/kubernetes.pp
@ -91,8 +91,11 @@ class platform::kubernetes::cgroup
 }

 class platform::kubernetes::kubeadm {
-  include ::platform::docker::params

+  include ::platform::docker::params
+  include ::platform::kubernetes::params
+
+  $host_labels = $::platform::kubernetes::params::host_labels
  $iptables_file = "net.bridge.bridge-nf-call-ip6tables = 1
    net.bridge.bridge-nf-call-iptables = 1"

@ -102,6 +105,21 @@ class platform::kubernetes::kubeadm {
    $k8s_registry = undef
  }

+  # Only set k8s_hugepage to true when the subfunction is worker and openstack-compute-node is not in host_labels
+  if str2bool($::is_worker_subfunction)
+    and !('openstack-compute-node'
+          in $host_labels) {
+    $k8s_hugepage = true
+  } else {
+    $k8s_hugepage = false
+  }
+
+  # enable extra parameters such as hugepage
+  file { '/etc/sysconfig/kubelet':
+    ensure  => file,
+    content => template('platform/kubelet.conf.erb'),
+  }
+
  # Update iptables config. This is required based on:
  # https://kubernetes.io/docs/tasks/tools/install-kubeadm
  # This probably belongs somewhere else - initscripts package?
--- a/puppet-manifests/src/modules/platform/templates/kubelet.conf.erb
+++ b/puppet-manifests/src/modules/platform/templates/kubelet.conf.erb
@ -0,0 +1,3 @@
+# Overrides config file for kubelet
+KUBELET_EXTRA_ARGS=--feature-gates=HugePages=<%= @k8s_hugepage %>
+
--- a/sysinv/sysinv/sysinv/sysinv/agent/node.py
+++ b/sysinv/sysinv/sysinv/sysinv/agent/node.py
@ -23,9 +23,6 @@ import tsconfig.tsconfig as tsc

 LOG = logging.getLogger(__name__)

-# Defines per-socket vswitch memory requirements (in MB)
-VSWITCH_MEMORY_MB = 1024
-
 # Defines the size of one kilobyte
 SIZE_KB = 1024

@ -386,15 +383,16 @@ class NodeOperator(object):

                    vs_hp_nr, vs_hp_size = self._get_vswitch_reserved_memory(
                        node)
-                    if vs_hp_nr == 0 or vs_hp_size == 0:
-                        vs_hp_nr = VSWITCH_MEMORY_MB // size
+                    if vs_hp_size == 0:
                        vs_hp_size = size

                    # Libvirt hugepages can be 1G and 2M
                    if size == SIZE_1G_MB:
                        hp_attr = {}
                        if vs_hp_size == size:
-                            nr_hugepages -= vs_hp_nr
+                            # Only adjust if huge pages have been allocated
+                            if nr_hugepages != 0:
+                                nr_hugepages -= vs_hp_nr
                            hp_attr.update({
                                'vswitch_hugepages_size_mib': vs_hp_size,
                                'vswitch_hugepages_nr': vs_hp_nr,
@ -410,15 +408,19 @@ class NodeOperator(object):
                            # No 1G hugepage support.
                            hp_attr = {
                                'vm_hugepages_use_1G': 'False',
+                                'vm_hugepages_nr_1G': 0,
                                'vswitch_hugepages_size_mib': vs_hp_size,
                                'vswitch_hugepages_nr': vs_hp_nr,
                                'vswitch_hugepages_avail': 0
                            }
+                            if nr_hugepages != 0:
+                                nr_hugepages -= vs_hp_nr
                        else:
                            hp_attr = {}
                            if vs_hp_size == size and initial_report is False:
                                # User manually set 2M pages
-                                nr_hugepages -= vs_hp_nr
+                                if nr_hugepages != 0:
+                                    nr_hugepages -= vs_hp_nr
                                hp_attr.update({
                                    'vswitch_hugepages_size_mib': vs_hp_size,
                                    'vswitch_hugepages_nr': vs_hp_nr,
@ -546,18 +548,6 @@ class NodeOperator(object):
                'vm_hugepages_possible_1G': max_vm_pages_1gb,
            })

-            # calculate 90% 2M pages if it is initial report and the huge
-            # pages have not been allocated
-            if initial_report:
-                max_vm_pages_2mb = max_vm_pages_2mb * 0.9
-                total_hp_mb += int(max_vm_pages_2mb * (SIZE_2M_KB / SIZE_KB))
-                free_hp_mb = total_hp_mb
-                attr.update({
-                    'vm_hugepages_nr_2M': max_vm_pages_2mb,
-                    'vm_hugepages_avail_2M': max_vm_pages_2mb,
-                    'vm_hugepages_nr_1G': 0
-                })
-
            attr.update({
                'numa_node': node,
                'memtotal_mib': total_hp_mb,
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py
@ -89,6 +89,7 @@ from sysinv.common import ceph
 from sysinv.common import constants
 from sysinv.common import exception
 from sysinv.common import utils as cutils
+from sysinv.helm import common as helm_common
 from sysinv.openstack.common import log
 from sysinv.openstack.common import uuidutils
 from sysinv.openstack.common.gettextutils import _
@ -3377,6 +3378,26 @@ class HostController(rest.RestController):
                    "addresses while in SDN mode.")
            raise wsme.exc.ClientSideError(msg)

+    @staticmethod
+    def _semantic_check_vswitch_type_attributes(ihost):
+        """
+        Perform a semantic check of the host's openstack-compute-node label
+        prior to unlocking it: if a vswitch type other than none is enabled,
+        the label must be assigned, since 2M hugepages need to be allocated.
+        """
+        vswitch_type = utils.get_vswitch_type()
+        if vswitch_type == constants.VSWITCH_TYPE_NONE:
+            return
+
+        # Check whether compute_label has been assigned
+        if utils.is_openstack_compute(ihost) is not True:
+            raise wsme.exc.ClientSideError(
+                _("Can not unlock worker host %s without "
+                  " %s label if config %s. Action: assign "
+                  "%s label for this host prior to unlock."
+                  % (ihost['hostname'], helm_common.LABEL_COMPUTE_LABEL,
+                    vswitch_type, helm_common.LABEL_COMPUTE_LABEL)))
+
    @staticmethod
    def _semantic_check_data_vrs_attributes(ihost):
        """
@ -3637,16 +3658,27 @@ class HostController(rest.RestController):
                    pecan.request.dbapi.imemory_update(m.uuid, values)

    @staticmethod
-    def _update_vm_4k_pages(ihost):
+    def _update_huge_pages(ihost):
        """
-        Update VM 4K huge pages.
+        Update the host huge pages.
        """
        ihost_inodes = pecan.request.dbapi.inode_get_by_ihost(ihost['uuid'])

+        labels = pecan.request.dbapi.label_get_by_host(ihost['uuid'])
+        vswitch_type = utils.get_vswitch_type()
        for node in ihost_inodes:
            mems = pecan.request.dbapi.imemory_get_by_inode(node['id'])
            for m in mems:
                if m.hugepages_configured:
+                    value = {}
+                    vs_hugepages_nr = m.vswitch_hugepages_nr
+                    # allocate the default vswitch huge pages if required
+                    if vswitch_type != constants.VSWITCH_TYPE_NONE and \
+                       vs_hugepages_nr == 0:
+                        vs_hugepages_nr = constants.VSWITCH_MEMORY_MB \
+                                      // m.vswitch_hugepages_size_mib
+                        value.update({'vswitch_hugepages_nr': vs_hugepages_nr})
+
                    vm_hugepages_nr_2M = m.vm_hugepages_nr_2M_pending \
                        if m.vm_hugepages_nr_2M_pending is not None \
                        else m.vm_hugepages_nr_2M
@ -3654,10 +3686,18 @@ class HostController(rest.RestController):
                        if m.vm_hugepages_nr_1G_pending is not None \
                        else m.vm_hugepages_nr_1G

+                    # calculate 90% 2M pages if the huge pages have not been
+                    # allocated and the compute label is set
+                    if cutils.has_openstack_compute(labels) and \
+                                    vm_hugepages_nr_2M == 0 and \
+                                    vm_hugepages_nr_1G == 0:
+                        vm_hugepages_nr_2M = m.vm_hugepages_possible_2M * 0.9
+                        value.update({'vm_hugepages_nr_2M': vm_hugepages_nr_2M})
+
                    vm_hugepages_4K = \
                        (m.node_memtotal_mib - m.platform_reserved_mib)
                    vm_hugepages_4K -= \
-                        (m.vswitch_hugepages_nr * m.vswitch_hugepages_size_mib)
+                        (vs_hugepages_nr * m.vswitch_hugepages_size_mib)
                    vm_hugepages_4K -= \
                        (constants.MIB_2M * vm_hugepages_nr_2M)
                    vm_hugepages_4K -=  \
@ -3670,10 +3710,9 @@ class HostController(rest.RestController):
                    if vm_hugepages_4K < min_4K:
                        vm_hugepages_4K = 0

-                    value = {'vm_hugepages_nr_4K': vm_hugepages_4K}
-                    LOG.info("Set VM 4K pages for host (%s) node (%d) pages "
-                             "(%d)" % (ihost['hostname'], node['id'],
-                                       vm_hugepages_4K))
+                    value.update({'vm_hugepages_nr_4K': vm_hugepages_4K})
+                    LOG.info("Updating mem values of host(%s) node(%d): %s" %
+                             (ihost['hostname'], node['id'], str(value)))
                    pecan.request.dbapi.imemory_update(m.uuid, value)

    @staticmethod
@ -5204,6 +5243,7 @@ class HostController(rest.RestController):
            self._semantic_check_data_interfaces(ihost,
                                                 kubernetes_config,
                                                 force_unlock)
+            self._semantic_check_vswitch_type_attributes(ihost)
        else:
            # sdn configuration check
            self._semantic_check_sdn_attributes(ihost)
@ -5265,8 +5305,8 @@ class HostController(rest.RestController):
        if align_2M_memory or align_1G_memory:
            self._align_pending_memory(ihost, align_2M_memory, align_1G_memory)

-        # calculate the VM 4K huge pages for nova
-        self._update_vm_4k_pages(ihost)
+        # update ihost huge pages allocation
+        self._update_huge_pages(ihost)

        if cutils.is_virtual() or cutils.is_virtual_worker(ihost):
            mib_platform_reserved_no_io = mib_reserved
--- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/utils.py
+++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/utils.py
@ -33,6 +33,7 @@ from oslo_config import cfg
 from sysinv.common import constants
 from sysinv.common import exception
 from sysinv.common.utils import memoized
+from sysinv.helm import common as helm_common
 from sysinv.openstack.common.gettextutils import _
 from sysinv.openstack.common import log

@ -255,6 +256,16 @@ def get_vswitch_type():
    return system.capabilities.get('vswitch_type')


+def is_openstack_compute(ihost):
+    for obj in pecan.request.dbapi.label_get_by_host(ihost['uuid']):
+        try:
+            if helm_common.LABEL_COMPUTE_LABEL == obj.label_key:
+                return True
+        except AttributeError:
+            pass
+    return False
+
+
 def get_https_enabled():
    system = pecan.request.dbapi.isystem_get_one()
    return system.capabilities.get('https_enabled', False)
--- a/sysinv/sysinv/sysinv/sysinv/common/constants.py
+++ b/sysinv/sysinv/sysinv/sysinv/common/constants.py
@ -208,6 +208,9 @@ MIB_1G = 1024
 Ki = 1024
 NUM_4K_PER_MiB = 256

+# Defines per-socket vswitch memory requirements (in MB)
+VSWITCH_MEMORY_MB = 1024
+
 # Dynamic IO Resident Set Size(RSS) in MiB per socket
 DISK_IO_RESIDENT_SET_SIZE_MIB = 2000
 DISK_IO_RESIDENT_SET_SIZE_MIB_VBOX = 500