class platform::kubernetes::params (
  $enabled = true,
  # K8S version we are upgrading to (None if not in an upgrade)
  $upgrade_to_version = undef,
  # K8S version running on a host
  $version = undef,
  $node_ip = undef,
  $service_domain = undef,
  $dns_service_ip = undef,
  $host_labels = [],
  $k8s_cpuset = undef,
  $k8s_nodeset = undef,
  $k8s_reserved_cpus = undef,
  $k8s_reserved_mem = undef,
  $k8s_isol_cpus = undef,
  $k8s_cpu_mgr_policy = 'none',
  $k8s_topology_mgr_policy = 'best-effort',
  $k8s_cni_bin_dir = '/usr/libexec/cni',
  $join_cmd = undef,
  $oidc_issuer_url = undef,
  $oidc_client_id = undef,
  $oidc_username_claim = undef,
  $oidc_groups_claim = undef
) { }

class platform::kubernetes::cgroup::params (
  $cgroup_root = '/sys/fs/cgroup',
  $cgroup_name = 'k8s-infra',
  $controllers = ['cpuset', 'cpu', 'cpuacct', 'memory', 'systemd', 'pids'],
) {}

class platform::kubernetes::cgroup
  inherits ::platform::kubernetes::cgroup::params {
  include ::platform::kubernetes::params

  # Default to float across all cpus and numa nodes when no platform
  # configuration is provided.
  if $::platform::kubernetes::params::k8s_cpuset {
    $k8s_cpuset = $::platform::kubernetes::params::k8s_cpuset
  } else {
    $k8s_cpuset = generate('/bin/cat', '/sys/devices/system/cpu/online')
    notice("System default cpuset ${k8s_cpuset}.")
  }
  if $::platform::kubernetes::params::k8s_nodeset {
    $k8s_nodeset = $::platform::kubernetes::params::k8s_nodeset
  } else {
    $k8s_nodeset = generate('/bin/cat', '/sys/devices/system/node/online')
    notice("System default nodeset ${k8s_nodeset}.")
  }

  # Create the kubelet cgroup for the minimal set of required controllers.
  # NOTE: The kubernetes cgroup_manager_linux func Exists() checks that
  # specific subsystem cgroup paths actually exist on the system. The
  # particular cgroupRoot must exist for the following controllers:
  # "cpu", "cpuacct", "cpuset", "memory", "systemd", "pids".
  # Reference:
  # https://github.com/kubernetes/kubernetes/blob/master/pkg/kubelet/cm/cgroup_manager_linux.go
  # systemd automatically mounts cgroups and controllers, so we don't need
  # to do that here.
  notice("Create ${cgroup_root}/${controllers}/${cgroup_name}")
  $controllers.each |String $controller| {
    $cgroup_dir = "${cgroup_root}/${controller}/${cgroup_name}"
    file { $cgroup_dir :
      ensure => directory,
      owner  => 'root',
      group  => 'root',
      mode   => '0700',
    }

    # Modify k8s cpuset resources to reflect platform configured cores.
    # NOTE: Using 'exec' here instead of a 'file' resource with 'content'
    # to update contents under /sys, since puppet tries to create files
    # with temp names in the same directory, and the kernel only allows
    # specific filenames to be created in these particular directories.
    # This causes puppet to fail if we use 'content'.
    # NOTE: A child cgroup's cpuset must be a subset of its parent's. If
    # the child directories already exist and the parent's cpuset is
    # changed to a subset of what the children have, the write fails with
    # "-bash: echo: write error: device or resource busy".
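    # Illustrative example (hypothetical values; the real ones come from the
    # platform configuration): with numa nodes 0-1 and k8s cpus 2-47, the
    # execs below reduce to:
    #   /bin/echo 0-1  > /sys/fs/cgroup/cpuset/k8s-infra/cpuset.mems
    #   /bin/echo 2-47 > /sys/fs/cgroup/cpuset/k8s-infra/cpuset.cpus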
    if $controller == 'cpuset' {
      $cgroup_mems = "${cgroup_dir}/cpuset.mems"
      $cgroup_cpus = "${cgroup_dir}/cpuset.cpus"
      $cgroup_tasks = "${cgroup_dir}/tasks"
      notice("Set ${cgroup_name} nodeset: ${k8s_nodeset}, cpuset: ${k8s_cpuset}")
      File[ $cgroup_dir ]
      -> exec { "Create ${cgroup_mems}" :
        command => "/bin/echo ${k8s_nodeset} > ${cgroup_mems} || :",
      }
      -> exec { "Create ${cgroup_cpus}" :
        command => "/bin/echo ${k8s_cpuset} > ${cgroup_cpus} || :",
      }
      -> file { $cgroup_tasks :
        ensure => file,
        owner  => 'root',
        group  => 'root',
        mode   => '0644',
      }
    }
  }
}

class platform::kubernetes::kubeadm {

  include ::platform::docker::params
  include ::platform::kubernetes::params
  include ::platform::params

  $node_ip = $::platform::kubernetes::params::node_ip
  $host_labels = $::platform::kubernetes::params::host_labels
  $k8s_reserved_cpus = $::platform::kubernetes::params::k8s_reserved_cpus
  $k8s_reserved_mem = $::platform::kubernetes::params::k8s_reserved_mem
  $k8s_isol_cpus = $::platform::kubernetes::params::k8s_isol_cpus
  $k8s_cni_bin_dir = $::platform::kubernetes::params::k8s_cni_bin_dir
  $k8s_cpu_mgr_policy = $::platform::kubernetes::params::k8s_cpu_mgr_policy
  $k8s_topology_mgr_policy = $::platform::kubernetes::params::k8s_topology_mgr_policy

  $iptables_file = "net.bridge.bridge-nf-call-ip6tables = 1
net.bridge.bridge-nf-call-iptables = 1"

  # Configure kubelet cpumanager options
  if ($::personality == 'controller' and
      $::platform::params::distributed_cloud_role == 'systemcontroller') {
    $k8s_cpu_manager_opts = '--cpu-manager-policy=none'
  } else {
    if str2bool($::is_worker_subfunction) and !('openstack-compute-node' in $host_labels) {
      $opts = join(['--feature-gates TopologyManager=true',
                    "--cpu-manager-policy=${k8s_cpu_mgr_policy}",
                    "--topology-manager-policy=${k8s_topology_mgr_policy}",
                    '--system-reserved-cgroup=/system.slice'], ' ')

      $opts_sys_res = join(['--system-reserved=',
                            "cpu=${k8s_reserved_cpus},",
                            "memory=${k8s_reserved_mem}Mi"])

      $opts_kube_res = join(['--kube-reserved=',
                             "cpu=${k8s_isol_cpus}"])

      if $k8s_cpu_mgr_policy == 'none' {
        $k8s_cpu_manager_opts = join([$opts, $opts_sys_res], ' ')
      } else {
        $k8s_cpu_manager_opts = join([$opts, $opts_sys_res, $opts_kube_res], ' ')
      }
    } else {
      $k8s_cpu_manager_opts = '--cpu-manager-policy=none'
    }
  }

  # Enable kubelet extra parameters that are node specific, such as
  # cpumanager
  file { '/etc/sysconfig/kubelet':
    ensure  => file,
    content => template('platform/kubelet.conf.erb'),
  }
  # The cpu_manager_state file is regenerated when cpumanager starts or
  # changes allocations, so it is safe to remove before kubelet starts.
  # This file persists, so cpumanager's DefaultCPUSet becomes inconsistent
  # when we offline/online CPUs or change the number of reserved cpus.
  -> exec { 'remove cpu_manager_state':
    command => 'rm -f /var/lib/kubelet/cpu_manager_state || true',
  }

  # Update iptables config. This is required based on:
  # https://kubernetes.io/docs/tasks/tools/install-kubeadm
  # This probably belongs somewhere else - initscripts package?
  file { '/etc/sysctl.d/k8s.conf':
    ensure  => file,
    content => $iptables_file,
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
  }
  -> exec { 'update kernel parameters for iptables':
    command => 'sysctl --system',
  }

  # Create manifests directory required by kubelet
  -> file { '/etc/kubernetes/manifests':
    ensure => directory,
    owner  => 'root',
    group  => 'root',
    mode   => '0700',
  }

  # Start kubelet.
  -> service { 'kubelet':
    enable => true,
  }
  # A separate enable is required since we have modified the service
  # resource to never enable services.
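  # (A sketch of what this performs: 'systemctl enable kubelet.service'
  # creates the [Install]-section symlink so kubelet starts on boot, the
  # step the modified service provider above deliberately skips.)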
  -> exec { 'enable-kubelet':
    command => '/usr/bin/systemctl enable kubelet.service',
  }
}

class platform::kubernetes::master::init
  inherits ::platform::kubernetes::params {

  include ::platform::params
  include ::platform::docker::params
  include ::platform::dockerdistribution::params

  if str2bool($::is_initial_k8s_config) {
    # This allows subsequent node installs.
    # Notes regarding the ::is_initial_k8s_config check:
    # - Ensures this block is only run for new node installs
    #   (e.g. controller-1) or reinstalls. This part is needed only once;
    # - Ansible configuration independently configures Kubernetes; a retry
    #   of the configuration by puppet leads to a failed manifest
    #   application. This flag is created by Ansible on controller-0;
    # - Ansible replay is not impacted by flag creation.

    $local_registry_auth = "${::platform::dockerdistribution::params::registry_username}:${::platform::dockerdistribution::params::registry_password}" # lint:ignore:140chars
    exec { 'pre pull k8s images':
      command   => "kubeadm config images list --kubernetes-version ${version} --image-repository registry.local:9001/k8s.gcr.io | xargs -i crictl pull --creds ${local_registry_auth} {}", # lint:ignore:140chars
      logoutput => true,
    }
    -> exec { 'configure master node':
      command   => $join_cmd,
      logoutput => true,
    }
    -> exec { 'create kubeadm.yaml':
      command => 'kubeadm config view > /etc/kubernetes/kubeadm.yaml',
      creates => '/etc/kubernetes/kubeadm.yaml',
    }
    -> file { '/etc/kubernetes/kubeadm.yaml':
      ensure => file,
      owner  => 'root',
      group  => 'root',
      mode   => '0644',
    }
    # Update ownership/permissions for file created by "kubeadm init".
    # We want it readable by sysinv and sysadmin.
    -> file { '/etc/kubernetes/admin.conf':
      ensure => file,
      owner  => 'root',
      group  => $::platform::params::protected_group_name,
      mode   => '0640',
    }
    # Add a bash profile script to set a k8s env variable
    -> file { 'bash_profile_k8s':
      ensure => present,
      path   => '/etc/profile.d/kubeconfig.sh',
      mode   => '0644',
      source => "puppet:///modules/${module_name}/kubeconfig.sh",
    }
    # Remove the taint from the master node
    -> exec { 'remove taint from master node':
      command   => "kubectl --kubeconfig=/etc/kubernetes/admin.conf taint node ${::platform::params::hostname} node-role.kubernetes.io/master- || true", # lint:ignore:140chars
      logoutput => true,
    }
    # Add kubelet service override
    -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
      ensure  => file,
      content => template('platform/kube-stx-override.conf.erb'),
      owner   => 'root',
      group   => 'root',
      mode    => '0644',
    }
    # set kubelet monitored by pmond
    -> file { '/etc/pmon.d/kubelet.conf':
      ensure  => file,
      content => template('platform/kubelet-pmond-conf.erb'),
      owner   => 'root',
      group   => 'root',
      mode    => '0644',
    }
    # Reload systemd
    -> exec { 'perform systemctl daemon reload for kubelet override':
      command   => 'systemctl daemon-reload',
      logoutput => true,
    }
    # Initial kubernetes config done on node
    -> file { '/etc/platform/.initial_k8s_config_complete':
      ensure => present,
    }
  }

  # Run kube-cert-rotation daily
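  # Note: '*/24' in the hour field only ever matches hour 0, so the
  # resulting crontab entry ("10 */24 * * *") runs once a day at 00:10.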
  cron { 'kube-cert-rotation':
    ensure      => 'present',
    command     => '/usr/bin/kube-cert-rotation.sh',
    environment => 'PATH=/bin:/usr/bin:/usr/sbin',
    minute      => '10',
    hour        => '*/24',
    user        => 'root',
  }
}

class platform::kubernetes::master
  inherits ::platform::kubernetes::params {

  contain ::platform::kubernetes::kubeadm
  contain ::platform::kubernetes::cgroup
  contain ::platform::kubernetes::master::init
  contain ::platform::kubernetes::coredns
  contain ::platform::kubernetes::firewall

  Class['::platform::sysctl::controller::reserve_ports'] -> Class[$name]
  Class['::platform::etcd'] -> Class[$name]
  Class['::platform::docker::config'] -> Class[$name]
  Class['::platform::containerd::config'] -> Class[$name]
  # Ensure DNS is configured, as name resolution is required when
  # kubeadm init is run.
  Class['::platform::dns'] -> Class[$name]

  Class['::platform::kubernetes::kubeadm']
  -> Class['::platform::kubernetes::cgroup']
  -> Class['::platform::kubernetes::master::init']
  -> Class['::platform::kubernetes::coredns']
  -> Class['::platform::kubernetes::firewall']
}

class platform::kubernetes::worker::init
  inherits ::platform::kubernetes::params {

  Class['::platform::docker::config'] -> Class[$name]
  Class['::platform::containerd::config'] -> Class[$name]
  Class['::platform::filesystem::kubelet'] -> Class[$name]

  if str2bool($::is_initial_config) {
    include ::platform::dockerdistribution::params

    # Get the pause image tag from the kubeadm required-images list and
    # point it at the local registry.
    $get_k8s_pause_img = "kubeadm config images list 2>/dev/null |\
      awk '/^k8s.gcr.io\\/pause:/{print \$1}' | sed 's#k8s.gcr.io#registry.local:9001\\/k8s.gcr.io#'"
    $k8s_pause_img = generate('/bin/sh', '-c', $get_k8s_pause_img)

    if $k8s_pause_img {
      exec { 'load k8s pause image by containerd':
        command   => "crictl pull --creds ${::platform::dockerdistribution::params::registry_username}:${::platform::dockerdistribution::params::registry_password} ${k8s_pause_img}", # lint:ignore:140chars
        logoutput => true,
        before    => Exec['configure worker node'],
      }
    }
  }

  # Configure the worker node. Only do this once, so check whether the
  # kubelet.conf file has already been created (by the join).
  exec { 'configure worker node':
    command   => $join_cmd,
    logoutput => true,
    unless    => 'test -f /etc/kubernetes/kubelet.conf',
  }
  # Add kubelet service override
  -> file { '/etc/systemd/system/kubelet.service.d/kube-stx-override.conf':
    ensure  => file,
    content => template('platform/kube-stx-override.conf.erb'),
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
  }
  # set kubelet monitored by pmond
  -> file { '/etc/pmon.d/kubelet.conf':
    ensure  => file,
    content => template('platform/kubelet-pmond-conf.erb'),
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
  }
  # Reload systemd
  -> exec { 'perform systemctl daemon reload for kubelet override':
    command   => 'systemctl daemon-reload',
    logoutput => true,
  }
}

class platform::kubernetes::worker::pci (
  $pcidp_network_resources = undef,
) {
  include ::platform::kubernetes::params

  file { '/etc/pcidp':
    ensure => 'directory',
    owner  => 'root',
    group  => 'root',
    mode   => '0700',
  }
  -> file { '/etc/pcidp/config.json':
    ensure  => present,
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
    content => template('platform/pcidp.conf.erb'),
  }
}

class platform::kubernetes::worker
  inherits ::platform::kubernetes::params {

  # Worker configuration is not required on AIO hosts, since the master
  # will already be configured and includes support for running pods.
  if $::personality != 'controller' {
    contain ::platform::kubernetes::kubeadm
    contain ::platform::kubernetes::cgroup
    contain ::platform::kubernetes::worker::init

    Class['::platform::kubernetes::kubeadm']
    -> Class['::platform::kubernetes::cgroup']
    -> Class['::platform::kubernetes::worker::init']
  } else {
    # Reconfigure cgroups cpusets on AIO
    contain ::platform::kubernetes::cgroup

    # Add refresh dependency for kubelet for hugepage allocation
    Class['::platform::compute::allocate']
    ~> service { 'kubelet': }
  }

  # TODO: The following exec is a workaround. Once kubernetes becomes the
  # default installation, /etc/pmon.d/libvirtd.conf needs to be removed
  # from the load.
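  # Presumably flipping the pmon mode from 'passive' to 'ignore' leaves
  # libvirtd running but stops pmond from monitoring and restarting it.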
  exec { 'Update PMON libvirtd.conf':
    command => "/bin/sed -i 's#mode = passive#mode = ignore #' /etc/pmon.d/libvirtd.conf",
    onlyif  => '/usr/bin/test -e /etc/pmon.d/libvirtd.conf',
  }

  contain ::platform::kubernetes::worker::pci
}

class platform::kubernetes::coredns {

  include ::platform::params

  if str2bool($::is_initial_k8s_config) {
    if $::platform::params::system_mode != 'simplex' {
      # For duplex and multi-node systems, restrict the dns pods to master nodes
      exec { 'restrict coredns to master nodes':
        command   => 'kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system patch deployment coredns -p \'{"spec":{"template":{"spec":{"nodeSelector":{"node-role.kubernetes.io/master":""}}}}}\'', # lint:ignore:140chars
        logoutput => true,
      }
      -> exec { 'Use anti-affinity for coredns pods':
        command   => 'kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system patch deployment coredns -p \'{"spec":{"template":{"spec":{"affinity":{"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"labelSelector":{"matchExpressions":[{"key":"k8s-app","operator":"In","values":["kube-dns"]}]},"topologyKey":"kubernetes.io/hostname"}]}}}}}}\'', # lint:ignore:140chars
        logoutput => true,
      }
    } else {
      # For a simplex system, one coredns replica is enough
      exec { '1 coredns for simplex mode':
        command   => 'kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system scale --replicas=1 deployment coredns', # lint:ignore:140chars
        logoutput => true,
      }
    }
  }
}

# TODO: remove port 9001 once we have a public docker image registry using standard ports.
# add 5000 as the default port for private registry
class platform::kubernetes::firewall::params (
  $transport = 'tcp',
  $table = 'nat',
  $dports = [80, 443, 9001, 5000],
  $chain = 'POSTROUTING',
  $jump = 'SNAT',
) {}

class platform::kubernetes::firewall
  inherits ::platform::kubernetes::firewall::params {

  include ::platform::params
  include ::platform::network::oam::params
  include ::platform::network::mgmt::params
  include ::platform::docker::params

  # Add the http_proxy and https_proxy ports to the k8s firewall so that
  # worker nodes can reach public networks via the proxy.
  if $::platform::docker::params::http_proxy {
    $http_proxy_str_array = split($::platform::docker::params::http_proxy, ':')
    $http_proxy_port = $http_proxy_str_array[length($http_proxy_str_array) - 1]
    if $http_proxy_port =~ /^\d+$/ {
      $http_proxy_port_val = $http_proxy_port
    }
  }

  if $::platform::docker::params::https_proxy {
    $https_proxy_str_array = split($::platform::docker::params::https_proxy, ':')
    $https_proxy_port = $https_proxy_str_array[length($https_proxy_str_array) - 1]
    if $https_proxy_port =~ /^\d+$/ {
      $https_proxy_port_val = $https_proxy_port
    }
  }

  if defined('$http_proxy_port_val') {
    if defined('$https_proxy_port_val') and ($http_proxy_port_val != $https_proxy_port_val) {
      $dports = $dports << $http_proxy_port_val << $https_proxy_port_val
    } else {
      $dports = $dports << $http_proxy_port_val
    }
  } elsif defined('$https_proxy_port_val') {
    $dports = $dports << $https_proxy_port_val
  }

  $system_mode = $::platform::params::system_mode
  $oam_float_ip = $::platform::network::oam::params::controller_address
  $oam_interface = $::platform::network::oam::params::interface_name
  $mgmt_subnet = $::platform::network::mgmt::params::subnet_network
  $mgmt_prefixlen = $::platform::network::mgmt::params::subnet_prefixlen
  $s_mgmt_subnet = "${mgmt_subnet}/${mgmt_prefixlen}"
  $d_mgmt_subnet = "! ${s_mgmt_subnet}"
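  # i.e. SNAT traffic that is sourced from the management subnet but NOT
  # destined to it (the '!' negation above), sending it out the OAM
  # interface with the OAM floating IP as the source address.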
${s_mgmt_subnet}" if $system_mode != 'simplex' { platform::firewall::rule { 'kubernetes-nat': service_name => 'kubernetes', table => $table, chain => $chain, proto => $transport, jump => $jump, ports => $dports, host => $s_mgmt_subnet, destination => $d_mgmt_subnet, outiface => $oam_interface, tosource => $oam_float_ip, } } } class platform::kubernetes::pre_pull_control_plane_images inherits ::platform::kubernetes::params { include ::platform::dockerdistribution::params $local_registry_auth = "${::platform::dockerdistribution::params::registry_username}:${::platform::dockerdistribution::params::registry_password}" # lint:ignore:140chars exec { 'pre pull images': command => "kubeadm config images list --kubernetes-version ${upgrade_to_version} --image-repository=registry.local:9001/k8s.gcr.io | xargs -i crictl pull --creds ${local_registry_auth} {}", # lint:ignore:140chars logoutput => true, } } class platform::kubernetes::upgrade_first_control_plane inherits ::platform::kubernetes::params { include ::platform::params exec { 'upgrade first control plane': command => "kubeadm upgrade apply ${version} -y", logoutput => true, } if $::platform::params::system_mode != 'simplex' { # For duplex and multi-node system, restrict the coredns pod to master nodes exec { 'restrict coredns to master nodes': command => 'kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system patch deployment coredns -p \'{"spec":{"template":{"spec":{"nodeSelector":{"node-role.kubernetes.io/master":""}}}}}\'', # lint:ignore:140chars logoutput => true, require => Exec['upgrade first control plane'] } -> exec { 'Use anti-affinity for coredns pods': command => 'kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system patch deployment coredns -p \'{"spec":{"template":{"spec":{"affinity":{"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"labelSelector":{"matchExpressions":[{"key":"k8s-app","operator":"In","values":["kube-dns"]}]},"topologyKey":"kubernetes.io/hostname"}]}}}}}}\'', # lint:ignore:140chars logoutput => true, } } else { # For simplex system, 1 coredns is enough exec { '1 coredns for simplex mode': command => 'kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system scale --replicas=1 deployment coredns', # lint:ignore:140chars logoutput => true, require => Exec['upgrade first control plane'] } } } class platform::kubernetes::upgrade_control_plane inherits ::platform::kubernetes::params { exec { 'upgrade control plane': command => 'kubeadm upgrade node', logoutput => true, } } class platform::kubernetes::master::upgrade_kubelet inherits ::platform::kubernetes::params { exec { 'restart kubelet': command => '/usr/local/sbin/pmon-restart kubelet' } } class platform::kubernetes::worker::upgrade_kubelet inherits ::platform::kubernetes::params { include ::platform::dockerdistribution::params # Get the pause image tag from kubeadm required images # list and replace with local registry $get_k8s_pause_img = "kubeadm config images list 2>/dev/null |\ awk '/^k8s.gcr.io\\/pause:/{print \$1}' | sed 's#k8s.gcr.io#registry.local:9001\\/k8s.gcr.io#'" $k8s_pause_img = generate('/bin/sh', '-c', $get_k8s_pause_img) if k8s_pause_img { exec { 'load k8s pause image': command => "crictl pull --creds ${::platform::dockerdistribution::params::registry_username}:${::platform::dockerdistribution::params::registry_password} ${k8s_pause_img}", # lint:ignore:140chars logoutput => true, before => Exec['upgrade kubelet'] } } exec { 'upgrade kubelet': command => 'kubeadm upgrade node', logoutput => true, } -> 
  -> exec { 'restart kubelet':
    command => '/usr/local/sbin/pmon-restart kubelet',
  }
}

class platform::kubernetes::master::change_apiserver_parameters
  inherits ::platform::kubernetes::params {

  $configmap_temp_file = '/tmp/cluster_configmap.yaml'
  $configview_temp_file = '/tmp/kubeadm_config_view.yaml'

  file { $configmap_temp_file:
    ensure => present,
    owner  => 'root',
    group  => 'root',
    mode   => '0600',
  }
  -> file { $configview_temp_file:
    ensure => present,
    owner  => 'root',
    group  => 'root',
    mode   => '0600',
  }
  # Kubeadm stores the cluster configuration as a configmap in the cluster.
  # We will change that configmap to include/remove kube-apiserver parameters.
  # In order to restart kube-apiserver, we will use the "kubeadm init phase"
  # command and feed it the output of "kubeadm config view". This keeps the
  # configmap consistent and keeps kube-apiserver managed by kubeadm.
  -> exec { 'read kubeadm config map':
    command => "kubectl --kubeconfig=/etc/kubernetes/admin.conf get configmap kubeadm-config -o yaml -n kube-system > ${configmap_temp_file}" # lint:ignore:140chars
  }
  -> exec { 'update kube-apiserver params':
    command => template('platform/kube-apiserver-change-params.erb'),
  }
  -> exec { 'patch kubeadm config map':
    command => "kubectl --kubeconfig=/etc/kubernetes/admin.conf -n kube-system patch configmap kubeadm-config -p \"$(cat ${configmap_temp_file})\"" # lint:ignore:140chars
  }
  -> exec { 'get patched configmap':
    command => "kubeadm config view > ${configview_temp_file}",
  }
  -> exec { 'update kube-apiserver parameters':
    command => "kubeadm init phase control-plane apiserver --config ${configview_temp_file}",
  }
  -> exec { 'remove temp configmap':
    command => "rm ${configmap_temp_file}",
  }
  -> exec { 'remove temp configview':
    command => "rm ${configview_temp_file}",
  }
}
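# Note: 'kubeadm init phase control-plane apiserver' rewrites the static pod
# manifest under /etc/kubernetes/manifests, and kubelet picks up the change
# and restarts kube-apiserver on its own, so no explicit service restart is
# needed here.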