Integrate host configuration into configuration framework

Integrates the following host configuration into the configuration
framework:
- Host boot parameters
- CPU reservation
- Process affinity
- Memory huge page allocations

Change-Id: I2259e0e93eefd5ce5000271fa32ecaa8d13fa411
Signed-off-by: Matt Peters <matt.peters@windriver.com>
This commit is contained in:
Matt Peters 2018-06-07 15:59:14 -05:00
parent b3d4df1cc5
commit 69365bb834
28 changed files with 846 additions and 1728 deletions

View File

@ -35,7 +35,6 @@ Initial compute node hugepages and reserved cpus configuration
# compute init scripts
install -d -m 755 %{buildroot}%{local_etc_initd}
install -p -D -m 755 affine-platform.sh %{buildroot}%{local_etc_initd}/affine-platform.sh
install -p -D -m 755 compute-huge.sh %{buildroot}%{local_etc_initd}/compute-huge.sh
# utility scripts
install -p -D -m 755 cpumap_functions.sh %{buildroot}%{local_etc_initd}/cpumap_functions.sh
@ -53,7 +52,6 @@ install -p -D -m 755 bin/topology %{buildroot}%{local_bindir}/topology
# compute config data
install -d -m 755 %{buildroot}%{local_etc_nova}
install -p -D -m 755 compute_reserved.conf %{buildroot}%{local_etc_nova}/compute_reserved.conf
install -p -D -m 755 compute_hugepages_total.conf %{buildroot}%{local_etc_nova}/compute_hugepages_total.conf
# goenabled check
install -d -m 755 %{buildroot}%{local_etc_goenabledd}
@ -62,11 +60,9 @@ install -p -D -m 755 compute-huge-goenabled.sh %{buildroot}%{local_etc_goenabled
# systemd services
install -d -m 755 %{buildroot}%{_unitdir}
install -p -D -m 664 affine-platform.sh.service %{buildroot}%{_unitdir}/affine-platform.sh.service
install -p -D -m 664 compute-huge.sh.service %{buildroot}%{_unitdir}/compute-huge.sh.service
%post
/bin/systemctl enable affine-platform.sh.service >/dev/null 2>&1
/bin/systemctl enable compute-huge.sh.service >/dev/null 2>&1
%clean
rm -rf $RPM_BUILD_ROOT
@ -79,7 +75,5 @@ rm -rf $RPM_BUILD_ROOT
%{local_etc_initd}/*
%{local_etc_goenabledd}/*
%config(noreplace) %{local_etc_nova}/compute_reserved.conf
%config(noreplace) %{local_etc_nova}/compute_hugepages_total.conf
%{_unitdir}/compute-huge.sh.service
%{_unitdir}/affine-platform.sh.service

View File

@ -1,7 +1,7 @@
[Unit]
Description=Titanium Cloud Affine Platform
After=syslog.service network.service dbus.service sw-patch.service
Before=compute-huge.sh.service
Before=computeconfig.service
[Service]
Type=oneshot

View File

@ -17,7 +17,7 @@ source "/etc/init.d/log_functions.sh"
source "/usr/bin/tsconfig"
if [ -e ${VOLATILE_COMPUTE_CONFIG_COMPLETE} -a ! -f ${COMPUTE_HUGE_GOENABLED} ]; then
log_error "compute-huge.sh CPU configuration check failed. Failing goenabled check."
log_error "Compute manifest CPU configuration check failed. Failing goenabled check."
exit 1
fi

File diff suppressed because it is too large Load Diff

View File

@ -1,14 +0,0 @@
[Unit]
Description=Titanium Cloud Compute Huge
After=syslog.service network.service affine-platform.sh.service sw-patch.service
Before=sshd.service sw-patch-agent.service sysinv-agent.service
[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/etc/init.d/compute-huge.sh start
ExecStop=/etc/init.d/compute-huge.sh stop
ExecReload=/etc/init.d/compute-huge.sh restart
[Install]
WantedBy=multi-user.target

View File

@ -52,7 +52,23 @@ rm -rf ${PUPPET_TMP}
mkdir -p ${PUPPET_TMP}/hieradata
cp /etc/puppet/hieradata/global.yaml ${PUPPET_TMP}/hieradata/global.yaml
cp /etc/puppet/hieradata/${PERSONALITY}.yaml ${PUPPET_TMP}/hieradata/personality.yaml
cp -f ${HIERADATA}/${HOST}.yaml ${PUPPET_TMP}/hieradata/host.yaml
# When a compute node is first booted and goes online, sysinv-agent reports the
# host CPU inventory, which triggers the first runtime manifest apply (the one
# that updates grub). At that point copying the host hieradata file has been
# seen to fail because of a timing issue that is not yet fully understood;
# subsequent retries worked. Compute nodes therefore get up to 3 attempts with
# a 15 second pause after each failure. Other personalities copy once.
if [ "${PERSONALITY}" = "compute" ]; then
    n=0
    until [ "${n}" -ge 3 ]
    do
        cp -f "${HIERADATA}/${HOST}.yaml" "${PUPPET_TMP}/hieradata/host.yaml" && break
        n=$((n + 1))
        # Report the path that was actually read, not a hard-coded directory.
        logger -t "$0" "Failed to copy ${HIERADATA}/${HOST}.yaml (attempt ${n} of 3)"
        sleep 15
    done
else
    cp -f "${HIERADATA}/${HOST}.yaml" "${PUPPET_TMP}/hieradata/host.yaml"
fi
cp -f ${HIERADATA}/system.yaml \
${HIERADATA}/secure_system.yaml \
${HIERADATA}/static.yaml \

View File

@ -13,6 +13,7 @@ include ::platform::sysctl::compute
include ::platform::dhclient
include ::platform::partitions
include ::platform::lvm::compute
include ::platform::compute
include ::platform::vswitch
include ::platform::network
include ::platform::fstab

View File

@ -0,0 +1,5 @@
# Exposes the boot parameters of the running kernel (the contents of
# /proc/cmdline) as the get_cmdline fact. Errors from cat are discarded.
Facter.add(:get_cmdline) do
  boot_args_cmd = "cat /proc/cmdline 2>/dev/null"
  setcode boot_args_cmd
end

View File

@ -0,0 +1,8 @@
# True when /proc/cpuinfo reports CPU model 79, which this fact treats as a
# Broadwell processor.
Facter.add(:is_broadwell_processor) do
  setcode do
    model_probe = 'grep -q -E "^model\s+:\s+79$" /proc/cpuinfo'
    Facter::Core::Execution.exec(model_probe)
    # grep -q prints nothing; the answer is carried by its exit status.
    $?.exitstatus == 0
  end
end

View File

@ -0,0 +1,7 @@
# True when the CPU advertises the pdpe1gb flag in /proc/cpuinfo, i.e. one
# gigabyte pages are supported.
Facter.add(:is_gb_page_supported) do
  setcode do
    flag_probe = 'grep -q pdpe1gb /proc/cpuinfo'
    Facter::Core::Execution.exec(flag_probe)
    # grep -q prints nothing; the answer is carried by its exit status.
    $?.exitstatus == 0
  end
end

View File

@ -0,0 +1,7 @@
# True when hugetlbfs is listed in /proc/filesystems, i.e. the kernel has
# hugetlbfs enabled.
Facter.add(:is_hugetlbfs_enabled) do
  setcode do
    fs_probe = 'grep -q hugetlbfs /proc/filesystems'
    Facter::Core::Execution.exec(fs_probe)
    # grep -q prints nothing; the answer is carried by its exit status.
    $?.exitstatus == 0
  end
end

View File

@ -0,0 +1,6 @@
# True when the per-NUMA-node sysfs tree exists on this host, detected by the
# presence of the /sys/devices/system/node/node0 directory.
Facter.add(:is_per_numa_supported) do
  setcode { File.directory?('/sys/devices/system/node/node0') }
end

View File

@ -0,0 +1,6 @@
# True when Resource Control is supported on this node, detected by the
# presence of the /sys/fs/resctrl directory.
Facter.add(:is_resctrl_supported) do
  setcode { File.directory?('/sys/fs/resctrl') }
end

View File

@ -0,0 +1,4 @@
# Number of logical CPUs: counts the "processor" entries in /proc/cpuinfo and
# reports 1 when none are found.
Facter.add(:number_of_logical_cpus) do
  count_cmd = "cat /proc/cpuinfo 2>/dev/null | awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}'"
  setcode count_cmd
end

View File

@ -0,0 +1,4 @@
# Number of NUMA nodes: counts the nodeN entries under
# /sys/devices/system/node.
Facter.add(:number_of_numa_nodes) do
  count_cmd = "ls -d /sys/devices/system/node/node* 2>/dev/null | wc -l"
  setcode count_cmd
end

View File

@ -0,0 +1,34 @@
module Puppet::Parser::Functions
  newfunction(:check_grub_config,
              :type => :rvalue,
              :doc => <<-EOD
    This internal function checks if a list of arguments are configured
    in the current boot args based on the input parameters

    EOD
  ) do |args|

    func_name = "check_grub_config()"

    # Exactly one argument: a whitespace-separated string of expected boot args.
    if args.size != 1
      raise(Puppet::ParseError,
            "#{func_name}: Requires 1 argument, #{args.size} given")
    end

    expected = args[0]
    unless expected.instance_of? String
      raise(Puppet::ParseError, "#{func_name}: first argument must be a string")
    end

    # Get the current boot args. The fact has no value when /proc/cmdline
    # could not be read; fail with a clear error instead of a NoMethodError.
    cmd = Facter.value(:get_cmdline)
    if cmd.nil?
      raise(Puppet::ParseError,
            "#{func_name}: the get_cmdline fact is not available")
    end
    cmd_array = cmd.split

    # Every expected token must appear verbatim on the kernel command line;
    # the first one that does not is reported and fails the check.
    missing = expected.split.find { |element| !cmd_array.include?(element) }
    if missing
      Puppet.debug("#{missing} is not presented in #{cmd}")
      false
    else
      true
    end
  end
end

View File

@ -0,0 +1,246 @@
# Builds the kernel boot arguments that a compute host is expected to run with.
#
# Parameters:
#   n_cpus       - expected number of logical CPUs (compared by grub::audit)
#   cpu_options  - CPU related boot arguments appended to the generated list
#   m_hugepages  - 2M huge page boot arguments
#   default_pgsz - default huge page size boot argument
#   keys         - boot argument names removed by grub::update before the
#                  new values are added
#
# Result: $grub_updates, the space-separated argument string.
class platform::compute::grub::params (
  $n_cpus = '',
  $cpu_options = '',
  $m_hugepages = 'hugepagesz=2M hugepages=0',
  $default_pgsz = 'default_hugepagesz=2M',
  $keys = ['kvm-intel.eptad', 'default_hugepagesz', 'hugepagesz', 'hugepages', 'isolcpus', 'nohz_full', 'rcu_nocbs', 'kthread_cpus', 'irqaffinity'],
) {

  # Facts go through str2bool, as in the other classes of this file, so that a
  # stringified "false" fact is not treated as true.
  if str2bool($::is_broadwell_processor) {
    $eptad = 'kvm-intel.eptad=0'
  } else {
    $eptad = ''
  }

  # One 1G page per NUMA node when the CPU supports 1G pages.
  if str2bool($::is_gb_page_supported) {
    $gb_hugepages = "hugepagesz=1G hugepages=${::number_of_numa_nodes}"
  } else {
    $gb_hugepages = ''
  }

  $grub_updates = strip("${eptad} ${gb_hugepages} ${m_hugepages} ${default_pgsz} ${cpu_options}")
}
# Rewrites the kernel arguments of every installed kernel with grubby: first
# strips all arguments named in $keys, then adds the $grub_updates string
# computed by the inherited params class.
class platform::compute::grub::update
  inherits ::platform::compute::grub::params {

  notice('Updating grub configuration')

  $to_be_removed = join($keys, ' ')

  exec { 'Remove the cpu arguments':
    command => "grubby --update-kernel=ALL --remove-args='${to_be_removed}'",
  }

  # The additions must run after the removals, otherwise they would be stripped.
  exec { 'Add the cpu arguments':
    command => "grubby --update-kernel=ALL --args='${grub_updates}'",
    require => Exec['Remove the cpu arguments'],
  }
}
# Recovery path: applies the grub update and then reboots the host.
class platform::compute::grub::recovery {

  notice('Update Grub and Reboot')

  class { 'platform::compute::grub::update': }

  # Reboot only once the grub update class has been applied.
  exec { 'reboot-recovery':
    command => 'reboot',
    require => Class['platform::compute::grub::update'],
  }
}
# Audits the running system against the expected configuration, except when
# the is_initial_config_primary fact is true:
#   - the logical CPU count must equal $n_cpus
#   - every expected boot argument must be on the kernel command line
# The /var/run/compute_huge_goenabled flag file is present only when both
# checks pass. A boot argument mismatch also triggers grub::recovery.
class platform::compute::grub::audit
  inherits ::platform::compute::grub::params {

  unless str2bool($::is_initial_config_primary) {

    notice('Audit CPU and Grub Configuration')

    $expected_n_cpus = $::number_of_logical_cpus
    $n_cpus_ok = ("${n_cpus}" == "${expected_n_cpus}")
    $cmd_ok = check_grub_config($grub_updates)

    if $cmd_ok and $n_cpus_ok {
      $ensure = present
      notice('CPU and Boot Argument audit passed.')
    } else {
      $ensure = absent
    }

    # Only a boot argument mismatch triggers recovery; a CPU count mismatch
    # just withholds the flag file.
    unless $cmd_ok {
      notice('Kernel Boot Argument Mismatch')
      include ::platform::compute::grub::recovery
    }

    file { '/var/run/compute_huge_goenabled':
      ensure => $ensure,
      owner  => 'root',
      group  => 'root',
      mode   => '0644',
    }
  }
}
# Thin wrapper that includes platform::compute::grub::update and nothing else.
# NOTE(review): going by its name this is presumably the class targeted by
# runtime manifest applies - confirm against the caller.
class platform::compute::grub::runtime {
include ::platform::compute::grub::update
}
# Mounts a hugetlbfs filesystem for every huge page size listed under
# /sys/kernel/mm/hugepages. Each size gets its own mount point,
# /mnt/huge-<size>, mounted with the matching pagesize option.
class platform::compute::hugetlbf {

  if str2bool($::is_hugetlbfs_enabled) {

    # One "hugepages-<size>" directory per supported page size.
    $fs_list = generate('/bin/bash', '-c', 'ls -1d /sys/kernel/mm/hugepages/hugepages-*')
    $array = split($fs_list, '\n')

    $array.each | String $val | {
      $page_name = generate('/bin/bash', '-c', "basename ${val}")
      $page_size = strip(regsubst($page_name, 'hugepages-', ''))
      $hugemnt = "/mnt/huge-${page_size}"
      $options = "pagesize=${page_size}"

      notice("Mounting hugetlbfs at: ${hugemnt}")

      exec { "create ${hugemnt}":
        command => "mkdir -p ${hugemnt}",
        onlyif  => "test ! -d ${hugemnt}",
      }

      # The mount point has to exist before it can be mounted.
      mount { $hugemnt:
        ensure   => 'mounted',
        name     => $hugemnt,
        device   => 'none',
        fstype   => 'hugetlbfs',
        options  => $options,
        atboot   => 'yes',
        remounts => true,
        require  => Exec["create ${hugemnt}"],
      }
    }
  }
}
# Parameter holder for huge page configuration. It declares no resources;
# platform::compute::allocate and platform::compute::extend inherit from it.
#
# nr_hugepages_2M / nr_hugepages_1G are parsed by platform::compute::allocate:
# parentheses and double quotes are stripped, the rest is split on spaces, and
# each entry is expected to be a colon-separated node:page_size:count triple.
#
# The vswitch_* and vm_* values are written to /etc/nova/compute_extend.conf
# by the compute_extend.conf.erb template.
class platform::compute::hugepage::params (
$nr_hugepages_2M = undef,
$nr_hugepages_1G = undef,
$vswitch_2M_pages = '',
$vswitch_1G_pages = '',
$vm_4K_pages = '',
$vm_2M_pages = '',
$vm_1G_pages = '',
) {}
# Writes a huge page count into an nr_hugepages control file.
#
# Parameters:
#   path       - control file to write; nothing is done when it does not exist
#   page_count - value written to the file
define allocate_pages (
  $path,
  $page_count,
) {
  exec { "Allocate ${page_count} ${path}":
    command => "echo ${page_count} > ${path}",
    onlyif  => "test -f ${path}",
  }
}
# Allocates HugeTLB memory according to the nr_hugepages_2M and
# nr_hugepages_1G attributes. After parentheses and double quotes are
# stripped, each attribute is a space-separated list of
# node:page_size:count entries; entries without exactly three fields are
# ignored.
class platform::compute::allocate
  inherits ::platform::compute::hugepage::params {

  # determine the node file system
  $nodefs = str2bool($::is_per_numa_supported) ? {
    true    => '/sys/devices/system/node',
    default => '/sys/kernel/mm',
  }

  # Both attributes share one format, so they are handled by the same loop:
  # the 2M specification first, then the 1G one.
  [$nr_hugepages_2M, $nr_hugepages_1G].each |$spec| {
    if $spec != undef {
      $entries = regsubst($spec, '[\(\)\"]', '', 'G').split(' ')
      $entries.each | String $val | {
        $fields = $val.split(':')
        if size($fields) == 3 {
          $node = $fields[0]
          $page_size = $fields[1]
          allocate_pages { "Start ${node} ${page_size}":
            path       => "${nodefs}/${node}/hugepages/hugepages-${page_size}/nr_hugepages",
            page_count => $fields[2],
          }
        }
      }
    }
  }
}
# Generates /etc/nova/compute_extend.conf from the inherited huge page
# parameters. nova-compute reads the file on init; the extended options are
# used with nova accounting.
class platform::compute::extend
  inherits ::platform::compute::hugepage::params {

  $extend_conf = '/etc/nova/compute_extend.conf'

  file { $extend_conf:
    ensure  => 'present',
    replace => true,
    content => template('platform/compute_extend.conf.erb'),
  }
}
# Mounts the resctrl filesystem, when the host supports it, to allow Cache
# Allocation Technology per VM.
class platform::compute::resctrl {

  if str2bool($::is_resctrl_supported) {

    $resctrl_mnt = '/sys/fs/resctrl'

    mount { $resctrl_mnt:
      ensure   => 'mounted',
      name     => $resctrl_mnt,
      device   => 'resctrl',
      fstype   => 'resctrl',
      atboot   => 'yes',
      remounts => true,
    }
  }
}
# Set Power Management QoS resume latency constraints for CPUs.
# The PM QoS resume latency limit is set to shallow C-state for vswitch CPUs.
# All other CPUs are allowed to go to the deepest C-state available.
#
# Parameters:
#   low_wakeup_cpus   - CPUs passed to the script for the "low" setting
#   hight_wakeup_cpus - CPUs passed to the script for the "high" setting
#                       (the spelling is part of the public parameter name)
class platform::compute::pmqos (
  $low_wakeup_cpus = '',
  $hight_wakeup_cpus = '',
) {

  if str2bool($::is_compute_subfunction) and str2bool($::is_lowlatency_subfunction) {

    $script = '/usr/bin/set-cpu-wakeup-latency.sh'

    exec {
      # Shared by both invocations: run only when the script is installed.
      default:
        onlyif    => "test -f ${script}",
        logoutput => true,
      ;
      # Set low wakeup latency (shallow C-state) for vswitch CPUs using PM QoS interface
      'low-wakeup-latency':
        command => "${script} low ${low_wakeup_cpus}",
      ;
      # Set high wakeup latency (deep C-state) for non-vswitch CPUs using PM QoS interface
      'high-wakeup-latency':
        command => "${script} high ${hight_wakeup_cpus}",
      ;
    }
  }
}
# Top-level compute host configuration: pulls in the grub audit, hugetlbfs
# mounts, huge page allocation, PM QoS, resctrl and nova extended options,
# and orders this class before both the vswitch and nova compute classes.
class platform::compute {

  Class[$name] -> Class['::platform::vswitch', '::nova::compute']

  require ::platform::compute::grub::audit
  require ::platform::compute::hugetlbf
  require ::platform::compute::allocate
  require ::platform::compute::pmqos
  require ::platform::compute::resctrl
  require ::platform::compute::extend
}

View File

@ -0,0 +1,12 @@
<%# Surrounding double quotes are stripped with gsub, not gsub!: gsub! returns nil when nothing is replaced (rendering an empty setting) and mutates the variable in place. -%>
###########################################################################
#
# compute_extend.conf contains compute extended nova options
#
# - This file is managed by Puppet. DO NOT EDIT.
#
###########################################################################
compute_vswitch_2M_pages=<%= @vswitch_2M_pages.to_s.gsub(/\A"|"\Z/, '') %>
compute_vswitch_1G_pages=<%= @vswitch_1G_pages.to_s.gsub(/\A"|"\Z/, '') %>
compute_vm_4K_pages=<%= @vm_4K_pages.to_s.gsub(/\A"|"\Z/, '') %>
compute_vm_2M_pages=<%= @vm_2M_pages.to_s.gsub(/\A"|"\Z/, '') %>
compute_vm_1G_pages=<%= @vm_1G_pages.to_s.gsub(/\A"|"\Z/, '') %>

View File

@ -151,6 +151,7 @@ class AgentManager(service.PeriodicService):
self._notify_subfunctions_alarm_raise = False
self._tpmconfig_rpc_failure = False
self._tpmconfig_host_first_apply = False
self._first_grub_update = False
def start(self):
super(AgentManager, self).start()
@ -316,6 +317,16 @@ class AgentManager(service.PeriodicService):
except subprocess.CalledProcessError as e:
LOG.error("subprocess error: (%d)", e.returncode)
def _force_grub_update(self):
    """Report, exactly once, that a grub update should be forced.

    Intended for the first AIO controller after the initial config is
    completed. Returns True on the first call made while the initial
    config complete flag file exists, and latches so that every later
    call returns False. Returns False while the flag file is missing.
    """
    if self._first_grub_update:
        # Already reported once.
        return False
    if not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG):
        return False
    self._first_grub_update = True
    return True
def periodic_tasks(self, context, raise_on_error=False):
""" Periodic tasks are run at pre-specified intervals. """
@ -712,11 +723,13 @@ class AgentManager(service.PeriodicService):
LOG.exception("Sysinv Agent uncaught exception updating inuma.")
pass
force_grub_update = self._force_grub_update()
try:
# may get duplicate key if already sent on earlier init
rpcapi.icpus_update_by_ihost(icontext,
ihost['uuid'],
icpus)
icpus,
force_grub_update)
except RemoteError as e:
LOG.error("icpus_update_by_ihost RemoteError exc_type=%s" %
e.exc_type)
@ -731,19 +744,21 @@ class AgentManager(service.PeriodicService):
pass
imemory = self._inode_operator.inodes_get_imemory()
try:
# may get duplicate key if already sent on earlier init
rpcapi.imemory_update_by_ihost(icontext,
ihost['uuid'],
imemory)
except RemoteError as e:
LOG.error("imemory_update_by_ihost RemoteError exc_type=%s" %
e.exc_type)
# Allow the audit to update
pass
except:
LOG.exception("Sysinv Agent exception updating imemory conductor.")
pass
if imemory:
try:
# may get duplicate key if already sent on earlier init
rpcapi.imemory_update_by_ihost(icontext,
ihost['uuid'],
imemory)
except RemoteError as e:
LOG.error("imemory_update_by_ihost RemoteError exc_type=%s" %
e.exc_type)
# Allow the audit to update
pass
except:
LOG.exception("Sysinv Agent exception updating imemory "
"conductor.")
pass
idisk = self._idisk_operator.idisk_get()
try:
@ -1283,7 +1298,9 @@ class AgentManager(service.PeriodicService):
try:
# runtime manifests can not be applied without the initial
# configuration applied
if not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG):
force = config_dict.get('force', False)
if (not force and
not os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG)):
return
personalities = config_dict.get('personalities')

View File

@ -19,18 +19,13 @@ from os import listdir
from os.path import isfile, join
import random
import re
import shlex
import shutil
import signal
import six
import socket
import subprocess
import tempfile
from sysinv.common import exception
from sysinv.common import utils
from sysinv.openstack.common import log as logging
import tsconfig.tsconfig as tsc
LOG = logging.getLogger(__name__)
@ -97,6 +92,30 @@ class NodeOperator(object):
# self._get_free_memory_MiB()
# self._get_free_memory_nodes_MiB()
def _is_strict(self):
    """Return True when /proc/sys/vm/overcommit_memory holds the value 2.

    The file is read directly instead of forking ``cat``. A missing or
    unreadable file, or content that is not an integer, is logged and
    reported as False rather than raised.
    """
    try:
        with open("/proc/sys/vm/overcommit_memory", "r") as infile:
            return int(infile.read()) == 2
    except (IOError, ValueError) as e:
        LOG.info("Failed to check for overcommit, error (%s)", e)
    return False
def _is_hugepages_allocated(self):
    """Return True when /proc/sys/vm/nr_hugepages holds a value above zero.

    The file is read directly instead of forking ``cat``. A missing or
    unreadable file, or content that is not an integer, is logged and
    reported as False rather than raised.
    """
    try:
        with open("/proc/sys/vm/nr_hugepages", "r") as infile:
            return int(infile.read()) > 0
    except (IOError, ValueError) as e:
        LOG.info("Failed to check hugepages, error (%s)", e)
    return False
def convert_range_string_to_list(self, s):
olist = []
s = s.strip()
@ -267,7 +286,7 @@ class NodeOperator(object):
return [name for name in listdir(dir)
if os.path.isdir(join(dir, name))]
def _set_default_avs_hugesize(self, attr):
def _set_default_avs_hugesize(self):
'''
Set the default memory size for avs hugepages when it must fallback to
2MB pages because there are no 1GB pages. In a virtual environment we
@ -281,18 +300,10 @@ class NodeOperator(object):
else:
avs_hugepages_nr = AVS_REAL_MEMORY_MB / hugepage_size
memtotal_mib = attr.get('memtotal_mib', 0)
memavail_mib = attr.get('memavail_mib', 0)
memtotal_mib = memtotal_mib - (hugepage_size * avs_hugepages_nr)
memavail_mib = min(memtotal_mib, memavail_mib)
## Create a new set of dict attributes
hp_attr = {'avs_hugepages_size_mib': hugepage_size,
'avs_hugepages_nr': avs_hugepages_nr,
'avs_hugepages_avail': 0,
'vm_hugepages_use_1G': 'False',
'memtotal_mib': memtotal_mib,
'memavail_mib': memavail_mib}
'avs_hugepages_avail': 0}
return hp_attr
def _inode_get_memory_hugepages(self):
@ -303,17 +314,34 @@ class NodeOperator(object):
'''
imemory = []
num_2M_for_1G = 512
num_4K_for_2M = 512
Ki = 1024
SZ_2M_Ki = 2048
SZ_1G_Ki = 1048576
controller_min_MB = 6000
compute_min_MB = 1600
compute_min_non0_MB = 500
re_node_MemFreeInit = re.compile(r'^Node\s+\d+\s+\MemFreeInit:\s+(\d+)')
initial_compute_config_completed = \
os.path.exists(tsc.INITIAL_COMPUTE_CONFIG_COMPLETE)
# check if it is initial report before the huge pages are allocated
initial_report = not initial_compute_config_completed
# do not send report if the initial compute config is completed and
# the huge pages have not been allocated, i.e.during subsequent
# reboot before the manifest allocates the huge pages
if (initial_compute_config_completed and
not self._is_hugepages_allocated()):
return imemory
for node in range(self.num_nodes):
attr = {}
Total_MiB = 0
Free_MiB = 0
Total_HP_MiB = 0 # Total memory (MiB) currently configured in HPs
Free_HP_MiB = 0
# Check AVS and Libvirt memory
# Loop through configured hugepage sizes of this node and record
# total number and number free
hugepages = "/sys/devices/system/node/node%d/hugepages" % node
try:
@ -325,15 +353,14 @@ class NodeOperator(object):
# role via size; also from /etc/nova/compute_reserved.conf
if sizesplit[1].startswith("1048576kB"):
hugepages_role = "avs"
size = int(1048576 / 1024)
size = int(SZ_1G_Ki / Ki)
else:
hugepages_role = "vm"
size = int(2048 / 1024)
size = int(SZ_2M_Ki / Ki)
nr_hugepages = 0
free_hugepages = 0
# files = os.walk(subdir).next()[2]
mydir = hugepages + '/' + subdir
files = [f for f in listdir(mydir) if isfile(join(mydir, f))]
@ -345,11 +372,11 @@ class NodeOperator(object):
if file.startswith("free_hugepages"):
free_hugepages = int(f.readline())
Total_HP_MiB = Total_HP_MiB + int(nr_hugepages * size)
Free_HP_MiB = Free_HP_MiB + int(free_hugepages * size)
# Libvirt hugepages can now be 1G and 2M, can't only look
# at 2M pages
Total_MiB = Total_MiB + int(nr_hugepages * size)
Free_MiB = Free_MiB + int(free_hugepages * size)
if hugepages_role == "avs":
avs_hugepages_nr = AVS_REAL_MEMORY_MB / size
hp_attr = {
@ -359,18 +386,19 @@ class NodeOperator(object):
'vm_hugepages_nr_1G':
(nr_hugepages - avs_hugepages_nr),
'vm_hugepages_avail_1G': free_hugepages,
'vm_hugepages_use_1G': 'True'
}
else:
if len(subdirs) == 1:
hp_attr = {
'vm_hugepages_nr_2M': (nr_hugepages - 256),
'vm_hugepages_avail_2M': free_hugepages,
}
else:
hp_attr = {
'vm_hugepages_nr_2M': nr_hugepages,
'vm_hugepages_avail_2M': free_hugepages,
}
hp_attr = self._set_default_avs_hugesize()
hp_attr.update({'vm_hugepages_use_1G': 'False'})
avs_hugepages_nr = hp_attr.get('avs_hugepages_nr', 0)
hp_attr.update({
'vm_hugepages_avail_2M': free_hugepages,
'vm_hugepages_nr_2M':
(nr_hugepages - avs_hugepages_nr)
})
attr.update(hp_attr)
@ -378,115 +406,134 @@ class NodeOperator(object):
# silently ignore IO errors (eg. file missing)
pass
# Read the total possible number of libvirt (2M and 1G) hugepages,
# and total available memory determined by compute-huge.
hp_pages_2M = []
hp_pages_1G = []
tot_memory = []
huge_total_attrs = {}
hp_total_info = "/etc/nova/compute_hugepages_total.conf"
try:
with open(hp_total_info, 'r') as infile:
for line in infile:
possible_memorys = line.split("=")
if possible_memorys[0] == 'compute_hp_total_2M':
hp_pages_2M = map(int, possible_memorys[1].split(','))
continue
# Get the free and total memory from meminfo for this node.
# Each pattern matches one "Node <n> <Field>: <value>" line and captures the
# numeric value. Field names are spelled out literally: a backslash before
# the field name is either an invalid escape (\M, \F, \C), a character class
# (\S matches any non-space character) or, for \', a literal quote that
# never occurs in the file.
re_node_MemTotal = re.compile(r'^Node\s+\d+\s+MemTotal:\s+(\d+)')
re_node_MemFree = re.compile(r'^Node\s+\d+\s+MemFree:\s+(\d+)')
re_node_FilePages = \
    re.compile(r'^Node\s+\d+\s+FilePages:\s+(\d+)')
re_node_SReclaim = \
    re.compile(r'^Node\s+\d+\s+SReclaimable:\s+(\d+)')
re_node_CommitLimit = \
    re.compile(r'^Node\s+\d+\s+CommitLimit:\s+(\d+)')
re_node_Committed_AS = \
    re.compile(r'^Node\s+\d+\s+Committed_AS:\s+(\d+)')
elif possible_memorys[0] == 'compute_hp_total_1G':
hp_pages_1G = map(int, possible_memorys[1].split(','))
continue
Free_KiB = 0 # Free Memory (KiB) available
Total_KiB = 0 # Total Memory (KiB)
limit = 0 # only used in strict accounting
committed = 0 # only used in strict accounting
elif possible_memorys[0] == 'compute_total_MiB':
tot_memory = map(int, possible_memorys[1].split(','))
continue
except IOError:
# silently ignore IO errors (eg. file missing)
pass
huge_total_attrs = {
'vm_hugepages_possible_2M': hp_pages_2M[node],
'vm_hugepages_possible_1G': hp_pages_1G[node],
}
# The remaining VM pages are allocated to 4K pages
vm_hugepages_2M = attr.get('vm_hugepages_nr_2M')
vm_hugepages_1G = attr.get('vm_hugepages_nr_1G')
vm_hugepages_4K = (hp_pages_2M[node] - vm_hugepages_2M)
if vm_hugepages_1G:
vm_hugepages_4K -= (vm_hugepages_1G * num_2M_for_1G)
vm_hugepages_4K = vm_hugepages_4K * num_4K_for_2M
# Clip 4K pages, just like compute-huge.
min_4K = 32 * 1024 / 4
if vm_hugepages_4K < min_4K:
vm_hugepages_4K = 0
hp_attrs_4K = {
'vm_hugepages_nr_4K': vm_hugepages_4K,
}
attr.update(huge_total_attrs)
attr.update(hp_attrs_4K)
# Include 4K pages in the displayed VM memtotal.
# Since there is no way to track used VM 4K pages, we treat them
# as available, but that is bogus.
vm_4K_MiB = vm_hugepages_4K * 4 / 1024
Total_MiB += vm_4K_MiB
Free_MiB += vm_4K_MiB
self.total_memory_nodes_MiB.append(Total_MiB)
attroverview = {
'numa_node': node,
'memtotal_mib': Total_MiB,
'memavail_mib': Free_MiB,
'hugepages_configured': 'True',
}
attr.update(attroverview)
new_attrs = {}
if 'avs_hugepages_size_mib' not in attr:
## No 1GB pages were found so borrow from the VM 2MB pool
##
## FIXME:
## It is unfortunate that memory is categorized as VM or
## AVS here on the compute node. It would have been more
## flexible if memory parameters were collected and sent
## up to the controller without making any decisions about
## what the memory was going to be used for. That type of
## decision is better left to the controller (or better
## yet, to the user)
new_attrs = self._set_default_avs_hugesize(attr)
else:
new_attrs = {'vm_hugepages_use_1G': 'True'}
attr.update(new_attrs)
# Get the total memory of the numa node
memTotal_mib = 0
meminfo = "/sys/devices/system/node/node%d/meminfo_extra" % node
meminfo = "/sys/devices/system/node/node%d/meminfo" % node
try:
with open(meminfo, 'r') as infile:
for line in infile:
match = re_node_MemFreeInit.search(line)
match = re_node_MemTotal.search(line)
if match:
memTotal_mib = int(match.group(1))
Total_KiB += int(match.group(1))
continue
match = re_node_MemFree.search(line)
if match:
Free_KiB += int(match.group(1))
continue
match = re_node_FilePages.search(line)
if match:
Free_KiB += int(match.group(1))
continue
match = re_node_SReclaim.search(line)
if match:
Free_KiB += int(match.group(1))
continue
match = re_node_CommitLimit.search(line)
if match:
limit = int(match.group(1))
continue
match = re_node_Committed_AS.search(line)
if match:
committed = int(match.group(1))
continue
if self._is_strict():
Free_KiB = limit - committed
except IOError:
# silently ignore IO errors (eg. file missing)
pass
memTotal_mib /= 1024
if tot_memory[node]:
memTotal_mib = tot_memory[node]
node_attr = {
'node_memtotal_mib': memTotal_mib,
}
attr.update(node_attr)
# Calculate PSS
Pss_MiB = 0
if node == 0:
cmd = 'cat /proc/*/smaps 2>/dev/null | awk \'/^Pss:/ ' \
'{a += $2;} END {printf "%d\\n", a/1024.0;}\''
try:
proc = subprocess.Popen(cmd, stdout=subprocess.PIPE,
shell=True)
result = proc.stdout.read().strip()
Pss_MiB = int(result)
except subprocess.CalledProcessError as e:
LOG.error("Cannot calculate PSS (%s) (%d)", cmd,
e.returncode)
except OSError as e:
LOG.error("Failed to execute (%s) OS error (%d)", cmd,
e.errno)
# need to multiply Total_MiB by 1024 to match compute_huge
node_total_kib = Total_HP_MiB * Ki + Free_KiB + Pss_MiB * Ki
# Read base memory from compute_reserved.conf
base_mem_MiB = 0
with open('/etc/nova/compute_reserved.conf', 'r') as infile:
for line in infile:
if "COMPUTE_BASE_RESERVED" in line:
val = line.split("=")
base_reserves = val[1].strip('\n')[1:-1]
for reserve in base_reserves.split():
reserve = reserve.split(":")
if reserve[0].strip('"') == "node%d" % node:
base_mem_MiB = int(reserve[1].strip('MB'))
# On small systems, clip memory overhead to more reasonable minimal
# settings
if (Total_KiB / Ki - base_mem_MiB) < 1000:
if node == 0:
base_mem_MiB = compute_min_MB
if tsc.nodetype == 'controller':
base_mem_MiB += controller_min_MB
else:
base_mem_MiB = compute_min_non0_MB
Eng_KiB = node_total_kib - base_mem_MiB * Ki
vswitch_mem_kib = (attr.get('avs_hugepages_size_mib', 0) *
attr.get('avs_hugepages_nr', 0) * Ki)
VM_KiB = (Eng_KiB - vswitch_mem_kib)
max_vm_pages_2M = VM_KiB / SZ_2M_Ki
max_vm_pages_1G = VM_KiB / SZ_1G_Ki
attr.update({
'vm_hugepages_possible_2M': max_vm_pages_2M,
'vm_hugepages_possible_1G': max_vm_pages_1G,
})
# calculate 100% 2M pages if it is initial report and the huge
# pages have not been allocated
if initial_report:
Total_HP_MiB += int(max_vm_pages_2M * (SZ_2M_Ki / Ki))
Free_HP_MiB = Total_HP_MiB
attr.update({
'vm_hugepages_nr_2M': max_vm_pages_2M,
'vm_hugepages_avail_2M': max_vm_pages_2M,
'vm_hugepages_nr_1G': 0
})
attr.update({
'numa_node': node,
'memtotal_mib': Total_HP_MiB,
'memavail_mib': Free_HP_MiB,
'hugepages_configured': 'True',
'node_memtotal_mib': node_total_kib / 1024,
})
imemory.append(attr)
@ -502,7 +549,6 @@ class NodeOperator(object):
self.total_memory_MiB = 0
re_node_MemTotal = re.compile(r'^Node\s+\d+\s+\MemTotal:\s+(\d+)')
re_node_MemFreeInit = re.compile(r'^Node\s+\d+\s+\MemFreeInit:\s+(\d+)')
re_node_MemFree = re.compile(r'^Node\s+\d+\s+\MemFree:\s+(\d+)')
re_node_FilePages = re.compile(r'^Node\s+\d+\s+\FilePages:\s+(\d+)')
re_node_SReclaim = re.compile(r'^Node\s+\d+\s+\SReclaimable:\s+(\d+)')
@ -538,19 +584,6 @@ class NodeOperator(object):
# silently ignore IO errors (eg. file missing)
pass
# WRS kernel customization to exclude kernel overheads
meminfo = "/sys/devices/system/node/node%d/meminfo_extra" % node
try:
with open(meminfo, 'r') as infile:
for line in infile:
match = re_node_MemFreeInit.search(line)
if match:
Total_MiB = int(match.group(1))
continue
except IOError:
# silently ignore IO errors (eg. file missing)
pass
Total_MiB /= 1024
Free_MiB /= 1024
self.total_memory_nodes_MiB.append(Total_MiB)

View File

@ -436,8 +436,7 @@ class CPUController(rest.RestController):
rpc_port.save()
if (utils.get_system_mode() == constants.SYSTEM_MODE_SIMPLEX and
action == constants.APPLY_ACTION):
if action == constants.APPLY_ACTION:
# perform rpc to conductor to perform config apply
pecan.request.rpcapi.update_cpu_config(
pecan.request.context)

View File

@ -305,9 +305,9 @@ class HostStatesController(rest.RestController):
(cpu.uuid, values))
pecan.request.dbapi.icpu_update(cpu.uuid, values)
# perform inservice apply if this is a controller in simplex state
if utils.is_host_simplex_controller(ihost):
pecan.request.rpcapi.update_cpu_config(pecan.request.context)
# perform inservice apply
pecan.request.rpcapi.update_cpu_config(pecan.request.context,
host_uuid)
return self._get_host_cpus_collection(ihost.uuid)
@ -3478,6 +3478,46 @@ class HostController(rest.RestController):
(ihost['hostname'], values))
pecan.request.dbapi.imemory_update(m.uuid, values)
@staticmethod
def _update_vm_4k_pages(ihost):
    """Recalculate and store the VM 4K page count of each memory record.

    For every memory record of the host's nodes that has hugepages
    configured, the memory left after the platform reservation, the AVS
    huge pages and the VM 2M/1G huge pages (pending values take precedence
    over current ones) is converted to 4K pages. Results below the
    minimum are clipped to 0. The value is saved as vm_hugepages_nr_4K.
    """
    dbapi = pecan.request.dbapi
    for node in dbapi.inode_get_by_ihost(ihost['uuid']):
        for m in dbapi.imemory_get_by_inode(node['id']):
            if not m.hugepages_configured:
                continue

            # A pending change, when there is one, overrides the current value.
            if m.vm_hugepages_nr_2M_pending is not None:
                nr_2M = m.vm_hugepages_nr_2M_pending
            else:
                nr_2M = m.vm_hugepages_nr_2M
            if m.vm_hugepages_nr_1G_pending is not None:
                nr_1G = m.vm_hugepages_nr_1G_pending
            else:
                nr_1G = m.vm_hugepages_nr_1G

            # Memory (MiB) left over for 4K pages.
            remaining_mib = m.node_memtotal_mib - m.platform_reserved_mib
            remaining_mib -= m.avs_hugepages_nr * m.avs_hugepages_size_mib
            remaining_mib -= constants.MIB_2M * nr_2M
            remaining_mib -= constants.MIB_1G * nr_1G
            pages_4K = constants.NUM_4K_PER_MiB * remaining_mib

            # Clip 4K pages
            min_4K = 32 * constants.Ki / 4
            if pages_4K < min_4K:
                pages_4K = 0

            LOG.info("Set VM 4K pages for host (%s) node (%d) pages "
                     "(%d)" % (ihost['hostname'], node['id'],
                               pages_4K))
            dbapi.imemory_update(m.uuid, {'vm_hugepages_nr_4K': pages_4K})
@staticmethod
def _semantic_mtc_check_action(hostupdate, action):
"""
@ -4739,6 +4779,9 @@ class HostController(rest.RestController):
if align_2M_memory or align_1G_memory:
self._align_pending_memory(ihost, align_2M_memory, align_1G_memory)
# calculate the VM 4K huge pages for nova
self._update_vm_4k_pages(ihost)
if cutils.is_virtual() or cutils.is_virtual_compute(ihost):
mib_platform_reserved_no_io = mib_reserved
required_platform = \

View File

@ -206,6 +206,8 @@ REGION_SECONDARY = "External"
# Hugepage sizes in MiB
MIB_2M = 2
MIB_1G = 1024
Ki = 1024
NUM_4K_PER_MiB = 256
# Dynamic IO Resident Set Size(RSS) in MiB per socket
DISK_IO_RESIDENT_SET_SIZE_MIB = 2000

View File

@ -2553,7 +2553,8 @@ class ConductorManager(service.PeriodicService):
LOG.info('%9s : %s' % ('thread_id', t))
def icpus_update_by_ihost(self, context,
ihost_uuid, icpu_dict_array):
ihost_uuid, icpu_dict_array,
force_grub_update=False):
"""Create cpus for an ihost with the supplied data.
This method allows records for cpus for ihost to be created.
@ -2561,6 +2562,7 @@ class ConductorManager(service.PeriodicService):
:param context: an admin context
:param ihost_uuid: ihost uuid unique id
:param icpu_dict_array: initial values for cpu objects
:param force_grub_update: bool value to force grub update
:returns: pass or fail
"""
@ -2626,6 +2628,9 @@ class ConductorManager(service.PeriodicService):
subfunctions=ihost.get('subfunctions'),
reference='current (unchanged)',
sockets=cs, cores=cc, threads=ct)
if ihost.administrative == constants.ADMIN_LOCKED and \
force_grub_update:
self.update_cpu_config(context, ihost_uuid)
return
self.print_cpu_topology(hostname=ihost.get('hostname'),
@ -2679,9 +2684,15 @@ class ConductorManager(service.PeriodicService):
# info may have already been posted
pass
if (utils.is_host_simplex_controller(ihost) and
ihost.administrative == constants.ADMIN_LOCKED):
self.update_cpu_config(context)
# if it is the first controller wait for the initial config to
# be completed
if ((utils.is_host_simplex_controller(ihost) and
os.path.isfile(tsc.INITIAL_CONFIG_COMPLETE_FLAG)) or
(not utils.is_host_simplex_controller(ihost) and
ihost.administrative == constants.ADMIN_LOCKED)):
LOG.info("Update CPU grub config, host_uuid (%s), name (%s)"
% (ihost_uuid, ihost.get('hostname')))
self.update_cpu_config(context, ihost_uuid)
return
@ -2753,6 +2764,13 @@ class ConductorManager(service.PeriodicService):
mem = self.dbapi.imemory_create(forihostid, mem_dict)
else:
for imem in imems:
# Include 4K pages in the displayed VM memtotal
if imem.vm_hugepages_nr_4K is not None:
vm_4K_mib = \
(imem.vm_hugepages_nr_4K /
constants.NUM_4K_PER_MiB)
mem_dict['memtotal_mib'] += vm_4K_mib
mem_dict['memavail_mib'] += vm_4K_mib
pmem = self.dbapi.imemory_update(imem['uuid'],
mem_dict)
except:
@ -6689,19 +6707,28 @@ class ConductorManager(service.PeriodicService):
# discard temporary file
os.remove(hosts_file_temp)
def update_cpu_config(self, context):
"""Update the cpu assignment configuration on an AIO system"""
LOG.info("update_cpu_config")
def update_cpu_config(self, context, host_uuid):
"""Update the cpu assignment configuration on a host"""
try:
hostname = socket.gethostname()
host = self.dbapi.ihost_get(hostname)
except Exception as e:
LOG.warn("Failed to get local host object: %s", str(e))
return
command = ['/etc/init.d/compute-huge.sh', 'reload']
rpcapi = agent_rpcapi.AgentAPI()
rpcapi.execute_command(context, host_uuid=host.uuid, command=command)
# only apply the manifest on the host that has compute sub function
host = self.dbapi.ihost_get(host_uuid)
if constants.COMPUTE in host.subfunctions:
# force is True for every host except a simplex controller; the value
# is handed on unchanged to _config_apply_runtime_manifest below
force = (not utils.is_host_simplex_controller(host))
LOG.info("update_cpu_config, host uuid: (%s), force: (%s)",
host_uuid, str(force))
personalities = [constants.CONTROLLER, constants.COMPUTE]
# the config_uuid returned here is the one passed to the runtime
# manifest apply at the end of this method
config_uuid = self._config_update_hosts(context,
personalities,
host_uuid=host_uuid)
# request only the platform::compute::grub::runtime class, and list
# only this host in host_uuids
config_dict = {
"personalities": personalities,
"host_uuids": [host_uuid],
"classes": ['platform::compute::grub::runtime']
}
self._config_apply_runtime_manifest(context, config_uuid,
config_dict,
force=force,
host_uuid=host_uuid)
def _update_resolv_file(self, context, config_uuid, personalities):
"""Generate and update the resolv.conf files on the system"""
@ -7403,7 +7430,8 @@ class ConductorManager(service.PeriodicService):
context,
config_uuid,
config_dict,
host_uuid=None):
host_uuid=None,
force=False):
"""Apply manifests on all hosts affected by the supplied personalities.
If host_uuid is set, only update hiera data for that host
@ -7413,8 +7441,10 @@ class ConductorManager(service.PeriodicService):
# is not set. If host_uuid is set only update hiera data for that host
self._config_update_puppet(config_uuid,
config_dict,
host_uuid=host_uuid)
host_uuid=host_uuid,
force=force)
config_dict.update({'force': force})
rpcapi = agent_rpcapi.AgentAPI()
rpcapi.config_apply_runtime_manifest(context,
config_uuid=config_uuid,

View File

@ -282,7 +282,9 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
inuma_dict_array=inuma_dict_array))
def icpus_update_by_ihost(self, context,
ihost_uuid, icpu_dict_array):
ihost_uuid, icpu_dict_array,
force_grub_update,
):
"""Create cpus for an ihost with the supplied data.
This method allows records for cpus for ihost to be created.
@ -290,13 +292,15 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
:param context: an admin context
:param ihost_uuid: ihost uuid unique id
:param icpu_dict_array: initial values for cpu objects
:param force_grub_update: bool value to force grub update
:returns: pass or fail
"""
return self.call(context,
self.make_msg('icpus_update_by_ihost',
ihost_uuid=ihost_uuid,
icpu_dict_array=icpu_dict_array))
icpu_dict_array=icpu_dict_array,
force_grub_update=force_grub_update))
def imemory_update_by_ihost(self, context,
ihost_uuid, imemory_dict_array):
@ -834,13 +838,15 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
status=status,
error=error))
def update_cpu_config(self, context):
def update_cpu_config(self, context, host_uuid):
"""Synchronously, have the conductor update the cpu
configuration.
:param context: request context.
:param host_uuid: host unique uuid
"""
return self.call(context, self.make_msg('update_cpu_config'))
# host_uuid travels in the RPC message as a keyword argument of the same
# name, alongside the 'update_cpu_config' method name
return self.call(context, self.make_msg('update_cpu_config',
host_uuid=host_uuid))
def iconfig_update_by_ihost(self, context,
ihost_uuid, imsg_dict):

View File

@ -4,6 +4,7 @@
# SPDX-License-Identifier: Apache-2.0
#
import collections
import abc
import itertools
import netaddr
@ -213,3 +214,11 @@ class BasePuppet(object):
s = "%s-%s" % (rng[0][1], rng[-1][1])
ranges.append(s)
return ','.join(ranges)
def _get_numa_index_list(self, obj):
"""Create map of objects indexed by numa node"""
obj_lists = collections.defaultdict(list)
for index, o in enumerate(obj):
o["_index"] = index
obj_lists[o.numa_node].append(o)
return obj_lists