config/compute-huge/compute-huge/compute-huge.sh

#!/bin/bash
################################################################################
# Copyright (c) 2013-2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#
################################################################################
# compute-huge.sh
# - mounts hugepages memory backing for libvirt/qemu and vswitch
# - allocates per-NUMA node hugepages values based on compute node
#   topology and memory engineered parameters.
# - IMPORTANT: mount of hugetlbfs must be called after udev is
#   initialized, otherwise libvirt/qemu will not properly recognize
#   the mount as HugeTLBFS.
# - generates /etc/nova/compute_extend.conf which nova-compute reads on init
# - updates grub.conf kernel boot arg parameters based on hugepages and cores

. /usr/bin/tsconfig

# Enable the 'extglob' feature to allow grouping in pattern matching
shopt -s extglob

# Utility functions
LOG_FUNCTIONS=${LOG_FUNCTIONS:-"/etc/init.d/log_functions.sh"}
CPUMAP_FUNCTIONS=${CPUMAP_FUNCTIONS:-"/etc/init.d/cpumap_functions.sh"}
source /etc/init.d/functions
[[ -e ${LOG_FUNCTIONS} ]] && source ${LOG_FUNCTIONS}
[[ -e ${CPUMAP_FUNCTIONS} ]] && source ${CPUMAP_FUNCTIONS}

# Configuration
PRODUCT_NAME=$(dmidecode --string 'system-product-name' 2>/dev/null)
RESERVE_CONF=${RESERVE_CONF:-"/etc/nova/compute_reserved.conf"}
VSWITCH_CONF=${VSWITCH_CONF:-"/etc/vswitch/vswitch.conf"}
linkname=$(readlink -n -f $0)
scriptname=$(basename $linkname)

# Enable debug logs (uncomment)
LOG_DEBUG=1

# Flag file that is touched to signal that it is safe to enable the board
COMPUTE_HUGE_GOENABLED="/var/run/compute_huge_goenabled"

# Flag file that is touched to signal that compute-huge has run at least once
COMPUTE_HUGE_RUN_ONCE="/etc/platform/.compute_huge_run_once"

# Flag file that is touched to indicate that hei host needs a reboot to finish the config
RECONFIG_REBOOT_REQUIRED="/var/run/.reconfig_reboot_required"

# Grub configuration files
GRUB_DEFAULTS=/etc/default/grub
if [ -f /etc/centos-release ] ; then
    GRUB=grub2-mkconfig
    if [ -d /sys/firmware/efi ] ; then
        GRUB_CONFIG=/boot/efi/EFI/centos/grub.cfg
    else
        GRUB_CONFIG=/boot/grub2/grub.cfg
    fi
else
    GRUB=grub-mkconfig
    GRUB_CONFIG=/boot/grub/grub.cfg
fi

# Various globals
declare -i N_CPUS=1
declare -i N_SOCKETS=1
declare -i N_SIBLINGS_IN_PKG=1
declare -i N_CORES_IN_PKG=1
declare -i N_THREADS=1
declare -i N_NUMA=1
declare -i MEMTOTAL_MiB=0
declare -i do_huge=1
declare -i is_reconfig=0

# Disable Broadwell kvm-intel.eptad flag to prevent kernel oops/memory issues.
declare BROADWELL_EPTAD="0" # Broadwell flag kvm-intel.eptad (0=disable, 1=enable)

# NOTE: cgroups currently disabled - this was previously working with DEV 0001,
# however we now get write permission errors. cgroups is supported by libvirt
# to give domain accounting, but is optional. Likely need to re-enable this to
# support performance measurements.
declare -i do_cgroups=0

# Ensure that first configuration doesn't contain stale info,
# clear these fields prior to reading config file.
if [ ! -f ${COMPUTE_HUGE_RUN_ONCE} ]; then
    sed -i "s#^COMPUTE_VM_MEMORY_2M=.*\$#COMPUTE_VM_MEMORY_2M=\(\)#" ${RESERVE_CONF}
    sed -i "s#^COMPUTE_VM_MEMORY_1G=.*\$#COMPUTE_VM_MEMORY_1G=\(\)#" ${RESERVE_CONF}
fi

# Load configuration files (declare arrays that get sourced)
declare -a COMPUTE_PLATFORM_CORES
declare -a COMPUTE_VSWITCH_CORES
declare -a COMPUTE_VSWITCH_MEMORY
declare -a COMPUTE_VM_MEMORY_2M
declare -a COMPUTE_VM_MEMORY_1G
[[ -e ${RESERVE_CONF} ]] && source ${RESERVE_CONF}
[[ -e ${VSWITCH_CONF} ]] && source ${VSWITCH_CONF}
. /etc/platform/platform.conf

################################################################################
# vswitch_cpu_list() - compute the vswitch cpu list, including it's siblings
################################################################################
function vswitch_cpu_list() {
    local CONF_FILE=${VSWITCH_CONF}
    local KEY="VSWITCH_CPU_LIST="

    provision_list=$(curl -sf http://controller:6385/v1/ihosts/${UUID}/icpus/vswitch_cpu_list)
    if [ $? -eq 0 ]; then
       list=`echo ${provision_list} | bc`
       grep ${KEY} ${CONF_FILE} > /dev/null
       if [ $? -ne 0 ]; then
          echo "$KEY\"$list"\" >>  ${CONF_FILE}
       else
          #update vswitch.conf
          sed -i "s/^VSWITCH_CPU_LIST=.*/VSWITCH_CPU_LIST=\"${list}\"/" /etc/vswitch/vswitch.conf
       fi
    else
       list=$(get_vswitch_cpu_list)
    fi
    # Expand vswitch cpulist
    vswitch_cpulist=$(expand_sequence ${list} " ")

    cpulist=""
    for e in $vswitch_cpulist
    do
       # claim hyperthread siblings if SMT enabled
       SIBLINGS_CPULIST=$(cat /sys/devices/system/cpu/cpu${e}/topology/thread_siblings_list 2>/dev/null)
       siblings_cpulist=$(expand_sequence ${SIBLINGS_CPULIST} " ")
       for s in $siblings_cpulist
       do
          in_list ${s} ${cpulist}
          if [ $? -eq 1 ]
          then
              cpulist=$(append_list ${s} ${cpulist})
          fi
       done
    done

    echo "$cpulist"
    return 0
}

################################################################################
# platform_cpu_list() - compute the platform cpu list, including it's siblings
################################################################################
function platform_cpu_list() {
    local CONF_FILE=${RESERVE_CONF}
    local KEY="PLATFORM_CPU_LIST="

    provision_list=$(curl -sf http://controller:6385/v1/ihosts/${UUID}/icpus/platform_cpu_list)
    if [ $? -eq 0 ]; then
       list=`echo ${provision_list} | bc`
       grep ${KEY} ${CONF_FILE} > /dev/null
       if [ $? -ne 0 ]; then
          echo "$KEY\"$list"\" >>  ${CONF_FILE}
       else
          #update compute_reserved.conf
          sed -i "s/^${KEY}.*/${KEY}\"${list}\"/" ${CONF_FILE}
       fi
    else
       list=$(get_platform_cpu_list)
    fi
    # Expand platform cpulist
    platform_cpulist=$(expand_sequence ${list} " ")

    cpulist=""
    for e in $platform_cpulist
    do
       # claim hyperthread siblings if SMT enabled
       SIBLINGS_CPULIST=$(cat /sys/devices/system/cpu/cpu${e}/topology/thread_siblings_list 2>/dev/null)
       siblings_cpulist=$(expand_sequence ${SIBLINGS_CPULIST} " ")
       for s in $siblings_cpulist
       do
          in_list ${s} ${cpulist}
          if [ $? -eq 1 ]
          then
              cpulist=$(append_list ${s} ${cpulist})
          fi
       done
    done

    echo "$cpulist"
    return 0
}

################################################################################
# check_cpu_configuration() - check that the current state of the CPU (e.g.,
# hyperthreading enabled/disabled) matches the expected state that was last
# written to the configuration file.
#
# NOTE: Puppet manifests are generated on unlock via sysinv profile.
#       Config file is updated via manifest (cgcs_vswitch_095).
#
################################################################################
function check_cpu_configuration() {
    local CONFIGURED=$(condense_sequence $(expand_sequence ${COMPUTE_CPU_LIST} " "))
    local ACTUAL="0-$((${N_CPUS} - 1))"
    local INIT="0-1"

    if [ -z "${CONFIGURED}" -o -z "${ACTUAL}" ]; then
       log_error "Unable to compare configured=${CONFIGURED} and actual=${ACTUAL} CPU configurations"
       return 2
    fi

    if [ "${CONFIGURED}" == "${INIT}" ]; then
       log_debug "CPU configuration init: configured=${CONFIGURED} and actual=${ACTUAL}"
       return 0
    fi

    if [ "${CONFIGURED}" != "${ACTUAL}" ]; then
        log_error "CPU configurations mismatched: configured=${CONFIGURED} and actual=${ACTUAL}"
        return 1
    fi

    return 0
}

################################################################################
# check_kernel_boot_args() - check that the kernel boot arguments are in
# agreement with the current set of logical CPU instances. That is, check that
# the hyperthreading state has not changed since the last time we updated our
# grub configuration.
# - check Broadwell kvm-intel.eptad flag is in agreement with current setting
#
################################################################################
function check_kernel_boot_args() {
  local BASE_CPULIST=$1
  local ISOL_CPULIST=$2

  local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS})
  local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS})
  local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS})

  ## Query the current boot args and store them in a hash/map for easy access
  local CMDLINE=($(cat /proc/cmdline))
  declare -A BOOTARGS
  for ITEM in ${CMDLINE[@]}; do
     KV=(${ITEM//=/ })
     BOOTARGS[${KV[0]}]=${KV[1]}
  done

  ## Audit the attributes that impacts VM scheduling behaviour
  if [ "${BOOTARGS[isolcpus]}" != "${ISOL_CPULIST}" ]; then
      log_error "Kernel boot argument mismatch: isolcpus=${BOOTARGS[isolcpus]} expecting ${ISOL_CPULIST}"
      return 1
  fi

  if [ "${BOOTARGS[rcu_nocbs]}" != "${RCU_NOCBS_CPULIST}" ]; then
      log_error "Kernel boot argument mismatch: rcu_nocbs=${BOOTARGS[rcu_nocbs]} expecting ${RCU_NOCBS_CPULIST}"
      return 1
  fi

  if [ "${BOOTARGS[kthread_cpus]}" != "${BASE_CPULIST}" ]; then
      log_error "Kernel boot argument mismatch: kthread_cpus=${BOOTARGS[kthread_cpus]} expecting ${BASE_CPULIST}"
      return 1
  fi

  if [ "${BOOTARGS[irqaffinity]}" != "${BASE_CPULIST}" ]; then
      log_error "Kernel boot argument mismatch: irqaffinity=${BOOTARGS[irqaffinity]} expecting ${BASE_CPULIST}"
      return 1
  fi

  if grep -q -E "^model\s+:\s+79$" /proc/cpuinfo
  then
      if [ "${BOOTARGS[kvm-intel.eptad]}" != "${BROADWELL_EPTAD}" ]; then
          log_error "Kernel boot argument mismatch: kvm-intel.eptad=${BOOTARGS[kvm-intel.eptad]} expecting ${BROADWELL_EPTAD}"
          return 1
      fi
  fi

  return 0
}

################################################################################
# update_grub_configuration() - update the grub configuration so that the
# kernel boot arguments are correct on the next reboot.
#
################################################################################
function update_grub_configuration() {
  local BASE_CPULIST=$1
  local ISOL_CPULIST=$2

  local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS})
  local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS})
  local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS})

  log "Updating grub configuration:"

  if [ ! -f ${GRUB_DEFAULTS} ]; then
      log_error "Missing grub defaults file ${GRUB_DEFAULTS}"
      return 1
  fi

  if [ ! -f ${GRUB_CONFIG} ]; then
      log_error "Missing grub config file ${GRUB_CONFIG}"
      return 1
  fi

  source ${GRUB_DEFAULTS}
  if [ -z "${GRUB_CMDLINE_LINUX}" ]; then
      log_error "Missing grub cmdline variable: GRUB_CMDLINE_LINUX"
      return 1
  fi

  ## Remove the arguments that we need to update (or remove)
  VALUE="${GRUB_CMDLINE_LINUX//?([[:blank:]])+(kvm-intel.eptad|default_hugepagesz|hugepagesz|hugepages|isolcpus|nohz_full|rcu_nocbs|kthread_cpus|irqaffinity)=+([-,0-9MG])/}"

  ## Add the new argument values

  # Broadwell specific flags (model: 79)
  if grep -q -E "^model\s+:\s+79$" /proc/cpuinfo
  then
      VALUE="${VALUE} kvm-intel.eptad=${BROADWELL_EPTAD}"
  fi
  if grep -q pdpe1gb /proc/cpuinfo
  then
      VALUE="${VALUE} hugepagesz=1G hugepages=${N_NUMA}"
  fi
  VALUE="${VALUE} hugepagesz=2M hugepages=0"
  VALUE="${VALUE} default_hugepagesz=2M"
  VALUE="${VALUE} isolcpus=${ISOL_CPULIST}"
  VALUE="${VALUE} rcu_nocbs=${RCU_NOCBS_CPULIST}"
  VALUE="${VALUE} kthread_cpus=${BASE_CPULIST}"
  VALUE="${VALUE} irqaffinity=${BASE_CPULIST}"
  if [[ "$subfunction" == *"compute,lowlatency" ]]; then
      # As force_grub_update() and check_cpu_grub_configuration call this
      # function with an ISOL_CPULIST with from lowlatency compute checks we'll
      # use it here for the nohz_full option
      VALUE="${VALUE} nohz_full=${ISOL_CPULIST}"
  fi

  if [ "${VALUE}" == "${GRUB_CMDLINE_LINUX}" ] &&
    grep -q -e "${GRUB_CMDLINE_LINUX}" /proc/cmdline
  then
      log_debug "Unchanged cmdline: ${GRUB_CMDLINE_LINUX}"
      return 0
  fi

  ## Replace the value in the file and re-run the grub config tool
  perl -pi -e 's/(GRUB_CMDLINE_LINUX)=.*/\1=\"'"${VALUE}"'\"/g' ${GRUB_DEFAULTS}
  ${GRUB} -o ${GRUB_CONFIG} 2>/dev/null
  RET=$?
  if [ ${RET} -ne 0 ]; then
      log_error "Failed to run grub-mkconfig, rc=${RET}"
      return 1
  fi
  source ${GRUB_DEFAULTS}
  if [ -z "${GRUB_CMDLINE_LINUX}" ]; then
      log_error "Missing grub cmdline variable: GRUB_CMDLINE_LINUX"
      return 1
  else
      log_debug "Updated cmdline: ${GRUB_CMDLINE_LINUX}"
  fi
  sync

  return 0
}

################################################################################
# force_grub_update() - force an update to the grub configuration so that the
# kernel boot arguments are correct on the next reboot.
#
################################################################################
function force_grub_update() {
  log_debug "stop: force_grub_update"

  ## fetch the cpu topology
  get_topology

  ## calculate the base and isolation cpu lists
  local BASE_CPULIST=$(platform_cpu_list)
  local ISOL_CPULIST=$(vswitch_cpu_list)

  if [[ "$subfunction" == *"compute,lowlatency" ]]; then
      local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS})
      local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS})
      local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS})

      ISOL_CPULIST=$RCU_NOCBS_CPULIST
  fi

  if [ -z "${ISOL_CPULIST}" ]; then
      log_error "isolcpus cpu list is empty"
      return 1
  fi

  ## update grub with new settings
  update_grub_configuration ${BASE_CPULIST} ${ISOL_CPULIST}
  RET=$?

  return ${RET}
}

################################################################################
# check_cpu_grub_configuration() - check kernel boot arguments to ensure
# that the current CPU configuration matches the isolation and platform arguments
#  passed to the kernel at boot time.
#
################################################################################
function check_cpu_grub_configuration() {
  ## calculate the base and isolation cpu lists
  local BASE_CPULIST=$(platform_cpu_list)
  local ISOL_CPULIST=$(vswitch_cpu_list)

  if [[ "$subfunction" == *"compute,lowlatency" ]]; then
      local BASE_CPUMAP=$(cpulist_to_cpumap ${BASE_CPULIST} ${N_CPUS})
      local RCU_NOCBS_CPUMAP=$(invert_cpumap ${BASE_CPUMAP} ${N_CPUS})
      local RCU_NOCBS_CPULIST=$(cpumap_to_cpulist ${RCU_NOCBS_CPUMAP} ${N_CPUS})

      ISOL_CPULIST=$RCU_NOCBS_CPULIST
  fi

  if [ -z "${ISOL_CPULIST}" ]; then
      log_error "isolcpus cpu list is empty"
      return 1
  fi

  if [ -z "${BASE_CPULIST}" ]; then
      log_error "platform cpu list is empty"
      return 1
  fi

  ## check that the boot arguments are consistent with the current
  ## base/isolation cpu lists
  check_kernel_boot_args ${BASE_CPULIST} ${ISOL_CPULIST}
  RET=$?
  if [ ${RET} -eq 1 ]; then
      log_error "Boot args check failed; updating grub configuration"
      update_grub_configuration ${BASE_CPULIST} ${ISOL_CPULIST}
      RET=$?
      if [ ${RET} -ne 0 ]; then
          log_error "Failed to update grub configuration, rc=${RET}"
          return 2
      fi

      return 1
  fi

  return 0
}

################################################################################
# check_configuration() - check system configuration
#
################################################################################
function check_configuration() {
    ## Since script is called multiple times, remove previous flag
    rm -f ${COMPUTE_HUGE_GOENABLED}

    if [ -z "${N_CPUS}" ]; then
        log_error "N_CPUS environment variable not set"
        return 1
    fi

    # Check that the actual CPU configuration matches configured settings
    check_cpu_configuration
    RET1=$?
    if [ ${RET1} -gt 1 ]; then
        return ${RET1}
    fi

    # Check that CPU isolation and platform configuration has been applied according to the
    # current CPU configuration
    check_cpu_grub_configuration
    RET2=$?
    if [ ${RET2} -gt 1 ]; then
        return ${RET2}
    fi

    RET=$[ ${RET1} + ${RET2} ]
    if [ ${RET} -eq 0 ]; then
        ## All checks passed; safe to enable
        log_debug "compute-huge-goenabled: pass"
        touch ${COMPUTE_HUGE_GOENABLED}
    elif [ "$nodetype" = "controller" \
            -a ! -f ${COMPUTE_HUGE_RUN_ONCE} \
            -a ! -f ${PLATFORM_SIMPLEX_FLAG} ]; then
        touch ${COMPUTE_HUGE_RUN_ONCE}
        log_debug "Rebooting to process config changes"
        /sbin/reboot
    else
        log_error "compute-huge-goenabled: failed"
        if [ ! -f ${COMPUTE_HUGE_RUN_ONCE} ]; then
            touch ${RECONFIG_REBOOT_REQUIRED}
        fi
    fi

    # Mark when configuration run via compute_config packstack applyscript
    if [ ${is_reconfig} -eq 1 ]; then
        if [ ! -f ${COMPUTE_HUGE_RUN_ONCE} ]; then
            log_debug "check_configuration: config FIRST_RUN"
        else
            log_debug "check_configuration: config"
        fi
        touch ${COMPUTE_HUGE_RUN_ONCE}
    fi

    return 0
}


################################################################################
# get_topology() - deduce CPU and NUMA topology
#
################################################################################
function get_topology() {
    # number of logical cpus
    N_CPUS=$(cat /proc/cpuinfo 2>/dev/null | \
        awk '/^[pP]rocessor/ { n +=1 } END { print (n>0) ? n : 1}')

    # number of sockets (i.e. packages)
    N_SOCKETS=$(cat /proc/cpuinfo 2>/dev/null | \
        awk '/physical id/ { a[$4] = 1; } END { n=0; for (i in a) n++; print (n>0) ? n : 1 }')

    # number of logical cpu siblings per package
    N_SIBLINGS_IN_PKG=$(cat /proc/cpuinfo 2>/dev/null | \
        awk '/^siblings/ {n = $3} END { print (n>0) ? n: 1 }')

    # number of cores per package
    N_CORES_IN_PKG=$(cat /proc/cpuinfo 2>/dev/null | \
        awk '/^cpu cores/ {n = $4} END { print (n>0) ? n : 1 }')

    # number of SMT threads per core
    N_THREADS=$[ $N_SIBLINGS_IN_PKG / $N_CORES_IN_PKG ]

    # number of numa nodes
    N_NUMA=$(ls -d /sys/devices/system/node/node* 2>/dev/null | wc -l)

    # Total physical memory
    MEMTOTAL_MiB=$(cat /proc/meminfo 2>/dev/null | \
        awk '/^MemTotal/ {n = int($2/1024)} END { print (n>0) ? n : 0 }')

    log_debug "TOPOLOGY: CPUS:${N_CPUS} SOCKETS:${N_SOCKETS}" \
              "SIBLINGS:${N_SIBLINGS_IN_PKG} CORES:${N_CORES_IN_PKG} THREADS:${N_THREADS}" \
              "NODES:${N_NUMA} MEMTOTAL:${MEMTOTAL_MiB} MiB"

    # Get kernel command line options
    CMDLINE=$(cat /proc/cmdline 2>/dev/null)
    if [[ $CMDLINE =~ (console=.*) ]]; then
        log_debug "cmdline: ${BASH_REMATCH[1]}"
    fi
}

################################################################################
# is_strict() - determine whether we are using strict memory accounting
#
################################################################################
function is_strict() {
    RET=0
    OC_MEM=$(cat /proc/sys/vm/overcommit_memory 2>/dev/null)
    if [ ${OC_MEM} -eq 2 ]; then
        echo 1  # strict
    else
        echo 0  # non-strict
    fi
}

################################################################################
# get_memory() - determine memory breakdown for standard linux memory and
#                default hugepages
#
################################################################################
function get_memory() {
    local NODESYSFS=/sys/devices/system/node
    local HTLBSYSFS=""
    local -i Ki=1024
    local -i Ki2=512
    local -i SZ_2M_Ki=2048
    local -i SZ_1G_Ki=1048576

    # number of numa nodes
    local n_numa=$(ls -d /sys/devices/system/node/node* 2>/dev/null | wc -l)

    # Parse all values of /proc/meminfo
    declare -gA meminfo
    while read -r line
    do
        if [[ $line =~ ^([[:alnum:]_]+):[[:space:]]+([[:digit:]]+) ]]; then
            meminfo[${BASH_REMATCH[1]}]=${BASH_REMATCH[2]}
        fi
    done < "/proc/meminfo"

    # Parse all values of /sys/devices/system/node/node*/meminfo
    declare -gA memnode
    for ((node=0; node < n_numa; node++))
    do
        while read -r line
        do
            if [[ $line =~ ^Node[[:space:]]+[[:digit:]]+[[:space:]]+([[:alnum:]_]+):[[:space:]]+([[:digit:]]+) ]]; then
                memnode[$node,${BASH_REMATCH[1]}]=${BASH_REMATCH[2]}
            fi
        done < "/sys/devices/system/node/node${node}/meminfo"
    done

    # Parse all values of /sys/devices/system/node/node*/meminfo_extra
    for ((node=0; node < n_numa; node++))
    do
        memnode[$node,'MemFreeInit']=${memnode[$node,'MemTotal']}
        if [ -f /sys/devices/system/node/node${node}/meminfo_extra ]; then
            while read -r line
            do
                if [[ $line =~ ^Node[[:space:]]+[[:digit:]]+[[:space:]]+([[:alnum:]_]+):[[:space:]]+([[:digit:]]+) ]]; then
                    memnode[$node,${BASH_REMATCH[1]}]=${BASH_REMATCH[2]}
                fi
            done < "/sys/devices/system/node/node${node}/meminfo_extra"
        fi
    done

    # Parse all values of /sys/devices/system/node/node*/hugepages/hugepages-${pgsize}kB
    declare -a pgsizes
    pgsizes+=(${SZ_2M_Ki})
    pgsizes+=(${SZ_1G_Ki})
    for ((node=0; node < n_numa; node++))
    do
        for pgsize in ${pgsizes[@]}
        do
            memnode[$node,$pgsize,'nr']=0
            memnode[$node,$pgsize,'nf']=0
        done
    done
    for ((node=0; node < n_numa; node++))
    do
        for pgsize in ${pgsizes[@]}
        do
            HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB
            if [ -d ${HTLBSYSFS} ]; then
                memnode[$node,$pgsize,'nr']=$(cat ${HTLBSYSFS}/nr_hugepages)
                memnode[$node,$pgsize,'nf']=$(cat ${HTLBSYSFS}/free_hugepages)
            fi
        done
    done

    # Calculate available memory
    is_strict=$(is_strict)
    if [ $is_strict -eq 1 ]; then
        strict_msg='strict accounting'
        meminfo['Avail']=$[ ${meminfo['CommitLimit']} - ${meminfo['Committed_AS']} ]
    else
        strict_msg='non-strict accounting'
        meminfo['Avail']=$[ ${meminfo['MemFree']} +
                            ${meminfo['Cached']} +
                            ${meminfo['Buffers']} +
                            ${meminfo['SReclaimable']} ]
    fi
    # Used memory (this includes kernel overhead, so it is a bit bogus)
    meminfo['Used']=$[ ${meminfo['MemTotal']} - ${meminfo['Avail']} ]
    for ((node=0; node < n_numa; node++))
    do
        memnode[${node},'Avail']=$[ ${memnode[$node,'MemFree']} +
                                    ${memnode[$node,'FilePages']} +
                                    ${memnode[$node,'SReclaimable']} ]
        memnode[${node},'HTot']=0
        memnode[${node},'HFree']=0
        for pgsize in ${pgsizes[@]}
        do
            memnode[${node},'HTot']=$[ ${memnode[${node},'HTot']} +
                ${pgsize} * ${memnode[$node,${pgsize},'nr']} ]
            memnode[${node},'HFree']=$[ ${memnode[${node},'HFree']} +
                ${pgsize} * ${memnode[$node,${pgsize},'nf']} ]
        done
    done

    # Print memory usage summary
    log_debug "MEMORY OVERALL: MiB (${strict_msg})"

    # Print overall memory
    MEM=$(printf "%6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s %6s" \
        'Tot' 'Used' 'Free' 'Ca' 'Buf' 'Slab' 'CAS' 'CLim' 'Dirty' 'WBack' 'Active' 'Inact' 'Avail')
    log_debug "${MEM}"
    MEM=$(printf "%6d %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d %6d" \
        $[ (${meminfo['MemTotal']}     + $Ki2) / $Ki ] \
        $[ (${meminfo['Used']}         + $Ki2) / $Ki ] \
        $[ (${meminfo['MemFree']}      + $Ki2) / $Ki ] \
        $[ (${meminfo['Cached']}       + $Ki2) / $Ki ] \
        $[ (${meminfo['Buffers']}      + $Ki2) / $Ki ] \
        $[ (${meminfo['Slab']}         + $Ki2) / $Ki ] \
        $[ (${meminfo['Committed_AS']} + $Ki2) / $Ki ] \
        $[ (${meminfo['CommitLimit']}  + $Ki2) / $Ki ] \
        $[ (${meminfo['Dirty']}        + $Ki2) / $Ki ] \
        $[ (${meminfo['Writeback']}    + $Ki2) / $Ki ] \
        $[ (${meminfo['Active']}       + $Ki2) / $Ki ] \
        $[ (${meminfo['Inactive']}     + $Ki2) / $Ki ] \
        $[ (${meminfo['Avail']}        + $Ki2) / $Ki ])
    log_debug "${MEM}"

    # Print per-numa node memorybreakdown
    log_debug "MEMORY PER-NUMA NODE: MiB"
    MEM=""
    for ((node=0; node < n_numa; node++))
    do
        L=$(printf " %7s %7s %7s %7s" "$node:Init" "$node:Avail" "$node:Htot" "$node:HFree")
        MEM="${MEM}${L}"
    done
    log_debug "${MEM}"
    MEM=""
    for ((node=0; node < n_numa; node++))
    do
        L=$(printf " %7d %7d %7d %7d" \
            $[ (${memnode[$node,'MemFreeInit']} + $Ki2) / $Ki ] \
            $[ (${memnode[$node,'Avail']}       + $Ki2) / $Ki ] \
            $[ (${memnode[$node,'HTot']}        + $Ki2) / $Ki ] \
            $[ (${memnode[$node,'HFree']}       + $Ki2) / $Ki ])
        MEM="${MEM}${L}"
    done
    log_debug "${MEM}"
}

################################################################################
# mount_cgroups()
#  - mounts cgroups and all available controllers.
#  - cgroup domains used by libvirt/qemu
#
################################################################################
function mount_cgroups() {
    local RET=0

    # mount /sys/fs/cgroup
    log_debug "Mounting cgroups"
    mountpoint -q /sys/fs/cgroup || \
        mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to mount cgroups, rc=${RET}"
        return ${RET}
    fi

    # mount each available cgroup controller
    for cnt in $(cat /proc/cgroups | awk '!/#/ {print $1;}')
    do
        mkdir -p /sys/fs/cgroup/$cnt
        mountpoint -q /sys/fs/cgroup/$cnt || \
            (mount -n -t cgroup -o $cnt cgroup /sys/fs/cgroup/$cnt || \
            rmdir /sys/fs/cgroup/$cnt || true)
    done
    return ${RET}
}

################################################################################
# mount_resctrl()
#  - mounts resctrl for Cache Allocation Technology
#
################################################################################
function mount_resctrl() {
    local RET=0

    # mount /sys/fs/resctrl
    log_debug "Mounting resctrl"
    mountpoint -q /sys/fs/resctrl || \
        mount -t resctrl resctrl /sys/fs/resctrl
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to mount resctrl, rc=${RET}"
        return ${RET}
    fi

    return ${RET}
}


################################################################################
# Set Power Management QoS resume latency constraints for CPUs.
# The PM QoS resume latency limit is set to shalow C-state for vswitch CPUs.
# All other CPUs are allowed to go to the deepest C-state available.
#
################################################################################
set_pmqos_policy() {
    local RET=0

    if [[ "$subfunction" == *"compute,lowlatency" ]]; then
        ## Set low wakeup latency (shalow C-state) for vswitch CPUs using PM QoS interface
        local VSWITCH_CPULIST=$(vswitch_cpu_list)
        /bin/bash -c "/usr/bin/set-cpu-wakeup-latency.sh low ${VSWITCH_CPULIST}" 2>/dev/null
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to set low wakeup CPU latency for vswitch CPUs ${VSWITCH_CPULIST}, rc=${RET}"
        fi
        ## Set high wakeup latency (deep C-state) for non-vswitch CPUs using PM QoS interface
        local NON_VSWITCH_CPULIST=$(invert_cpulist ${VSWITCH_CPULIST} ${N_CPUS})
        /bin/bash -c "/usr/bin/set-cpu-wakeup-latency.sh high ${NON_VSWITCH_CPULIST}" 2>/dev/null
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to set high wakeup CPU latency for non-vswitch CPUs ${NON_VSWITCH_CPULIST}, rc=${RET}"
        fi
    fi

    return ${RET}
}

################################################################################
# Mounts virtual hugetlbfs filesystems for each supported page size.
#  return: 0 - success; 1 - failure
#
################################################################################
function mount_hugetlbfs_auto
{
    local SYSFSLIST=($(ls -1d /sys/kernel/mm/hugepages/hugepages-*))
    local SYSFS=""
    local RET=0

    if ! grep -q hugetlbfs /proc/filesystems
    then
        log_error "hugetlbfs not enabled"
        return 1
    fi

    for SYSFS in ${SYSFSLIST[@]}; do
        local PGNAME=$(basename $SYSFS)
        local PGSIZE=${PGNAME/hugepages-/}

        local HUGEMNT=/mnt/huge-${PGSIZE}
        log_debug "Mounting hugetlbfs at: $HUGEMNT"
        if [ ! -d ${HUGEMNT} ]; then
            mkdir -p ${HUGEMNT}
        fi

        grep -q ${HUGEMNT} /proc/mounts || \
            mount -t hugetlbfs -o pagesize=${PGSIZE} none ${HUGEMNT}
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to mount hugetlbfs at ${HUGEMNT}, rc=${RET}"
            return ${RET}
        fi
    done

    return ${RET}
}

################################################################################
# Mounts virtual hugetlbfs filesystems for specific supported page size.
#  param: MNT_HUGE - mount point for hugepages
#  param: PGSIZE   - pagesize attribute (eg, 2M, 1G)
#  return: 0 - success; 1 - failure
#
################################################################################
function mount_hugetlbfs
{
    local MNT_HUGE=$1
    local PGSIZE=$2
    local RET=0
    log_debug "Mounting hugetlbfs at: $MNT_HUGE"

    if ! grep -q hugetlbfs /proc/filesystems
    then
        log_error "hugetlbfs not enabled"
        return 1
    fi

    mountpoint -q ${MNT_HUGE}
    if [ $? -eq 1 ]
    then
        mkdir -p ${MNT_HUGE}
        mount -t hugetlbfs -o pagesize=${PGSIZE} hugetlbfs ${MNT_HUGE}
        RET=$?
        if [ ${RET} -ne 0 ]
        then
            log_error "Failed to mount hugetlbfs at ${MNT_HUGE}, rc=${RET}"
            return ${RET}
        fi
    fi
    return 0
}

################################################################################
# Allocates a set of HugeTLB pages according to the specified parameters.
# The first parameter specifies the NUMA node (e.g., node0, node1, etc.).
# The second parameter specifies the HugeTLB page size (e.g, 2048kB,
# 1048576kB, etc).
# The third parameter specifies the number of pages for the given page size.
################################################################################
function allocate_one_pagesize
{
    local NODE=$1
    local PGSIZE=$2
    local PGCOUNT=$3
    local NODESYSFS=/sys/devices/system/node
    local HTLBSYSFS=""
    local RET=0

    log_debug "Allocating ${PGCOUNT} HugeTLB pages of ${PGSIZE} on ${NODE}"

    if [ ! -d "${NODESYSFS}" ]; then
        ## Single NUMA node
        if [ "${NODE}" != "node0" ]; then
            log_error "${NODE} is not valid on a single NUMA node system"
            return 1
        fi
        NODESYSFS=/sys/kernel/mm/
    else
        NODESYSFS=${NODESYSFS}/${NODE}
        if [ ! -d "${NODESYSFS}" ]; then
            log_error "NUMA node ${NODE} does not exist"
            return 1
        fi
    fi

    HTLBSYSFS=${NODESYSFS}/hugepages/hugepages-${PGSIZE}
    if [ ! -d ${HTLBSYSFS} ]; then
        log_error "No HugeTLB support for ${PGSIZE} pages on ${NODE}"
        return 1
    fi

    ## Request pages
    echo ${PGCOUNT} > ${HTLBSYSFS}/nr_hugepages
    RET=$?
    if [ ${RET} -ne 0 ]
    then
        log_error "Failed to allocate ${PGCOUNT} pages on ${HTLBSYSFS}, rc=${RET}"
        return ${RET}
    fi

    return ${RET}
}

################################################################################
# Allocates HugeTLB memory according to the attributes specified in the
# parameter list.  The first parameters is expected to be a reference to an
# array rather than the actual contents of an array.
#
# Each element of the array is expected to be in the following format.
#   "<node>:<pgsize>:<pgcount>"
# For example,
#   ("node0:2048kB:256" "node0:1048576kB:2")
#
################################################################################
function allocate_hugetlb_memory
{
    local MEMLIST=("${!1}")
    local MEMDESC=""
    local ARRAY=""
    local RET=0

    ## Reserve memory for each node + pagesize
    for MEMDESC in ${MEMLIST[@]}
    do
        ARRAY=(${MEMDESC//:/ })
        if [ ${#ARRAY[@]} -ne 3 ]; then
            log_error "Invalid element format ${MEMDESC}, expecting 'node:pgsize:pgcount'"
            return 1
        fi

        NODE=${ARRAY[0]}
        PGSIZE=${ARRAY[1]}
        PGCOUNT=${ARRAY[2]}
        allocate_one_pagesize ${NODE} ${PGSIZE} ${PGCOUNT}
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to setup HugeTLB for ${NODE}:${PGSIZE}:${PGCOUNT}, rc=${RET}"
            return ${RET}
        fi
    done

    return 0
}

################################################################################
# per_numa_resources()
#  - mounts and allocates hugepages for Compute node libvirt
#  - hugepage requirements are calculated per NUMA node
#    based on engineering of BASE and VSWITCH
#  - it is assumed this is done very early in init to prevent fragmentation
#  - calculates reserved cpulists for BASE and vswitch
#
################################################################################
function per_numa_resources() {
    local err=0
    local NODESYSFS=/sys/devices/system/node
    local HTLBSYSFS=""
    local node

    do_huge=${do_huge:-1}

    log_debug "Setting per-NUMA resources: ${PRODUCT_NAME}"

    # Check for per-node NUMA topology
    NODESYSFS0=${NODESYSFS}/node0
    if [ ! -d "${NODESYSFS0}" ]; then
        log_error "NUMA node0 does not exist"
        return 1
    fi

    # Check that we have support for 2MB hugepages
    if [ ${do_huge} -eq 1 ]
    then
        node=0
        pgsize=2048
        HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB
        if [ ! -d ${HTLBSYSFS} ]; then
            do_huge=0
            log_error "No HugeTLB support for ${pgsize}kB pages on node${node}, do_huge=0"
        fi
    fi

    # Workaround: customize /etc/nova/rootwrap.d/
    ROOTWRAP=/etc/nova/rootwrap.d
    FILTER=${ROOTWRAP}/compute-extend.filters
    mkdir -p ${ROOTWRAP}
    PERM=$(stat --format=%a ${ROOTWRAP})
    chmod 755 ${ROOTWRAP}
    : > ${FILTER}
    echo "# nova-rootwrap command filters for compute nodes" >> ${FILTER}
    echo "# This file should be owned by (and only-writeable by) the root user" >> ${FILTER}
    echo "[Filters]" >> ${FILTER}
    echo "cat: CommandFilter, cat, root" >> ${FILTER}
    echo "taskset: CommandFilter, taskset, root" >> ${FILTER}
    chmod ${PERM} ${ROOTWRAP}

    # Minimally need 1GB for compute in VirtualBox
    declare -i compute_min_MB=1600
    declare -i compute_min_non0_MB=500

    # Minimally need 6GB for controller in VirtualBox
    declare -i controller_min_MB=6000

    # Some constants
    local -i Ki=1024
    local -i Ki2=512
    local -i SZ_4K_Ki=4
    local -i SZ_2M_Ki=2048
    local -i SZ_1G_Ki=1048576

    # Declare memory page sizes
    declare -A pgsizes
    pgsizes[${SZ_4K_Ki}]='4K'
    pgsizes[${SZ_2M_Ki}]='2M'
    pgsizes[${SZ_1G_Ki}]='1G'

    # Declare per-numa memory storage
    declare -A do_manual
    declare -A tot_memory
    declare -A base_memory
    declare -A vs_pages
    declare -A vm_pages
    declare -A max_vm_pages
    for ((node=0; node < N_NUMA; node++))
    do
        do_manual[$node]=0
        tot_memory[$node]=0
        base_memory[$node]=0
        for pgsize in "${!pgsizes[@]}"
        do
            vm_pages[${node},${pgsize}]=0
            max_vm_pages[${node},${pgsize}]=0
            vs_pages[${node},${pgsize}]=0
        done
    done

    # Track vswitch hugepages. Note that COMPUTE_VSWITCH_MEMORY is defined in
    # /etc/nova/compute_reserved.conf .
    for MEMDESC in ${COMPUTE_VSWITCH_MEMORY[@]}
    do
        ARRAY=(${MEMDESC//:/ })
        if [ ${#ARRAY[@]} -ne 3 ]; then
            log_error "Invalid element format ${MEMDESC}, expecting 'node:pgsize:pgcount'"
            return 1
        fi
        node=${ARRAY[0]#node}
        pgsize=${ARRAY[1]%kB}
        pgcount=${ARRAY[2]}
        if [ ${node} -ge ${N_NUMA} ]; then
            continue
        fi
        HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB
        if [ ! -d ${HTLBSYSFS} ]; then
            log_debug "SKIP: No HugeTLB support for ${pgsize}kB pages on node${node}"
            continue
        fi

        # Keep track of vswitch pages (we'll add them back in later)
        vs_pages[${node},${pgsize}]=$[ ${vs_pages[${node},${pgsize}]} + $pgcount ]
    done

    # Track total VM memory. Note that COMPUTE_VM_MEMORY_2M and
    # COMPUTE_VM_MEMORY_1G is defined in /etc/nova/compute_reserved.conf .
    for MEMDESC in ${COMPUTE_VM_MEMORY_2M[@]} ${COMPUTE_VM_MEMORY_1G[@]}
    do
        ARRAY=(${MEMDESC//:/ })
        if [ ${#ARRAY[@]} -ne 3 ]; then
            log_debug "Invalid element format ${MEMDESC}, expecting 'node:pgsize:pgcount'"
            break
        fi
        node=${ARRAY[0]#node}
        pgsize=${ARRAY[1]%kB}
        pgcount=${ARRAY[2]}
        if [ ${node} -ge ${N_NUMA} ]; then
            continue
        fi
        HTLBSYSFS=${NODESYSFS}/node${node}/hugepages/hugepages-${pgsize}kB
        if [ ! -d ${HTLBSYSFS} ]; then
            log_debug "SKIP: No HugeTLB support for ${pgsize}kB pages on node${node}"
            continue
        fi

        # Cumulate total VM memory
        do_manual[${node}]=1
        vm_pages[${node},${pgsize}]=$[ ${vm_pages[${node},${pgsize}]} + $pgcount ]
    done

    # Track base reserved cores and memory. Note that COMPUTE_BASE_RESERVED is
    # defined in /etc/nova/compute_reserved.conf .
    for MEMDESC in ${COMPUTE_BASE_RESERVED[@]}
    do
        ARRAY=(${MEMDESC//:/ })
        if [ ${#ARRAY[@]} -ne 3 ]; then
            log_error "Invalid element format ${MEMDESC}, expecting 'node:memory:cores'"
            return 1
        fi
        local -i node=${ARRAY[0]#node}
        local -i memory=${ARRAY[1]%MB}
        local -i cores=${ARRAY[2]}

        # On small systems, clip memory overhead to more reasonable minimal
        # settings in the case sysinv hasn't set run yet.
        INIT_MiB=$[ (${memnode[${node},'MemFreeInit']} + ${Ki2}) / ${Ki} ]
        MEMFREE=$[ ${INIT_MiB} - ${memory} ]
        if [ ${MEMFREE} -lt 1000 ]; then
            if [ ${node} -eq 0 ]; then
                memory=${compute_min_MB}
                if [ "$nodetype" = "controller" ]; then
                    ((memory += controller_min_MB))
                fi
            else
                memory=${compute_min_non0_MB}
            fi
        fi

        base_memory[$node]=$memory
    done

    # Declare array to store hugepage allocation info
    declare -a HUGE_MEMORY
    declare -a VM_MEMORY_2M
    declare -a VM_MEMORY_1G
    HUGE_MEMORY=()
    VM_MEMORY_2M=()
    VM_MEMORY_1G=()

    # Calculate memory breakdown for this numa node
    for ((node=0; node < N_NUMA; node++))
    do
        # Top-down memory calculation:
        #   NODE_TOTAL_MiB = MemFreeInit
        if [ -f /sys/devices/system/node/node${node}/meminfo_extra ]; then
            NODE_TOTAL_INIT_MiB=$(grep MemFreeInit \
                /sys/devices/system/node/node${node}/meminfo_extra | \
                awk '{printf "%d", ($4+512)/1024;}')
        else
            NODE_TOTAL_INIT_MiB=$(grep MemTotal \
                /sys/devices/system/node/node${node}/meminfo | \
                awk '{printf "%d", ($4+512)/1024;}')
        fi

        # Bottom-up memory calculation (total hugepages + usable linux mem)
        #   NODE_TOTAL_MiB = HTOT + (AVAIL + PSS)
        HTOT_MiB=$[ (${memnode[${node},'HTot']} + ${Ki2}) / ${Ki} ]
        AVAIL_MiB=$[ (${memnode[${node},'Avail']} + ${Ki2}) / ${Ki} ]
        if [ $node -eq 0 ]; then
            # Assume calling this when VMs not launched, so assume numa 0
            PSS_MiB=$(cat /proc/*/smaps 2>/dev/null | \
                      awk '/^Pss:/ {a += $2;} END {printf "%d\n", a/1024.0;}')
        else
            PSS_MiB=0
        fi
        NODE_TOTAL_MiB=$[ ${HTOT_MiB} + ${AVAIL_MiB} + ${PSS_MiB} ]
        tot_memory[${node}]=${NODE_TOTAL_MiB}

        # Engineered amount of memory for vswitch plus VMs.
        ENG_MiB=$[ ${NODE_TOTAL_MiB} - ${base_memory[$node]} ]
        if [ ${ENG_MiB} -lt 0 ]; then
            ENG_MiB=0
        fi

        # Amount of memory left for VMs
        VM_MiB=$[ ${ENG_MiB}
                  - ${SZ_2M_Ki} * ${vs_pages[$node,${SZ_2M_Ki}]} / ${Ki}
                  - ${SZ_1G_Ki} * ${vs_pages[$node,${SZ_1G_Ki}]} / ${Ki} ]

        # Prevent allocating hugepages if host is too small
        if [ ${do_huge} -eq 0 -o $VM_MiB -le 16 ]
        then
            VM_MiB=0
            log_error "insufficient memory on node $node to allocate hugepages"
        fi

        # Maximize use of 2M pages if not using pre-determined 2M and 1G pages.
        if [ ${do_manual[${node}]} -ne 1 ]; then
            vm_pages[${node},${SZ_2M_Ki}]=$[ ${Ki} * ${VM_MiB} / ${SZ_2M_Ki} / 16 * 16 ]
        fi

        # Calculate remaining memory as 4K pages
        vm_pages[${node},${SZ_4K_Ki}]=$[ (${Ki} * ${VM_MiB}
            - ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]}
            - ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]}) / ${SZ_4K_Ki} ]
        min_4K=$[ 32 * ${Ki} / ${SZ_4K_Ki} ]
        if [ ${vm_pages[${node},${SZ_4K_Ki}]} -lt ${min_4K} ]; then
            vm_pages[${node},${SZ_4K_Ki}]=0
        fi

        # Sanity check
        # The memory pages specifed in the $RESERVE_CONF file should not
        # exceed the available memory in the system.  Validate the values by
        # calculating the memory required for specified pages, and comparing
        # with available memory.
        #
        # We will override configured pages if the specified values are out of
        # range.  Note that we do not expect this to happen (unless a DIMM
        # fails, or some other error) as we check available pages before
        # allowing user to change allocated pages.
        local requested_VM_MiB=$[
            ${SZ_4K_Ki} * ${vm_pages[${node},${SZ_4K_Ki}]} / ${Ki}
            + ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} / ${Ki}
            + ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]} / ${Ki} ]

        if [ ${requested_VM_MiB} -gt ${VM_MiB} ]; then

            # We're over comitted - clamp memory usage to actual available
            # memory.  In addition to the log files, we also want to output
            # to console
            log_error "Over-commited VM memory: " \
                "Requested ${requested_VM_MiB} MiB through ${RESERVE_CONF} " \
                "but ${VM_MiB} MiB available."

            # Reduce 1G pages to the max number that will fit (leave 1G pages
            # unchanged if it's already small enough)
            if [ $[ ${VM_MiB} * ${Ki} / ${SZ_1G_Ki} ] -lt \
                ${vm_pages[${node},${SZ_1G_Ki}]} ]; then
                vm_pages[${node},${SZ_1G_Ki}]=$[ ${VM_MiB} * ${Ki} / ${SZ_1G_Ki} ]
            fi

            # Calculate the 2M pages based on amount of memory left over after
            # 1G pages accounted for
            vm_pages[${node},${SZ_2M_Ki}]=$[ (${Ki} * ${VM_MiB}
                - ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]})
                / ${SZ_2M_Ki} / 16 * 16 ]

            # Anything left over is 4K pages
            vm_pages[${node},${SZ_4K_Ki}]=$[ (${Ki} * ${VM_MiB}
                - ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]}
               - ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]}) / ${SZ_4K_Ki} ]

            if [ ${vm_pages[${node},${SZ_4K_Ki}]} -lt ${min_4K} ]; then
                vm_pages[${node},${SZ_4K_Ki}]=0
            fi

            requested_VM_MiB=$[
                ${SZ_4K_Ki} * ${vm_pages[${node},${SZ_4K_Ki}]} / ${Ki}
                + ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} / ${Ki}
                + ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]} / ${Ki} ]
            log_error "VM memory reduced to ${requested_VM_MiB} MiB " \
                "using ${vm_pages[${node},${SZ_1G_Ki}]} 1G pages and " \
                "${vm_pages[${node},${SZ_2M_Ki}]} 2M pages"
        fi

        # Calculate total hugepages to be allocated.  Setting HUGE_MEMORY will
        # reset nr_hugepages. Always set values even if 0.
        if grep -q pdpe1gb /proc/cpuinfo
        then
            pages_1G=$[ ${vm_pages[${node},${SZ_1G_Ki}]} + ${vs_pages[${node},${SZ_1G_Ki}]} ]
            HUGE_MEMORY+=("node${node}:${SZ_1G_Ki}kB:${pages_1G}")
            pages_1G=$[ ${vm_pages[${node},${SZ_1G_Ki}]} ]
            VM_MEMORY_1G+=("node${node}:${SZ_1G_Ki}kB:${pages_1G}")
        fi
        pages_2M=$[ ${vm_pages[${node},${SZ_2M_Ki}]} + ${vs_pages[${node},${SZ_2M_Ki}]} ]
        HUGE_MEMORY+=("node${node}:${SZ_2M_Ki}kB:${pages_2M}")
        pages_2M=$[ ${vm_pages[${node},${SZ_2M_Ki}]} ]
        VM_MEMORY_2M+=("node${node}:${SZ_2M_Ki}kB:${pages_2M}")

        # Calculate maximum possible VM pages of a given pagesize
        max_vm_pages[${node},${SZ_2M_Ki}]=$[ ${Ki} * ${VM_MiB} / ${SZ_2M_Ki} / 16 * 16 ]
        max_vm_pages[${node},${SZ_1G_Ki}]=$[ ${Ki} * ${VM_MiB} / ${SZ_1G_Ki} ]

        # Calculate a few things to print out
        max_2M=${max_vm_pages[${node},${SZ_2M_Ki}]}
        max_1G=${max_vm_pages[${node},${SZ_1G_Ki}]}
        vm_4K_MiB=$[ ${SZ_4K_Ki} * ${vm_pages[${node},${SZ_4K_Ki}]} / ${Ki} ]
        vm_2M_MiB=$[ ${SZ_2M_Ki} * ${vm_pages[${node},${SZ_2M_Ki}]} / ${Ki} ]
        vm_1G_MiB=$[ ${SZ_1G_Ki} * ${vm_pages[${node},${SZ_1G_Ki}]} / ${Ki} ]
        vs_2M_MiB=$[ ${SZ_2M_Ki} * ${vs_pages[${node},${SZ_2M_Ki}]} / ${Ki} ]
        vs_1G_MiB=$[ ${SZ_1G_Ki} * ${vs_pages[${node},${SZ_1G_Ki}]} / ${Ki} ]
        log_debug "Memory: node:${node}, TOTAL:${NODE_TOTAL_MiB} MiB," \
            "INIT:${NODE_TOTAL_INIT_MiB} MiB," \
            "AVAIL:${AVAIL_MiB} MiB, PSS:${PSS_MiB} MiB," \
            "HTOT:${HTOT_MiB} MiB"
        log_debug "Memory: node:${node}," \
            "ENG:${ENG_MiB} MiB, VM:${VM_MiB} MiB," \
            "4K:${vm_4K_MiB} MiB, 2M:${vm_2M_MiB} MiB, 1G:${vm_1G_MiB} MiB," \
            "manual-set:${do_manual[$node]}"
        log_debug "Memory: node:${node}," \
            "max: 2M:${max_2M} pages, 1G:${max_1G} pages"
        log_debug "Memory: node:${node}," \
            "vswitch: 2M:${vs_2M_MiB} MiB, 1G:${vs_1G_MiB} MiB;" \
            "BASE:${base_memory[$node]} MiB reserved"
    done

    # Summarize overall lists and hugetlb
    log_debug "compute_hugetlb: ${HUGE_MEMORY[@]}"

    # Write out maximum possible hugepages of each type and total memory
    max_2M=""; max_1G=""; tot_MiB=""
    for ((node=0; node < N_NUMA; node++))
    do
        max_2M=$(append_list ${max_vm_pages[${node},${SZ_2M_Ki}]} ${max_2M})
        max_1G=$(append_list ${max_vm_pages[${node},${SZ_1G_Ki}]} ${max_1G})
        tot_MiB=$(append_list ${tot_memory[${node}]} ${tot_MiB})
    done
    CONF=/etc/nova/compute_hugepages_total.conf
    echo "# Compute total possible hugepages to allocate (generated: do not modify)" > ${CONF}
    echo "compute_hp_total_2M=${max_2M}" >> ${CONF}
    echo "compute_hp_total_1G=${max_1G}" >> ${CONF}
    echo "compute_total_MiB=${tot_MiB}" >> ${CONF}
    echo "" >> ${CONF}

    # Write out extended nova compute options; used with nova accounting.
    CONF=/etc/nova/compute_extend.conf
    echo "# Compute extended nova options (generated: do not modify)" > ${CONF}

    # memory allocations of each type
    vs_2M=""; vs_1G=""; vm_4K=""; vm_2M=""; vm_1G=""
    for ((node=0; node < N_NUMA; node++))
    do
        vs_2M=$(append_list ${vs_pages[${node},${SZ_2M_Ki}]} ${vs_2M})
        vs_1G=$(append_list ${vs_pages[${node},${SZ_1G_Ki}]} ${vs_1G})
        vm_4K=$(append_list ${vm_pages[${node},${SZ_4K_Ki}]} ${vm_4K})
        vm_2M=$(append_list ${vm_pages[${node},${SZ_2M_Ki}]} ${vm_2M})
        vm_1G=$(append_list ${vm_pages[${node},${SZ_1G_Ki}]} ${vm_1G})
    done
    echo "# memory options" >> ${CONF}
    echo "compute_vswitch_2M_pages=${vs_2M}" >> ${CONF}
    echo "compute_vswitch_1G_pages=${vs_1G}" >> ${CONF}
    echo "compute_vm_4K_pages=${vm_4K}" >> ${CONF}
    echo "compute_vm_2M_pages=${vm_2M}" >> ${CONF}
    echo "compute_vm_1G_pages=${vm_1G}" >> ${CONF}
    echo "" >> ${CONF}

    # Allocate hugepages of each pgsize for each NUMA node
    if [ ${do_huge} -eq 1 ]; then
        allocate_hugetlb_memory HUGE_MEMORY[@]

        # Write out current hugepages to configuration file,
        # keeping each individual array element quoted.
        q=(); for e in "${VM_MEMORY_2M[@]}"; do q+="\"${e}\" "; done
        r="${q[@]}"; r="${r%"${r##*[![:space:]]}"}"
        sed -i "s#^COMPUTE_VM_MEMORY_2M=.*\$#COMPUTE_VM_MEMORY_2M=\($r\)#" ${RESERVE_CONF}

        q=(); for e in "${VM_MEMORY_1G[@]}"; do q+="\"${e}\" "; done
        r="${q[@]}"; r="${r%"${r##*[![:space:]]}"}"
        sed -i "s#^COMPUTE_VM_MEMORY_1G=.*\$#COMPUTE_VM_MEMORY_1G=\($r\)#" ${RESERVE_CONF}
    fi
}

################################################################################
# Start/Setup all Compute node resources
# - Enabled a performance boost by mounting HugeTLBFS.
#   This reduces TLB entries, hence reduces processor cache-thrash.
# - Allocates aggregate nr_hugepages per NUMA node.
# - Mounts cgroups .
#
################################################################################
function start_compute() {
    local RET=0
    log_debug "start_compute"

    # Flush page cache
    sync; echo 3 > /proc/sys/vm/drop_caches

    # Determine cpu topology
    get_topology

    # Determine memory breakdown
    get_memory

    check_configuration
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to check configuration, rc=${RET}"
        return ${RET}
    fi

    # Mount HugeTLBFS for vswitch and libvirt
    mount_hugetlbfs_auto
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to auto mount HugeTLB filesystem(s), rc=${RET}"
        return ${RET}
    fi

    # Check that 2MB hugepages are available for libvirt
    MOUNT=/mnt/huge-2048kB
    mountpoint -q $MOUNT
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to mount 2048kB HugeTLB pages for libvirt, rc=${RET}, disabling huge"
        do_huge=0
    fi

    # Calculate aggregate hugepage memory requirements for vswitch + libvirt.
    # Set nr_hugepages per NUMA node.
    per_numa_resources
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to allocate sufficient resources, rc=${RET}"
        return ${RET}
    fi

    # Mount cgroups to take advantage of per domain accounting.
    if [ ${do_cgroups} -eq 1 ]; then
        mount_cgroups
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to mount cgroups, rc=${RET}"
            return ${RET}
        fi
    fi

    # Mount resctrl to allow Cache Allocation Technology per VM
    RESCTRL=/sys/fs/resctrl
    if [ -d $RESCTRL ]; then
        mount_resctrl
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to mount resctrl, rc=${RET}"
            return ${RET}
        fi
    fi

    # Set Power Management QoS resume latency constraints for all CPUs.
    set_pmqos_policy
    RET=$?
    if [ ${RET} -ne 0 ]; then
        log_error "Failed to set Power Management QoS policy, rc=${RET}"
        return ${RET}
    fi

    # Disable IRQ balance service
    IRQBALANCED=/etc/init.d/irqbalanced
    if [ -x ${IRQBALANCED} ]; then
        ${IRQBALANCED} stop &> /dev/null
        RET=$?
        if [ ${RET} -ne 0 ]; then
            log_error "Failed to stop IRQ balance service, rc=${RET}"
            return ${RET}
        fi
    fi

    return ${RET}
}

################################################################################
# Start Action
################################################################################
function start() {
    local RET=0
    echo -n "Starting ${scriptname}: "

    # COMPUTE Node related setup
    if [ -x /etc/init.d/nova-compute ]
    then
        start_compute
        RET=$?
    fi

    print_status ${RET}
    return ${RET}
}

################################################################################
# Stop Action
################################################################################
function stop
{
    local RET=0
    echo -n "Stopping ${scriptname}: "

    force_grub_update
    RET=$?

    print_status ${RET}
    return ${RET}
}


################################################################################
# Restart Action
################################################################################
function restart() {
    stop
    start
}

################################################################################
# Main Entry
#
################################################################################
case "$1" in
start)
    start
    ;;
stop)
    stop
    ;;
restart|reload)
    is_reconfig=1
    restart
    ;;
status)
    echo -n "OK"
    ;;
*)
    echo $"Usage: $0 {start|stop|restart|reload|status}"
    exit 1
esac

exit $?