Added list of mlock-using processes to peakmem_tracker output
This change makes peakmem_tracker list the processes that lock memory pages, preventing them from being swapped to disk. This can help when debugging oom-killer job failures in the gate, in cases where dstat shows that swap was not fully used when the oom-killer was triggered. The peakmem_tracker service was renamed to memory_tracker to reflect its new, broader scope. Needed-By: I5862d92478397eac2e61b8a61ce3437b698678be Change-Id: I1dca120448ee87930fe903fd81277b58efaefc92
This commit is contained in:
parent
23d03b697f
commit
2b4735f1b3
14
lib/dstat
14
lib/dstat
@ -21,16 +21,22 @@ function start_dstat {
|
||||
# A better kind of sysstat, with the top process per time slice
|
||||
run_process dstat "$TOP_DIR/tools/dstat.sh $LOGDIR"
|
||||
|
||||
# To enable peakmem_tracker add:
|
||||
# enable_service peakmem_tracker
|
||||
# To enable memory_tracker add:
|
||||
# enable_service memory_tracker
|
||||
# to your localrc
|
||||
run_process peakmem_tracker "$TOP_DIR/tools/peakmem_tracker.sh"
|
||||
run_process memory_tracker "$TOP_DIR/tools/memory_tracker.sh"
|
||||
|
||||
# remove support for the old name when it's no longer used (sometime in Queens)
|
||||
if is_service_enabled peakmem_tracker; then
|
||||
deprecated "Use of peakmem_tracker in devstack is deprecated, use memory_tracker instead"
|
||||
run_process peakmem_tracker "$TOP_DIR/tools/memory_tracker.sh"
|
||||
fi
|
||||
}
|
||||
|
||||
# stop_dstat() stop dstat process
|
||||
function stop_dstat {
    # Stop dstat itself plus the memory tracker under both its legacy
    # (peakmem_tracker) and current (memory_tracker) service names.
    local service
    for service in dstat peakmem_tracker memory_tracker; do
        stop_process $service
    done
}
|
||||
|
||||
# Restore xtrace
|
||||
|
@ -21,11 +21,15 @@ SLEEP_TIME=20
|
||||
# around reclaimable memory. However, it is not available until 3.14
|
||||
# kernel (i.e. Ubuntu LTS Trusty misses it). In that case, we fall
|
||||
# back to free+buffers+cache as the available memory.
|
||||
# Default to the free+buffers+cache fallback; switch to the kernel's own
# MemAvailable figure when /proc/meminfo provides one.  (The previous
# misspelled USE_MEM_AVAILBLE assignment was never read and is dropped.)
USE_MEM_AVAILABLE=0
if grep -q '^MemAvailable:' /proc/meminfo; then
    USE_MEM_AVAILABLE=1
fi
|
||||
|
||||
# get_mem_unevictable() - print the second field of the "Unevictable:"
# line of /proc/meminfo (the numeric value, without its unit suffix).
function get_mem_unevictable {
    local meminfo=/proc/meminfo
    awk '/^Unevictable:/ {print $2}' "$meminfo"
}
|
||||
|
||||
function get_mem_available {
|
||||
if [[ $USE_MEM_AVAILABLE -eq 1 ]]; then
|
||||
awk '/^MemAvailable:/ {print $2}' /proc/meminfo
|
||||
@ -37,40 +41,56 @@ function get_mem_available {
|
||||
fi
|
||||
}
|
||||
|
||||
# whenever we see less memory available than last time, dump the
|
||||
# snapshot of current usage; i.e. checking the latest entry in the
|
||||
# file will give the peak-memory usage
|
||||
# tracker() - poll memory statistics every $SLEEP_TIME seconds, forever.
#
# A "[[[ ... ]]]" record is written to stdout whenever available memory
# drops below the lowest value seen so far, or the amount of unevictable
# memory differs from the value that was last reported.
function tracker {
    local low_point
    local unevictable_point
    low_point=$(get_mem_available)
    # start from 0 so that any memory already locked at startup is
    # reported on the first iteration
    unevictable_point=0
    while true; do

        local mem_available
        mem_available=$(get_mem_available)

        local unevictable
        unevictable=$(get_mem_unevictable)

        if [[ $mem_available -lt $low_point || $unevictable -ne $unevictable_point ]]; then
            echo "[[["
            date

            # whenever we see less memory available than last time, dump the
            # snapshot of current usage; i.e. checking the latest entry in the
            # file will give the peak-memory usage
            if [[ $mem_available -lt $low_point ]]; then
                low_point=$mem_available
                echo "---"
                # always available greppable output; given difference in
                # meminfo output as described above...
                echo "memory_tracker low_point: $mem_available"
                echo "---"
                cat /proc/meminfo
                echo "---"
                # would hierarchical view be more useful (-H)?  output is
                # not sorted by usage then, however, and the first
                # question is "what's using up the memory"
                #
                # there are a lot of kernel threads, especially on a 8-cpu
                # system.  do a best-effort removal to improve
                # signal/noise ratio of output.
                ps --sort=-pmem -eo pid:10,pmem:6,rss:15,ppid:10,cputime:10,nlwp:8,wchan:25,args:100 |
                    grep -v ']$'
            fi
            echo "---"

            # list processes that lock memory from swap
            if [[ $unevictable -ne $unevictable_point ]]; then
                unevictable_point=$unevictable
                sudo ./tools/mlock_report.py
            fi

            echo "]]]"
        fi

        sleep $SLEEP_TIME
    done
}
|
59
tools/mlock_report.py
Executable file
59
tools/mlock_report.py
Executable file
@ -0,0 +1,59 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
# This tool lists processes that lock memory pages from swapping to disk.
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
import psutil
|
||||
|
||||
|
||||
SUMMARY_REGEX = re.compile(r".*\s+(?P<locked>[\d]+)\s+KB")
|
||||
|
||||
|
||||
def main():
    """Print the mlock report, or a one-line failure message.

    Any exception raised while building the report is caught here so the
    script reports the problem on stdout instead of dying with a traceback.
    """
    # Single-argument print(...) calls behave the same on Python 2 and 3,
    # whereas the print statement is a SyntaxError on Python 3.
    try:
        print(_get_report())
    except Exception as e:
        print("Failure listing processes locking memory: %s" % str(e))
|
||||
|
||||
|
||||
def _get_report():
    """Return a one-line summary of the processes that lock memory.

    Every process reported by psutil is inspected with ``pmap -XX`` and the
    last line of that output is matched against SUMMARY_REGEX to extract the
    locked amount in KB.  Processes with a non-zero amount are listed,
    heaviest first, as ``[name (pid:N)]=<locked>KB`` entries joined by
    ``"; "``.  If no process qualifies, ``"no locked memory"`` is returned.

    Raises:
        subprocess.CalledProcessError: if pmap fails with any exit status
            other than 42.
    """
    mlock_users = []
    for proc in psutil.process_iter():
        pid = proc.pid
        # sadly psutil does not expose locked pages info, that's why we
        # call to pmap and parse the output here
        try:
            # universal_newlines=True yields text on both Python 2 and 3;
            # without it Python 3 returns bytes, which the str-pattern
            # SUMMARY_REGEX cannot match against
            out = subprocess.check_output(['pmap', '-XX', str(pid)],
                                          universal_newlines=True)
        except subprocess.CalledProcessError as e:
            # 42 means process just vanished, which is ok
            if e.returncode == 42:
                continue
            raise

        lines = out.splitlines()
        if not lines:
            # nothing to parse; indexing [-1] would raise IndexError
            continue

        # some processes don't provide a memory map, for example those
        # running as kernel services, so we need to skip those that don't
        # match
        result = SUMMARY_REGEX.match(lines[-1])
        if not result:
            continue

        locked = int(result.group('locked'))
        if not locked:
            continue

        try:
            name = proc.name()
        except psutil.NoSuchProcess:
            # the process exited after pmap ran; nothing left to report
            continue

        mlock_users.append({'name': name,
                            'pid': pid,
                            'locked': locked})

    if not mlock_users:
        return "no locked memory"

    # log heavy users first
    mlock_users.sort(key=lambda d: d['locked'], reverse=True)

    # produce a single line log message with per process mlock stats
    return "; ".join(
        "[%(name)s (pid:%(pid)s)]=%(locked)dKB" % args
        for args in mlock_users
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue
Block a user