You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
275 lines
9.3 KiB
275 lines
9.3 KiB
From 3ce2235abbd28cefcef348ad7895e2529810ad7c Mon Sep 17 00:00:00 2001 |
|
From: Xunlei Pang <xlpang@redhat.com> |
|
Date: Thu, 3 Nov 2016 19:30:41 +0800 |
|
Subject: [PATCH] 99base: add memtrace-ko.sh to debug kernel module large |
|
memory consumption |
|
|
|
The current method for memory debug is to use "rd.memdebug=[0-3]", |
|
it is not enough for debugging kernel modules. For example, when we |
|
want to find out which kernel module consumes a large amount of memory, |
|
"rd.memdebug=[0-3]" won't help too much. |
|
|
|
A better way is needed to achieve this requirement, this is useful for |
|
kdump OOM debugging. |
|
|
|
The principle of this patch is to use kernel trace to track slab and |
|
buddy allocation calls during kernel module loading(module_init), thus |
|
we can analyze all the trace data and get the total memory consumption. |
|
As for large slab allocation, it will probably fall into buddy allocation, |
|
thus tracing "mm_page_alloc" alone should be enough for the purpose(this |
|
saves quite some trace buffer memory, also large free is quite unlikey |
|
during module loading, we neglect those memory free events). |
|
|
|
The trace events include memory calls under "tracing/events/": |
|
kmem/mm_page_alloc |
|
|
|
We also inpect the following events to detect the module loading: |
|
module/module_load |
|
module/module_put |
|
|
|
Since we use filters to trace events, the final trace data size won't |
|
be too big. Users can adjust the trace buffer size via "trace_buf_size" |
|
kernel boot command line as needed. |
|
|
|
We can get the module name and task pid from "module_load" event which |
|
also mark the beginning of the loading, and module_put called by the |
|
same task pid implies the end of the loading. So the memory events |
|
recorded in between by the same task pid are consumed by this module |
|
during loading(i.e. modprobe or module_init()). |
|
|
|
With these information, we can record the rough total memory(the larger, |
|
the more precise the result will be) consumption involved by each kernel |
|
module loading. |
|
|
|
Thus we introduce this shell script to find out which kernel module |
|
consumes a large amount of memory during loading. Use "rd.memdebug=4" |
|
as the tigger. |
|
|
|
After applying this patch and specifying "rd.memdebug=4", during booting |
|
it will print out something extra like below: |
|
0 pages consumed by "pata_acpi" |
|
0 pages consumed by "ata_generic" |
|
1 pages consumed by "drm" |
|
0 pages consumed by "ttm" |
|
0 pages consumed by "drm_kms_helper" |
|
835 pages consumed by "qxl" |
|
0 pages consumed by "mii" |
|
6 pages consumed by "8139cp" |
|
0 pages consumed by "virtio" |
|
0 pages consumed by "virtio_ring" |
|
9 pages consumed by "virtio_pci" |
|
1 pages consumed by "8139too" |
|
0 pages consumed by "serio_raw" |
|
0 pages consumed by "crc32c_intel" |
|
199 pages consumed by "virtio_console" |
|
0 pages consumed by "libcrc32c" |
|
9 pages consumed by "xfs" |
|
|
|
From the print, we see clearly that "qxl" consumed the most memory. |
|
|
|
This file will be installed as a separate executable named "tracekomem" |
|
in the following patch. |
|
|
|
Signed-off-by: Xunlei Pang <xlpang@redhat.com> |
|
--- |
|
modules.d/99base/memtrace-ko.sh | 191 ++++++++++++++++++++++++++++++++ |
|
1 file changed, 191 insertions(+) |
|
create mode 100755 modules.d/99base/memtrace-ko.sh |
|
|
|
diff --git a/modules.d/99base/memtrace-ko.sh b/modules.d/99base/memtrace-ko.sh |
|
new file mode 100755 |
|
index 00000000..ee035e15 |
|
--- /dev/null |
|
+++ b/modules.d/99base/memtrace-ko.sh |
|
@@ -0,0 +1,191 @@ |
|
+#!/bin/sh |
|
+ |
|
+# Try to find out kernel modules with large total memory allocation during loading. |
|
+# For large slab allocation, it will fall into buddy, also not trace "mm_page_free" |
|
+# considering large free is quite rare for module_init, thus saving tons of events |
|
+# to avoid trace data overwritten. |
|
+# |
|
+# Therefore, tracing "mm_page_alloc"alone should be enough for the purpose. |
|
+ |
|
+# "sys/kernel/tracing" has the priority if exists. |
|
+get_trace_base() { |
|
+ # trace access through debugfs would be obsolete if "/sys/kernel/tracing" is available. |
|
+ if [ -d "/sys/kernel/tracing" ]; then |
|
+ echo "/sys/kernel" |
|
+ else |
|
+ echo "/sys/kernel/debug" |
|
+ fi |
|
+} |
|
+ |
|
+# We want to enable these trace events. |
|
+get_want_events() { |
|
+ echo "module:module_put module:module_load kmem:mm_page_alloc" |
|
+} |
|
+ |
|
+get_event_filter() { |
|
+ echo "comm == systemd-udevd || comm == modprobe || comm == insmod" |
|
+} |
|
+ |
|
+is_trace_ready() { |
|
+ local trace_base want_events current_events |
|
+ |
|
+ trace_base=$(get_trace_base) |
|
+ ! [ -f "$trace_base/tracing/trace" ] && return 1 |
|
+ |
|
+ [ "$(cat $trace_base/tracing/tracing_on)" -eq 0 ] && return 1 |
|
+ |
|
+ # Also check if trace events were properly setup. |
|
+ want_events=$(get_want_events) |
|
+ current_events=$(echo $(cat $trace_base/tracing/set_event)) |
|
+ [ "$current_events" != "$want_events" ] && return 1 |
|
+ |
|
+ return 0 |
|
+} |
|
+ |
|
+prepare_trace() { |
|
+ local trace_base |
|
+ |
|
+ trace_base=$(get_trace_base) |
|
+ # old debugfs interface case. |
|
+ if ! [ -d "$trace_base/tracing" ]; then |
|
+ mount none -t debugfs $trace_base |
|
+ # new tracefs interface case. |
|
+ elif ! [ -f "$trace_base/tracing/trace" ]; then |
|
+ mount none -t tracefs "$trace_base/tracing" |
|
+ fi |
|
+ |
|
+ if ! [ -f "$trace_base/tracing/trace" ]; then |
|
+ echo "WARN: Mount trace failed for kernel module memory analyzing." |
|
+ return 1 |
|
+ fi |
|
+ |
|
+ # Active all the wanted trace events. |
|
+ echo "$(get_want_events)" > $trace_base/tracing/set_event |
|
+ |
|
+ # There are three kinds of known applications for module loading: |
|
+ # "systemd-udevd", "modprobe" and "insmod". |
|
+ # Set them as the global events filter. |
|
+ # NOTE: Some kernel may not support this format of filter, anyway |
|
+ # the operation will fail and it doesn't matter. |
|
+ echo "$(get_event_filter)" > $trace_base/tracing/events/kmem/filter 2>&1 |
|
+ echo "$(get_event_filter)" > $trace_base/tracing/events/module/filter 2>&1 |
|
+ |
|
+ # Set the number of comm-pid if supported. |
|
+ if [ -f "$trace_base/tracing/saved_cmdlines_size" ]; then |
|
+ # Thanks to filters, 4096 is big enough(also well supported). |
|
+ echo 4096 > $trace_base/tracing/saved_cmdlines_size |
|
+ fi |
|
+ |
|
+ # Enable and clear trace data for the first time. |
|
+ echo 1 > $trace_base/tracing/tracing_on |
|
+ echo > $trace_base/tracing/trace |
|
+ echo "Prepare trace success." |
|
+ return 0 |
|
+} |
|
+ |
|
+order_to_pages() |
|
+{ |
|
+ local pages=1 |
|
+ local order=$1 |
|
+ |
|
+ while [ "$order" != 0 ]; do |
|
+ order=$((order-1)) |
|
+ pages=$(($pages*2)) |
|
+ done |
|
+ |
|
+ echo $pages |
|
+} |
|
+ |
|
+parse_trace_data() { |
|
+ local module_name tmp_eval pages |
|
+ |
|
+ cat "$(get_trace_base)/tracing/trace" | while read pid cpu flags ts function args |
|
+ do |
|
+ # Skip comment lines |
|
+ if [ "$pid" = "#" ]; then |
|
+ continue |
|
+ fi |
|
+ |
|
+ pid=${pid##*-} |
|
+ function=${function%:} |
|
+ if [ "$function" = "module_load" ]; then |
|
+ # One module is being loaded, save the task pid for tracking. |
|
+ # Remove the trailing after whitespace, there may be the module flags. |
|
+ module_name=${args%% *} |
|
+ # Mark current_module to track the task. |
|
+ eval current_module_$pid="$module_name" |
|
+ tmp_eval=$(eval echo '${module_loaded_'${module_name}'}') |
|
+ if [ -n "$tmp_eval" ]; then |
|
+ echo "WARN: \"$module_name\" was loaded multiple times!" |
|
+ fi |
|
+ eval unset module_loaded_$module_name |
|
+ eval nr_alloc_pages_$module_name=0 |
|
+ continue |
|
+ fi |
|
+ |
|
+ module_name=$(eval echo '${current_module_'${pid}'}') |
|
+ if [ -z "$module_name" ]; then |
|
+ continue |
|
+ fi |
|
+ |
|
+ # Once we get here, the task is being tracked(is loading a module). |
|
+ if [ "$function" = "module_put" ]; then |
|
+ # Mark the module as loaded when the first module_put event happens after module_load. |
|
+ tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}') |
|
+ echo "$tmp_eval pages consumed by \"$module_name\"" |
|
+ eval module_loaded_$module_name=1 |
|
+ # Module loading finished, so untrack the task. |
|
+ eval unset current_module_$pid |
|
+ eval unset nr_alloc_pages_$module_name |
|
+ continue |
|
+ fi |
|
+ |
|
+ if [ "$function" = "mm_page_alloc" ]; then |
|
+ # Get order first, then convert to actual pages. |
|
+ pages=$(echo $args | sed -e 's/.*order=\([0-9]*\) .*/\1/') |
|
+ pages=$(order_to_pages "$pages") |
|
+ tmp_eval=$(eval echo '${nr_alloc_pages_'${module_name}'}') |
|
+ eval nr_alloc_pages_$module_name="$(($tmp_eval+$pages))" |
|
+ fi |
|
+ done |
|
+} |
|
+ |
|
+cleanup_trace() { |
|
+ local trace_base |
|
+ |
|
+ if is_trace_ready; then |
|
+ trace_base=$(get_trace_base) |
|
+ echo 0 > $trace_base/tracing/tracing_on |
|
+ echo > $trace_base/tracing/trace |
|
+ echo > $trace_base/tracing/set_event |
|
+ echo 0 > $trace_base/tracing/events/kmem/filter |
|
+ echo 0 > $trace_base/tracing/events/module/filter |
|
+ fi |
|
+} |
|
+ |
|
+show_usage() { |
|
+ echo "Find out kernel modules with large memory consumption during loading based on trace." |
|
+ echo "Usage:" |
|
+ echo "1) run it first to setup trace." |
|
+ echo "2) run again to parse the trace data if any." |
|
+ echo "3) run with \"--cleanup\" option to cleanup trace after use." |
|
+} |
|
+ |
|
+if [ "$1" = "--help" ]; then |
|
+ show_usage |
|
+ exit 0 |
|
+fi |
|
+ |
|
+if [ "$1" = "--cleanup" ]; then |
|
+ cleanup_trace |
|
+ exit 0 |
|
+fi |
|
+ |
|
+if is_trace_ready ; then |
|
+ echo "tracekomem - Rough memory consumption by loading kernel modules (larger value with better accuracy)" |
|
+ parse_trace_data |
|
+else |
|
+ prepare_trace |
|
+fi |
|
+ |
|
+exit $?
|
|
|