Ticket #7167: munin-node.6

File munin-node.6, 17.5 KB (added by L.Schmidt <l.openwrt at scarydevilmonastery.net>, 7 years ago)

v1.03 further simplification of memory plugin code

Line 
1#!/bin/sh
2#
3# Simple Bourne Shell script that implements the Munin protocol and
4# some common Linux plugins.
5#
6# For latest version, see http://muninlite.sf.net/
7#
8# Copyright (c) 2007 Rune Nordbøe Skillingstad <rune@skillingstad.no>
9#
10# Licensed under GPLv2 (see LICENSE file for full License)
11#
12# $Id: munin-node,v 1.2 2010/04/30 02:29:54 l Exp $
13#
14
# Release identifier reported by the "version" command.
VERSION="1.0.3"
16# 2010apr21, v1.0.3  L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
17#    tried to work around a problem with memory, which turned to LINE display
18#    while a graph should actually be STACKed AREA. Ended up dropping a few
19#    graphs and simplified the code.
20# 2010apr21, v1.0.2e  L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
21#    graphs the number of clients (interfaces connected to WLAN or switch)
22#    removed unnecessary |wc -l
23# 2010apr16, v1.0.2d, L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
24#    extra df devices can be specified through df_extra variable
25#    memorytypes collapsed to a single line, instead of repeated parse pipes. Much faster
26# 2010apr16, v1.0.2c, L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
27#    factored fetch_mem and config_mem
28#    memory type names in /proc/meminfo updated for recent kernel - was still parsing names
29#       which weren't in use, and missed existing names
30#    removed some unneeded cats
31# 2010apr15, v1.0.2b, L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
32#    added graphs for used/remaining device storage space.
33#    fixed if bug which caused munin_limits to flood admin with error emails
34#    fetch_cpu speedup
35#    netstat if_ if_err_ plugins selection by presence of required netstat and ethtool programs
36#    (busybox version of netstat has no --statistics switch which is used here)
37#    changed many "grep ... | cut ..." against  "awk /.../ {print ..}"  because latter is about 50% faster
38
# QSS LED message file of the TL-WR1043ND. While this script runs the LED
# message is switched to "M"; the previous message is remembered in $oldmsg.
led="/sys/class/leds/tl-wr1043nd:green:qss/message"
oldmsg=""
# Only touch the LED where it exists: on other hardware the unguarded
# cat/echo would emit error messages on every start.
if [ -r "$led" ] && [ -w "$led" ]; then
  oldmsg="$(cat "$led")"
  # printf '%d' "'x" yields the character code of x; codes up to 90 ('Z')
  # include the capital letters, so only such messages are replaced.
  [ "$(printf '%d' "'$oldmsg" 2>/dev/null)" -le 90 ] 2>/dev/null && echo M > "$led"
fi
42
# Locate optional helper programs; each variable stays empty when the tool
# is not installed.
netstat=$(command -v netstat)
ethtool=$(command -v ethtool)

df_extra="tmpfs"      # in addition to the autoprobed devices, graph also these (space delimited list)
                      # add device from first column of mount/df output.

# Remove unwanted plugins from this list
PLUGINS="df_ cpu load memory processes uptime interrupts irqstats clients"

# The netstat plugin does not work with busybox netstat - it needs the
# net-tools version. Assume that a netstat which is executable and not a
# symlink is a proper one, in which case the netstat plugin is added.
if [ -n "$netstat" ] && [ -x "$netstat" ] && [ ! -h "$netstat" ]; then
   PLUGINS="netstat $PLUGINS"
fi

# if_ needs ethtool. The operand must be quoted and checked for emptiness:
# an unquoted empty variable turns the test into "[ -x ]", which is true.
if [ -n "$ethtool" ] && [ -x "$ethtool" ]; then
   PLUGINS="if_err_ if_ $PLUGINS"
fi
63
64# ===== PLUGINS CODE =====
65
# Print the munin "config" output of the cpu plugin.
#
# Arguments: $1 - stat file to inspect (optional, defaults to /proc/stat)
# Outputs:   graph and field definitions on stdout. The iowait, irq and
#            softirq fields are only declared when the aggregate "cpu" line
#            carries at least seven counters.
config_cpu() {
  statfile=${1:-/proc/stat}
  extinfo=$(awk '/^cpu / && NF >= 8 { print "iowait irq softirq"; exit }' "$statfile" 2>/dev/null)
  # Count the per-CPU lines (cpu0, cpu1, ..., cpu10, ...). Fall back to one
  # CPU when the file is unreadable so the arithmetic below stays valid.
  NCPU=$(grep -c '^cpu[0-9]' "$statfile" 2>/dev/null)
  [ "${NCPU:-0}" -ge 1 ] 2>/dev/null || NCPU=1
  # Every CPU contributes 100% to the graph's upper limit; the warning and
  # critical thresholds are percentages of that total.
  PERCENT=$((NCPU * 100))
  graphlimit=$PERCENT
  SYSWARNING=$((PERCENT * 30 / 100))
  SYSCRITICAL=$((PERCENT * 50 / 100))
  USRWARNING=$((PERCENT * 80 / 100))
  cat <<EOF
graph_title CPU usage
graph_order system user nice idle $extinfo
graph_args --base 1000 -r --lower-limit 0 --upper-limit $graphlimit
graph_vlabel %
graph_scale no
graph_info This graph shows how CPU time is spent.
graph_category system
graph_period second
system.label system
system.draw AREA
system.max 5000
system.min 0
system.type DERIVE
system.warning $SYSWARNING
system.critical $SYSCRITICAL
system.info CPU time spent by the kernel in system activities
user.label user
user.draw STACK
user.min 0
user.max 5000
user.warning $USRWARNING
user.type DERIVE
user.info CPU time spent by normal programs and daemons
nice.label nice
nice.draw STACK
nice.min 0
nice.max 5000
nice.type DERIVE
nice.info CPU time spent by nice(1)d programs
idle.label idle
idle.draw STACK
idle.min 0
idle.max 5000
idle.type DERIVE
idle.info Idle CPU time
EOF
  if [ -n "$extinfo" ]; then
    cat <<EOF
iowait.label iowait
iowait.draw STACK
iowait.min 0
iowait.max 5000
iowait.type DERIVE
iowait.info CPU time spent waiting for I/O operations to finish
irq.label irq
irq.draw STACK
irq.min 0
irq.max 5000
irq.type DERIVE
irq.info CPU time spent handling interrupts
softirq.label softirq
softirq.draw STACK
softirq.min 0
softirq.max 5000
softirq.type DERIVE
softirq.info CPU time spent handling "batched" interrupts
EOF
  fi
}
133
# Print the current cpu plugin values.
#
# Arguments: $1 - stat file to read (optional, defaults to /proc/stat)
# Outputs:   user, nice, system and idle counters taken from the aggregate
#            "cpu" line, plus iowait, irq and softirq when that line carries
#            at least seven counters.
fetch_cpu() {
  # One awk pass replaces the former grep/cut loop, which relied on the
  # non-POSIX "let" builtin to build numbered variables.
  awk '/^cpu / {
    print "user.value " $2
    print "nice.value " $3
    print "system.value " $4
    print "idle.value " $5
    if (NF >= 8) {
      print "iowait.value " $6
      print "irq.value " $7
      print "softirq.value " $8
    }
    exit
  }' "${1:-/proc/stat}"
}
154
# Print the munin "config" output for one storage volume.
#
# Arguments: $1 - volume name, used in the graph title and field names
#            $2 - device as listed in the first column of mount/df output
#                 (not used here)
config_df() {
  # All attributes of the "used" field share the storageused_<volume> name;
  # min/max previously used a dot and therefore named a different field.
  cat <<EOF
graph_title $1
graph_vlabel capacity
graph_category system
graph_args --base 1000 -l 0
storageused_$1.type GAUGE
storageused_$1.min U
storageused_$1.max U
storageused_$1.label used $1
storageused_$1.draw AREA
storagefree_$1.label free $1
storagefree_$1.draw STACK
EOF
}
170
# Print used and free space of one storage volume in bytes.
#
# Arguments: $1 - volume name used in the field names
#            $2 - device as listed in the first column of /proc/mounts
# Outputs:   storageused_<name>.value and storagefree_<name>.value; nothing
#            when the device is not mounted.
fetch_df() {
  # Compare the device column exactly: a prefix match would let /dev/sda1
  # pick up the mount point of /dev/sda10.
  mountpoint=$(awk -v dev="$2" '$1 == dev { print $2; exit }' /proc/mounts 2>/dev/null)
  # Without a mount point, df would report every file system.
  [ -n "$mountpoint" ] || return 0
  # -k fixes the unit to 1K blocks and -P keeps each file system on a single
  # line; printf %.0f avoids exponent notation for large byte counts.
  df -k -P "$mountpoint" 2>/dev/null |
    awk -v name="$1" -v dev="$2" '$1 == dev {
      printf "storageused_%s.value %.0f\n", name, $3 * 1024
      printf "storagefree_%s.value %.0f\n", name, $4 * 1024
    }'
}
175
176
# Print the munin "config" output of the traffic graph for one interface.
#
# Arguments: $1 - interface name
# Globals:   ethtool (read) - command used to query the link speed
# Outputs:   graph and field definitions; up.max/down.max are added only
#            when ethtool reports a numeric speed in Mb/s.
config_if() {
  cat <<EOF
graph_order down up
graph_title $1 traffic
graph_args --base 1000
graph_vlabel bits in (-) / out (+) per \${graph_period}
graph_category network
graph_info This graph shows the traffic of the $1 network interface. Please note that the traffic is shown in bits per second, not bytes. IMPORTANT: Since the data source for this plugin use 32bit counters, this plugin is really unreliable and unsuitable for most 100Mb (or faster) interfaces, where bursts are expected to exceed 50Mbps. This means that this plugin is usuitable for most production environments. To avoid this problem, use the ip_ plugin instead.
down.label received
down.type DERIVE
down.min 0
down.graph no
down.cdef down,8,*
up.label bps
up.type DERIVE
up.min 0
up.negative down
up.cdef up,8,*
EOF
  # Without ethtool there is no speed to report (and an empty command word
  # would make the shell execute the interface name instead).
  [ -n "$ethtool" ] || return 0
  # Accept only a numeric speed: "Speed: Unknown!" must not reach the
  # arithmetic expansion below.
  speed=$("$ethtool" "$1" 2>/dev/null |
    awk '/Speed:/ && $2 ~ /^[0-9]+Mb/ { sub(/Mb.*/, "", $2); print $2; exit }')
  if [ -n "$speed" ]; then
    MAX=$((speed * 1000000))
    echo "up.max $MAX"
    echo "down.max $MAX"
  fi
  return 0
}
# Print the received/transmitted byte counters of one interface.
#
# Arguments: $1 - interface name as it appears in /proc/net/dev
# Outputs:   down.value (first counter after the name) and up.value (ninth
#            counter); "U" for both when the interface is not listed.
fetch_if() {
  # The colon after the name is turned into a blank so that the name can be
  # compared exactly - "eth0" must not match "veth0" - and so that a counter
  # glued to the colon still becomes its own field.
  awk -v ifc="$1" '
    { sub(/:/, " ") }
    $1 == ifc { rx = $2; tx = $10; found = 1; exit }
    END {
      print "down.value " (found ? rx : "U")
      print "up.value " (found ? tx : "U")
    }' /proc/net/dev 2>/dev/null
}
# Print the munin "config" output of the error graph for one interface.
#
# Arguments: $1 - interface name
config_if_err() {
  cat <<EOF
graph_order rcvd trans
graph_title $1 errors
graph_args --base 1000
graph_vlabel packets in (-) / out (+) per \${graph_period}
graph_category network
graph_info This graph shows the amount of errors on the $1 network interface.
rcvd.label packets
rcvd.type COUNTER
rcvd.graph no
rcvd.warning 1
trans.label packets
trans.type COUNTER
trans.negative rcvd
trans.warning 1
EOF
}
# Print the receive/transmit error counters of one interface.
#
# Arguments: $1 - interface name as it appears in /proc/net/dev
# Outputs:   rcvd.value (third counter after the name) and trans.value
#            (eleventh counter); "U" for both when the interface is not
#            listed.
fetch_if_err() {
  # Exact comparison of the name column, so that "eth0" does not also match
  # "veth0" as a substring search for "eth0:" would.
  awk -v ifc="$1" '
    { sub(/:/, " ") }
    $1 == ifc { rx = $4; tx = $12; found = 1; exit }
    END {
      print "rcvd.value " (found ? rx : "U")
      print "trans.value " (found ? tx : "U")
    }' /proc/net/dev 2>/dev/null
}
226
# Print the munin "config" output of the load plugin.
config_load() {
  cat <<EOF
graph_title Load average
graph_args --base 1000 -l 0
graph_vlabel load
graph_scale no
graph_category system
graph_info The load average of the machine describes how many processes are in the run-queue (scheduled to run "immediately").
load.label load
load.warning 10
load.critical 120
load.info Average load for the five minutes.
EOF
}
239
# Print the load plugin value: the second field of /proc/loadavg, or "U"
# when the file cannot be read.
fetch_load() {
  load_five=U
  # The read builtin splits the line without spawning cut.
  [ -r /proc/loadavg ] && read -r load_one load_five load_rest < /proc/loadavg
  echo "load.value $load_five"
}
243
244
# Convert a meminfo file into shell assignments, one NAME=value per line.
#
# Arguments: $1 - file to convert (optional, defaults to /proc/meminfo)
# Outputs:   each line with blanks and parentheses removed, the first colon
#            turned into "=" and a trailing "kB" dropped, e.g.
#            "Active(anon):  12 kB" becomes "Activeanon=12".
memorytypes() {
  # Callers eval this output, so only lines that end up as a plain
  # identifier=number assignment are printed.
  sed -n 's/[ ()]//g;s/:/=/;s/kB$//;/^[A-Za-z_][A-Za-z0-9_]*=[0-9][0-9]*$/p' "${1:-/proc/meminfo}"
}
248
# Print the munin "config" output of the memory plugin.
#
# Reads the memory figures by evaluating the assignments printed by
# memorytypes, which sets variables such as MemTotal, PageTables,
# VmallocUsed, Slab, SwapTotal and Committed_AS (values in kB).
config_memory() {
  eval $(memorytypes)
  # graph_order lists the fields actually declared below. The optional ones
  # are tested through the mixed-case names that memorytypes produces; the
  # former upper-case names were never set.
  GRAPH_ORDER="apps"
  [ -z "$PageTables" ] || GRAPH_ORDER="$GRAPH_ORDER page_tables"
  [ -z "$VmallocUsed" ] || GRAPH_ORDER="$GRAPH_ORDER vmalloc_used"
  [ -z "$Slab" ] || GRAPH_ORDER="$GRAPH_ORDER slab"
  GRAPH_ORDER="$GRAPH_ORDER cached buffers free swap"
  # The fetched values are reported in bytes (kB << 10), so the upper limit
  # is converted the same way.
  cat <<EOF
graph_args --base 1024 -l 0 --vertical-label Bytes --upper-limit $((${MemTotal:-0} << 10))
graph_title Memory usage
graph_category system
graph_info This graph shows what the machine uses its memory for.
graph_order $GRAPH_ORDER
apps.label apps
apps.draw AREA
apps.info Memory used by user-space applications.
EOF
  if [ -n "$PageTables" ]; then
    echo "page_tables.label page_tables"
    echo "page_tables.draw STACK"
    echo "page_tables.info Memory used to map between virtual and physical memory addresses."
  fi
  if [ -n "$VmallocUsed" ]; then
    echo "vmalloc_used.label vmalloc_used"
    echo "vmalloc_used.draw STACK"
    echo "vmalloc_used.info Virtual memory used by the kernel (used when the memory does not have to be physically contigious)."
  fi
  if [ -n "$Slab" ]; then
    echo "slab.label slab_cache"
    echo "slab.draw STACK"
    echo "slab.info Memory used by the kernel (major users are caches like inode, dentry, etc)."
  fi
  cat <<EOF
buffers.label buffers
buffers.draw STACK
buffers.info Block device (e.g. harddisk) cache. Also where "dirty" blocks are stored until written.
cached.label cache
cached.draw STACK
cached.info Parked file data (file content) cache.
swap.label swap
swap.draw STACK
swap.info Swap space used.
free.label unused
free.draw STACK
free.info Wasted memory. Memory that is not used for anything at all.
EOF
  if [ -n "$Committed_AS" ]; then
    echo "committed.label committed"
    echo "committed.draw LINE2"
    # NOTE(review): munin's threshold attribute is "warning", and this value
    # is in kB while the field is reported in bytes. Left unchanged because
    # correcting it would activate a threshold that is inert today - confirm
    # the intent before changing.
    echo "committed.warn $((${SwapTotal:-0} + ${MemTotal:-0}))"
    echo "committed.info The amount of memory that would be used if all the memory that's been allocated were to be used."
  fi
#  Disabled graphs, kept for reference:
#  [ -z "$Mapped" ] || {
#    echo "mapped.label mapped"
#    echo "mapped.draw LINE2"
#    echo "mapped.info All mmap()ed pages."
#  }
#  [ -z "$Active" ] || {
#    echo "active.label active"
#    echo "active.draw LINE2"
#    echo "active.info Memory recently used. Not reclaimed unless absolutely necessary."
#  }
#  [ -z "$Inactive" ] || {
#    echo "inactive.label inactive"
#    echo "inactive.draw LINE2"
#    echo "inactive.info Memory not currently used."
#  }
}
314
# Print the current memory plugin values in bytes.
#
# Reads the memory figures (in kB) by evaluating the assignments printed by
# memorytypes and shifts every value left by 10 bits to report bytes.
fetch_memory() {
  eval $(memorytypes)
  # Missing figures count as 0 so that the arithmetic stays valid.
  Apps=$((${MemTotal:-0} - ${MemFree:-0} - ${Buffers:-0} - ${Cached:-0} - ${Slab:-0} - ${PageTables:-0}))
  Swap=$((${SwapTotal:-0} - ${SwapFree:-0}))
  # This adjustment has to run in the current shell: inside a ( ... )
  # subshell the new value of Apps was lost before it was printed.
  if [ -n "$VmallocUsed" ]; then
    echo "vmalloc_used.value $((VmallocUsed << 10))"
    Apps=$((Apps - VmallocUsed))
  fi
  # page_tables and slab are reported only when the figure is available.
  [ -z "$PageTables" ] || echo "page_tables.value $((PageTables << 10))"
  [ -z "$Slab" ] || echo "slab.value $((Slab << 10))"
  echo "apps.value $((Apps << 10))"
  echo "buffers.value $((${Buffers:-0} << 10))"
  echo "cached.value $((${Cached:-0} << 10))"
  echo "swap.value $((Swap << 10))"
  echo "free.value $((${MemFree:-0} << 10))"
  [ -z "$Committed_AS" ] || echo "committed.value $((Committed_AS << 10))"
#  Disabled graphs, kept for reference:
#  [ -z "$Mapped" ] || echo "mapped.value $((Mapped << 10))"
#  [ -z "$Active" ] || echo "active.value $((Active << 10))"
#  [ -z "$Inactive" ] || echo "inactive.value $((Inactive << 10))"
}
335
336
# Print the munin "config" output of the processes plugin.
config_processes() {
  cat <<EOF
graph_title Number of Processes
graph_args --base 1000 -l 0
graph_vlabel number of processes
graph_category system
graph_info This graph shows the number of processes in the system.
processes.label processes
processes.draw LINE2
processes.info The current number of processes.
EOF
}
349
# Print the number of entries in /proc whose name starts with a digit.
fetch_processes() {
  # Let the shell count the glob matches instead of piping them to wc; the
  # positional parameters changed here are local to the function call.
  set -- /proc/[0-9]*
  echo "processes.value $#"
}
353
# Print the munin "config" output of the netstat plugin.
config_netstat() {
  cat <<EOF
graph_title Netstat
graph_args -l 0 --base 1000
graph_vlabel active connections
graph_category network
graph_period second
graph_info This graph shows the TCP activity of all the network interfaces combined.
active.label active
active.type DERIVE
active.max 50000
active.min 0
active.info The number of active TCP openings per second.
passive.label passive
passive.type DERIVE
passive.max 50000
passive.min 0
passive.info The number of passive TCP openings per second.
failed.label failed
failed.type DERIVE
failed.max 50000
failed.min 0
failed.info The number of failed TCP connection attempts per second.
resets.label resets
resets.type DERIVE
resets.max 50000
resets.min 0
resets.info The number of TCP connection resets.
established.label established
established.type GAUGE
established.max 50000
established.info The number of currently open connections.
EOF
}
388
# Print the TCP counters of the netstat plugin.
#
# Globals: netstat (read) - netstat command supporting "-s"; plain "netstat"
#          is used when the variable is empty.
# Outputs: one value per field; "U" when a counter is not found.
fetch_netstat() {
  # One awk pass over "netstat -s". The patterns name the complete counter
  # description so that the TcpExt "... connections rejected because of time
  # stamp" lines are not mistaken for the openings counters, and only the
  # first match of each counter is kept. "connections?" accepts both
  # spellings of the active-openings line.
  "${netstat:-netstat}" -s 2>/dev/null | awk '
    /active connections? openings/ && active == ""   { active = $1 }
    /passive connection openings/ && passive == ""   { passive = $1 }
    /failed connection attempts/ && failed == ""     { failed = $1 }
    /connection resets received/ && resets == ""     { resets = $1 }
    /connections established/ && established == ""   { established = $1 }
    END {
      print "active.value " (active == "" ? "U" : active)
      print "passive.value " (passive == "" ? "U" : passive)
      print "failed.value " (failed == "" ? "U" : failed)
      print "resets.value " (resets == "" ? "U" : resets)
      print "established.value " (established == "" ? "U" : established)
    }'
}
399
400
401
# Print the munin "config" output of the clients plugin.
config_clients() {
  cat <<EOF
graph_title Connected clients
graph_args --base 1000 -l 0
graph_vlabel number of clients
graph_category network
graph_info This graph shows the number of clients.
clients.label clients
clients.draw LINE2
clients.info The current number of clients.
EOF
}
414
# Print the number of ARP entries on br-lan that have a non-zero hardware
# address.
#
# Arguments: $1 - ARP table to read (optional, defaults to /proc/net/arp)
fetch_clients() {
  # Lines must end in "br-lan"; entries with an all-zero hardware address
  # are not counted.
  clients_count=$(awk '/br-lan$/ && !/00:00:00:00:00:00/ { n++ } END { print n + 0 }' \
    "${1:-/proc/net/arp}" 2>/dev/null)
  echo "clients.value ${clients_count:-0}"
}
418
419
420
421
# Print the munin "config" output of the uptime plugin. The cdef divides
# the fetched value by 86400 so that the graph shows days.
config_uptime() {
  cat <<EOF
graph_title Uptime
graph_args --base 1000 -l 0
graph_vlabel uptime in days
uptime.label uptime
uptime.draw AREA
uptime.cdef uptime,86400,/
EOF
}
432
# Print the uptime plugin value: the first field of /proc/uptime, or "U"
# when the file cannot be read.
fetch_uptime() {
  uptime_secs=U
  # The read builtin splits the line without spawning cut.
  [ -r /proc/uptime ] && read -r uptime_secs uptime_rest < /proc/uptime
  echo "uptime.value $uptime_secs"
}
436
# Print the munin "config" output of the interrupts plugin.
config_interrupts() {
  cat <<EOF
graph_title Interrupts & context switches
graph_args --base 1000 -l 0
graph_vlabel interrupts & ctx switches / \${graph_period}
graph_category system
graph_info This graph shows the number of interrupts and context switches on the system. These are typically high on a busy system.
intr.info Interrupts are events that alter sequence of instructions executed by a processor. They can come from either hardware (exceptions, NMI, IRQ) or software.
ctx.info A context switch occurs when a multitasking operatings system suspends the currently running process, and starts executing another.
intr.label interrupts
ctx.label context switches
intr.type DERIVE
ctx.type DERIVE
intr.max 100000
ctx.max 100000
intr.min 0
ctx.min 0
EOF
}
456
# Print the context switch and interrupt totals: the second field of the
# "ctxt" and "intr" lines of /proc/stat.
fetch_interrupts() {
  # Both counters are collected in a single pass over the file.
  awk '
    /^ctxt/ { ctx = $2 }
    /^intr/ { intr = $2 }
    END {
      print "ctx.value " ctx
      print "intr.value " intr
    }' /proc/stat
}
461
# Print the munin "config" output of the irqstats plugin.
#
# Arguments: $1 - interrupt table to read (optional, defaults to
#                 /proc/interrupts)
# Outputs:   one i<ID> field per table line whose first column ends in a
#            colon. The label is the text that follows the per-CPU counters
#            and one further column, or the ID itself when no such text
#            exists.
config_irqstats() {
  # graph_args must not end in ";" - inside the quotes it is not a shell
  # separator but part of the emitted line.
  echo "graph_title Individual interrupts
graph_args --base 1000 -l 0
graph_vlabel interrupts / \${graph_period}
graph_category system"
  # The header line (CPU0 CPU1 ...) gives the number of counter columns.
  awk '
    NR == 1 && /CPU[0-9]/ { ncpu = NF; next }
    $1 ~ /:$/ {
      id = $1
      sub(/:$/, "", id)
      info = ""
      for (i = ncpu + 3; i <= NF; i++)
        info = info (info == "" ? "" : " ") $i
      if (info == "") {
        print "i" id ".label " id
      } else {
        print "i" id ".label " info
        print "i" id ".info Interrupt " id ", for device(s): " info
      }
      print "i" id ".type DERIVE"
      print "i" id ".min 0"
    }' "${1:-/proc/interrupts}"
}
483
# Print the per-interrupt totals of the irqstats plugin.
#
# Arguments: $1 - interrupt table to read (optional, defaults to
#                 /proc/interrupts)
# Outputs:   i<ID>.value with the sum of the per-CPU counters of each table
#            line whose first column ends in a colon.
fetch_irqstats() {
  # The header line gives the number of counter columns, so every CPU is
  # summed however many there are. Summing stops at the first non-numeric
  # column, which also covers tables without a header line.
  awk '
    NR == 1 && /CPU[0-9]/ { ncpu = NF; next }
    $1 ~ /:$/ {
      id = $1
      sub(/:$/, "", id)
      last = (ncpu > 0 ? ncpu + 1 : NF)
      sum = 0
      for (i = 2; i <= last && i <= NF; i++) {
        if ($i !~ /^[0-9]+$/) break
        sum += $i
      }
      printf "i%s.value %.0f\n", id, sum
    }' "${1:-/proc/interrupts}"
}
500
501# ===== NODE CODE =====
# "list" command: print the names of the available plugins on one line.
do_list() {
  # Deliberately unquoted so that surrounding and repeated blanks in the
  # list collapse to single separators.
  echo $PLUGINS
}
505
506
# "nodes" command: print the host name followed by the terminating "." line.
do_nodes() {
  printf '%s\n.\n' "$HOSTNAME"
}
511
# "config" command: run config_<plugin> for a known plugin.
#
# Arguments: $1 - plugin name as sent by the client
# Globals:   PLUGINS (read) - blank separated list of valid plugin names
# Outputs:   the plugin's config output, or "# Unknown service", followed by
#            the terminating "." line.
do_config() {
  # The name has to equal a list entry. A regular expression search would
  # let input such as "c.u" or an empty name through to a function that does
  # not exist.
  case "$1" in
    "" | *[[:space:]]*)
      echo "# Unknown service"
      ;;
    *)
      case " $PLUGINS " in
        *" $1 "*) "config_$1" ;;
        *) echo "# Unknown service" ;;
      esac
      ;;
  esac
  echo "."
}
520
# "fetch" command: run fetch_<plugin> for a known plugin.
#
# Arguments: $1 - plugin name as sent by the client
# Globals:   PLUGINS (read) - blank separated list of valid plugin names
# Outputs:   the plugin's values, or "# Unknown service", followed by the
#            terminating "." line.
do_fetch() {
  # The name has to equal a list entry. A regular expression search would
  # let input such as "c.u" or an empty name through to a function that does
  # not exist.
  case "$1" in
    "" | *[[:space:]]*)
      echo "# Unknown service"
      ;;
    *)
      case " $PLUGINS " in
        *" $1 "*) "fetch_$1" ;;
        *) echo "# Unknown service" ;;
      esac
      ;;
  esac
  echo "."
}
529
# "version" command: print the node banner with host name and $VERSION.
do_version() {
  printf 'munins node on %s version: %s (munin-lite)\n' "$HOSTNAME" "$VERSION"
}
533
# "quit" command: write the LED message saved in $oldmsg back to $led and
# leave the script with status 0.
do_quit() {
  # Only write where the LED file exists and is writable; elsewhere the
  # redirection would fail with an error message.
  if [ -n "$led" ] && [ -w "$led" ]; then
    echo "$oldmsg" > "$led"
  fi
  exit 0
}
# Run do_quit whenever the script ends; EXIT is the POSIX spelling of the
# condition name.
trap do_quit EXIT
539
540# ===== Runtime config =====
# Print the names of the network interfaces to graph, one per line: entries
# of /proc/net/dev named ppp, eth, wlan, ath or ra followed by digits.
list_graph_interfaces() {
  awk -F: '/^ *(ppp|eth|wlan|ath|ra)[0-9]+:/ { gsub(/ /, "", $1); print $1 }' /proc/net/dev 2>/dev/null
}

# Replace the plugin templates in $PLUGINS by concrete plugins.
#
# Globals: PLUGINS (read, rewritten), RES (written), df_extra and netstat
#          (read)
# if_ and if_err_ become one plugin per network interface and df_ becomes
# one plugin per device. For each of them fetch_/config_ wrapper functions
# are defined that call the generic function with the interface or device.
# netstat is kept only when "netstat -s" succeeds; all other names are
# copied unchanged.
expand_plugins() {
  RES=""
  for PLUG in $PLUGINS; do
    case "$PLUG" in
      if_ | if_err_)
        for INTER in $(list_graph_interfaces); do
          RES="$RES $PLUG$INTER"
          # ${PLUG%_} maps if_ to fetch_if and if_err_ to fetch_if_err.
          # \$@ is escaped so that the wrapper forwards its own arguments
          # when called, instead of baking in the script's arguments now.
          eval "fetch_$PLUG$INTER() { fetch_${PLUG%_} $INTER \"\$@\"; }"
          eval "config_$PLUG$INTER() { config_${PLUG%_} $INTER \"\$@\"; }"
        done
        ;;
      netstat)
        if "${netstat:-netstat}" -s >/dev/null 2>&1; then
          RES="$RES netstat"
        fi
        ;;
      df_)
        for dev in $df_extra $(awk '/^\/dev/ { print $1 }' /proc/mounts 2>/dev/null | sort -u); do
          # The name becomes part of function and field names, so anything
          # but letters, digits and "_" is replaced by "_".
          basedev=$(basename "$dev" | tr -c 'A-Za-z0-9_\n' '_')
          RES="$RES df_$basedev"
          eval "fetch_df_$basedev() { fetch_df $basedev \"$dev\" \"\$@\"; }"
          eval "config_df_$basedev() { config_df $basedev \"$dev\" \"\$@\"; }"
        done
        ;;
      *)
        RES="$RES $PLUG"
        ;;
    esac
  done
  PLUGINS=$RES
}
expand_plugins
574
575# ===== MAIN LOOP =====
# Commands understood by the node; each one is implemented by do_<command>.
FUNCTIONS="list nodes config fetch version quit"
# Prefer the host name configured in uci and fall back to the kernel's.
HOSTNAME=$(/sbin/uci get "system.@system[0].hostname" 2>/dev/null || cat /proc/sys/kernel/hostname)
echo "# munin node at $HOSTNAME"
# Read one command per line from stdin: the first word is the command, the
# rest of the line its argument.
while read -r arg0 arg1; do
  # Input sanitation: keep only characters that can occur in command and
  # plugin names. This drops quotes, backslashes and the carriage return
  # sent by telnet-style clients.
  arg0=$(printf '%s' "$arg0" | tr -cd 'A-Za-z0-9_.-')
  arg1=$(printf '%s' "$arg1" | tr -cd 'A-Za-z0-9_. -')
  # The command has to equal an entry of $FUNCTIONS. An empty command is
  # replaced by "?", which is not in the list.
  case " $FUNCTIONS " in
    *" ${arg0:-?} "*)
      # $arg1 is left unquoted on purpose: only its words are passed on.
      "do_$arg0" $arg1
      ;;
    *)
      # Builds "list, nodes, config, fetch, version or quit" from the list.
      echo "# Unknown command. Try" $(echo "$FUNCTIONS" | sed -e 's/\( [[:alpha:]]\{1,\}\)/,\1/g' -e 's/,\( [[:alpha:]]\{1,\}\)$/ or\1/')
      ;;
  esac
done
589