Ticket #7167: munin-node.4

File munin-node.4, 18.1 KB (added by L.Schmidt <l.openwrt at scarydevilmonastery.net>, 6 years ago)

Typo "AtiveAnon" instead of "ActiveAnon" prevented the graph from updating. Fixed.

Line 
1#!/bin/sh
2#
3# Simple Bourne Shell script that implements the Munin protocol and
4# some common Linux plugins.
5#
6# For latest version, see http://muninlite.sf.net/
7#
8# Copyright (c) 2007 Rune Nordbøe Skillingstad <rune@skillingstad.no>
9#
10# Licensed under GPLv2 (see LICENSE file for full License)
11#
12# $Id: $
13#
14
VERSION="1.0.2d"
# 2010apr16, v1.0.2d, L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
#    extra df devices can be specified through df_extra variable
#    memorytypes collapsed to a single line, instead of repeated parse pipes. Much faster
# 2010apr16, v1.0.2c, L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
#    factored fetch_mem and config_mem
#    memory type names in /proc/meminfo updated for recent kernel - was still parsing names
#       which weren't in use, and missed existing names
#    removed some unneeded cats
# 2010apr15, v1.0.2b, L.Schmidt (bushmills.openwrt@scarydevilmonastery.net)
#    added graphs for used/remaining device storage space.
#    fixed if bug which caused munin_limits to flood admin with error emails
#    fetch_cpu speedup
#    netstat if_ if_err_ plugins selection by presence of required netstat and ethtool programs
#    (busybox version of netstat has no --statistics switch which is used here)
#    changed many "grep ... | cut ..." against  "awk /.../ {print ..}"  because latter is about 50% faster

# Locate the optional helper programs. `command -v` is the POSIX way to do
# what `which` did; each variable stays empty when the tool is not installed.
netstat=$(command -v netstat 2>/dev/null)
ethtool=$(command -v ethtool 2>/dev/null)

df_extra="tmpfs"      # in addition to the autoprobed devices, graph also these (space delimited list)
                      # add device from first column of mount/df output.

# Remove unwanted plugins from this list
PLUGINS="df_ cpu load memory processes uptime interrupts irqstats"

# netstat doesn't work with busybox netstat - needs net-tools netstat.
# Assume that if netstat is executable and not a symlink, a proper
# (non-busybox) netstat is installed, in which case plugin netstat is added.
# The expansions are quoted and checked for emptiness: an unquoted empty
# variable would make the test degenerate into `[ -x ]`, which is always true.
if [ -n "$netstat" ] && [ -x "$netstat" ] && [ ! -h "$netstat" ]; then
   PLUGINS="netstat $PLUGINS"
fi

# if_ needs ethtool.
if [ -n "$ethtool" ] && [ -x "$ethtool" ]; then
   PLUGINS="if_err_ if_ $PLUGINS"
fi
52
53# ===== PLUGINS CODE =====
54
# Print the Munin "config" reply for the cpu plugin.
#
# Reads /proc/stat twice: once to see whether the aggregate "cpu" line has at
# least seven numeric columns (then iowait/irq/softirq are graphed too), and
# once to count the per-CPU "cpuN" lines. The graph ceiling and the warning /
# critical thresholds are scaled to 100% per CPU.
# Arguments: none.  Outputs: config lines on stdout.
config_cpu() {
  local extra ncpu percent
  extra=""
  if grep '^cpu \{1,\}[0-9]\{1,\} \{1,\}[0-9]\{1,\} \{1,\}[0-9]\{1,\} \{1,\}[0-9]\{1,\} \{1,\}[0-9]\{1,\} \{1,\}[0-9]\{1,\} \{1,\}[0-9]\{1,\}' /proc/stat >/dev/null 2>&1; then
    extra="iowait irq softirq"
  fi
  # Count "cpu0", "cpu1", ... directly. The former pattern '^cpu. ' only
  # matched single-digit CPU numbers and so undercounted on larger machines.
  ncpu=$(grep -c '^cpu[0-9]' /proc/stat 2>/dev/null)
  [ "${ncpu:-0}" -ge 1 ] 2>/dev/null || ncpu=1
  percent=$((ncpu * 100))
  cat <<EOF
graph_title CPU usage
graph_order system user nice idle $extra
graph_args --base 1000 -r --lower-limit 0 --upper-limit $percent
graph_vlabel %
graph_scale no
graph_info This graph shows how CPU time is spent.
graph_category system
graph_period second
system.label system
system.draw AREA
system.max 5000
system.min 0
system.type DERIVE
system.warning $((percent * 30 / 100))
system.critical $((percent * 50 / 100))
system.info CPU time spent by the kernel in system activities
user.label user
user.draw STACK
user.min 0
user.max 5000
user.warning $((percent * 80 / 100))
user.type DERIVE
user.info CPU time spent by normal programs and daemons
nice.label nice
nice.draw STACK
nice.min 0
nice.max 5000
nice.type DERIVE
nice.info CPU time spent by nice(1)d programs
idle.label idle
idle.draw STACK
idle.min 0
idle.max 5000
idle.type DERIVE
idle.info Idle CPU time
EOF
  if [ -n "$extra" ]; then
    cat <<'EOF'
iowait.label iowait
iowait.draw STACK
iowait.min 0
iowait.max 5000
iowait.type DERIVE
iowait.info CPU time spent waiting for I/O operations to finish
irq.label irq
irq.draw STACK
irq.min 0
irq.max 5000
irq.type DERIVE
irq.info CPU time spent handling interrupts
softirq.label softirq
softirq.draw STACK
softirq.min 0
softirq.max 5000
softirq.type DERIVE
softirq.info CPU time spent handling "batched" interrupts
EOF
  fi
}
122
# Print the current values for the cpu plugin.
#
# Takes the counters from the aggregate "cpu" line of /proc/stat using only
# the shell's `read` builtin (no `let`, which is not POSIX, and no external
# processes). iowait/irq/softirq are printed only when the line carries at
# least seven counters.
# Returns: 1 without output when no "cpu" line can be read.
fetch_cpu() {
  local tag user nice system idle iowait irq softirq rest
  tag=""
  while read -r tag user nice system idle iowait irq softirq rest; do
    if [ "$tag" = "cpu" ]; then
      break
    fi
  done < /proc/stat
  [ "$tag" = "cpu" ] || return 1
  echo "user.value $user
nice.value $nice
system.value $system
idle.value $idle"
  if [ -n "$softirq" ]; then
    echo "iowait.value $iowait
irq.value $irq
softirq.value $softirq"
  fi
}
143
# Turn a device name into a fragment that is safe inside a Munin field name:
# every character outside [A-Za-z0-9_] (e.g. the '-' in "dm-0") becomes '_'.
df_fieldname() {
  printf '%s' "$1" | tr -c 'A-Za-z0-9_' '_'
}

# Print the Munin "config" reply for one storage volume.
# $1 - volume name (used verbatim in titles/labels, sanitized in field names)
# $2 - device as shown in the first column of mount/df output (unused here)
config_df() {
  local field
  field=$(df_fieldname "$1")
  echo "graph_title $1
graph_vlabel capacity
graph_category system
graph_args --base 1000 -l 0
storageused_$field.label used $1
storageused_$field.draw AREA
storagefree_$field.label free $1
storagefree_$field.draw STACK"
}

# Print used/free bytes for one storage volume.
# $1 - volume name (same as passed to config_df)
# $2 - device as shown in the first column of /proc/mounts
# The device is matched exactly (not by prefix), its first mount point is
# looked up in /proc/mounts, and `df -Pk` is asked for one-line-per-filesystem
# output in 1 KiB blocks, which is what the *1024 conversion assumes.
# Returns: 1 without output when the device is not mounted.
fetch_df() {
  local field mountpoint
  field=$(df_fieldname "$1")
  mountpoint=$(awk -v dev="$2" '$1 == dev { print $2; exit }' /proc/mounts 2>/dev/null)
  [ -n "$mountpoint" ] || return 1
  # %.0f keeps large byte counts in plain decimal notation.
  df -Pk "$mountpoint" 2>/dev/null | awk -v dev="$2" -v field="$field" '
    $1 == dev {
      printf "storageused_%s.value %.0f\n", field, $3 * 1024
      printf "storagefree_%s.value %.0f\n", field, $4 * 1024
      exit
    }'
}
161
162
# Print the Munin "config" reply for the traffic graph of one interface.
# $1 - interface name (e.g. eth0)
# Globals: ethtool (read) - path of the ethtool binary, may be empty.
# When ethtool reports a numeric link speed in Mb/s, up.max/down.max are
# emitted as that speed in bits per second; otherwise they are omitted.
config_if() {
  local speed
  cat <<EOF
graph_order down up
graph_title $1 traffic
graph_args --base 1000
graph_vlabel bits in (-) / out (+) per \${graph_period}
graph_category network
graph_info This graph shows the traffic of the $1 network interface. Please note that the traffic is shown in bits per second, not bytes. IMPORTANT: Since the data source for this plugin use 32bit counters, this plugin is really unreliable and unsuitable for most 100Mb (or faster) interfaces, where bursts are expected to exceed 50Mbps. This means that this plugin is usuitable for most production environments. To avoid this problem, use the ip_ plugin instead.
down.label received
down.type DERIVE
down.min 0
down.graph no
down.cdef down,8,*
up.label bps
up.type DERIVE
up.min 0
up.negative down
up.cdef up,8,*
EOF
  # Without ethtool there is no speed to report; never execute an empty name.
  [ -n "$ethtool" ] || return 0
  # Take the first "Speed:" line and keep it only if it is "<digits>Mb...";
  # values such as "Unknown!" yield an empty string instead of a math error.
  speed=$("$ethtool" "$1" 2>/dev/null |
    sed -n -e '/Speed:/!d' \
           -e 's/^.*Speed:[[:space:]]*\([0-9]\{1,\}\)Mb.*$/\1/p' \
           -e q)
  if [ -n "$speed" ]; then
    echo "up.max $((speed * 1000000))"
    echo "down.max $((speed * 1000000))"
  fi
}
# Print received/transmitted byte counters for one interface.
# $1 - interface name; it must equal the name in /proc/net/dev exactly, so
#      "eth0" no longer also matches "veth0".
# The first ':' of each line is turned into a blank so that both "eth0: 123"
# and "eth0:123" split into name + counters. Column 1 after the name is the
# receive byte count and column 9 the transmit byte count.
# Prints "U" as both values when the interface is not listed.
fetch_if() {
  sed 's/:/ /' /proc/net/dev 2>/dev/null | awk -v ifc="$1" '
    $1 == ifc { rx = $2; tx = $10; found = 1; exit }
    END {
      if (!found) { rx = "U"; tx = "U" }
      print "down.value " rx
      print "up.value " tx
    }'
}
# Print the Munin "config" reply for the error graph of one interface.
# $1 - interface name, used in the title and the description.
config_if_err() {
  cat <<EOF
graph_order rcvd trans
graph_title $1 errors
graph_args --base 1000
graph_vlabel packets in (-) / out (+) per \${graph_period}
graph_category network
graph_info This graph shows the amount of errors on the $1 network interface.
rcvd.label packets
rcvd.type COUNTER
rcvd.graph no
rcvd.warning 1
trans.label packets
trans.type COUNTER
trans.negative rcvd
trans.warning 1
EOF
}
# Print receive/transmit error counters for one interface.
# $1 - interface name; matched exactly against the names in /proc/net/dev
#      (the previous substring match let "eth0" pick up "veth0").
# After the name, counter 3 is the receive error count and counter 11 the
# transmit error count. Prints "U" for both when the interface is not listed.
fetch_if_err() {
  sed 's/:/ /' /proc/net/dev 2>/dev/null | awk -v ifc="$1" '
    $1 == ifc { rx = $4; tx = $12; found = 1; exit }
    END {
      if (!found) { rx = "U"; tx = "U" }
      print "rcvd.value " rx
      print "trans.value " tx
    }'
}
212
# Print the Munin "config" reply for the load plugin.
# The text is constant; printf with single-quoted arguments avoids any
# shell-dependent backslash handling in echo.
config_load() {
  printf '%s\n' \
    'graph_title Load average' \
    'graph_args --base 1000 -l 0' \
    'graph_vlabel load' \
    'graph_scale no' \
    'graph_category system' \
    'load.label load' \
    'load.warning 10' \
    'load.critical 120' \
    'graph_info The load average of the machine describes how many processes are in the run-queue (scheduled to run "immediately").' \
    'load.info Average load for the five minutes.'
}
225
# Print the load value: the second blank-separated field of /proc/loadavg.
# Uses the `read` builtin, so no external process is started.
fetch_load() {
  local first second rest
  second=""
  read -r first second rest < /proc/loadavg
  echo "load.value $second"
}
229
# Convert /proc/meminfo into shell assignments, one "Name=value" per line.
# Blanks and parentheses are dropped ("Active(anon)" becomes "Activeanon"),
# the ':' becomes '=' and a trailing "kB" is removed. Callers pass the output
# to eval, so only lines that end up as <identifier>=<digits> are printed;
# anything else is silently skipped.
memorytypes() {
  sed -n -e 's/[ ()]//g' \
         -e 's/^\([A-Za-z_][A-Za-z0-9_]*\):\([0-9]\{1,\}\)\(kB\)\{0,1\}$/\1=\2/p' \
         /proc/meminfo
}
233
234
# Print the Munin "config" reply for the memory plugin.
#
# Evaluates the assignments printed by memorytypes (this sets globals such as
# MemTotal, Slab, PageTables) BEFORE building the header, so that the graph
# order and --upper-limit are based on real values. The previous version read
# upper-case names (MEMTOTAL, SLAB, ...) that nothing ever set.
# Values from /proc/meminfo are in kB while the plugin reports bytes, hence
# the *1024 on the upper limit.
config_memory() {
  local order
  eval "$(memorytypes)"
  order="apps${PageTables:+ page_tables}${SwapCached:+ swap_cache}${VmallocUsed:+ vmalloc_used}${Slab:+ slab} cached buffers free swap"

  cat <<EOF
graph_args --base 1024 -l 0 --vertical-label Bytes --upper-limit $((${MemTotal:-0} * 1024))
graph_title Memory usage
graph_category system
graph_info This graph shows what the machine uses its memory for.
graph_order $order
apps.label apps
apps.draw AREA
apps.info Memory used by user-space applications.
buffers.label buffers
buffers.draw STACK
buffers.info Block device (e.g. harddisk) cache. Also where "dirty" blocks are stored until written.
swap.label swap
swap.draw STACK
swap.info Swap space used.
cached.label cache
cached.draw STACK
cached.info Parked file data (file content) cache.
free.label unused
free.draw STACK
free.info Wasted memory. Memory that is not used for anything at all.
EOF

  # Optional fields: described only when /proc/meminfo provides them.
  if [ -n "$Slab" ]; then
    echo "slab.label slab_cache"
    echo "slab.draw STACK"
    echo "slab.info Memory used by the kernel (major users are caches like inode, dentry, etc)."
  fi
  if [ -n "$SwapCached" ]; then
    echo "swap_cache.label swap_cache"
    echo "swap_cache.draw STACK"
    echo "swap_cache.info A piece of memory that keeps track of pages that have been fetched from swap but not yet been modified."
  fi
  if [ -n "$PageTables" ]; then
    echo "page_tables.label page_tables"
    echo "page_tables.draw STACK"
    echo "page_tables.info Memory used to map between virtual and physical memory addresses."
  fi
  if [ -n "$VmallocUsed" ]; then
    echo "vmalloc_used.label vmalloc_used"
    echo "vmalloc_used.draw STACK"
    echo "vmalloc_used.info Virtual memory used by the kernel (used when the memory does not have to be physically contigious)."
  fi
  if [ -n "$Committed_AS" ]; then
    echo "committed.label committed"
    echo "committed.draw LINE2"
    # NOTE(review): attribute name and kB unit kept as they were; confirm
    # whether "committed.warning" in bytes was intended.
    echo "committed.warn $((${SwapTotal:-0} + ${MemTotal:-0}))"
    echo "committed.info The amount of memory that would be used if all the memory that's been allocated were to be used."
  fi
  if [ -n "$Mapped" ]; then
    echo "mapped.label mapped"
    echo "mapped.draw LINE2"
    echo "mapped.info All mmap()ed pages."
  fi
  if [ -n "$Active" ]; then
    echo "active.label active"
    echo "active.draw LINE2"
    echo "active.info Memory recently used. Not reclaimed unless absolutely necessary."
  fi
  if [ -n "$Activeanon" ]; then
    echo "active_anon.label active(anon)"
    echo "active_anon.draw LINE1"
  fi
  if [ -n "$Activefile" ]; then
    echo "active_file.label active(file)"
    echo "active_file.draw LINE1"
  fi
  if [ -n "$Inactive" ]; then
    echo "inactive.label inactive"
    echo "inactive.draw LINE2"
    echo "inactive.info Memory not currently used."
  fi
  if [ -n "$Inactivefile" ]; then
    echo "inactfile.label inactive(file)"
    echo "inactfile.draw LINE2"
  fi
  if [ -n "$Inactiveanon" ]; then
    echo "inactanon.label inactive(anon)"
    echo "inactanon.draw LINE2"
  fi
}
323
# Print "<field>.value <bytes>" for a quantity given in kB.
# $1 - Munin field name, $2 - amount in kB
mem_kb_value() {
  echo "$1.value $(($2 * 1024))"
}

# Print the current values for the memory plugin.
#
# Evaluates the assignments printed by memorytypes (sets globals such as
# MemTotal, MemFree, Slab) and reports each quantity in bytes. "apps" is what
# remains of MemTotal after free, buffers, cache and, when present, slab,
# swap cache, page tables and vmalloc have been subtracted; it is printed
# last. Optional fields are printed only when /proc/meminfo provides them.
fetch_memory() {
  local apps
  eval "$(memorytypes)"
  apps=$((MemTotal - MemFree - Buffers - Cached))
  mem_kb_value buffers "${Buffers:-0}"
  mem_kb_value swap "$((SwapTotal - SwapFree))"
  mem_kb_value cached "${Cached:-0}"
  mem_kb_value free "${MemFree:-0}"

  # These four are part of the stacked graph and reduce the "apps" share.
  if [ -n "$Slab" ]; then
    mem_kb_value slab "$Slab"
    apps=$((apps - Slab))
  fi
  if [ -n "$SwapCached" ]; then
    mem_kb_value swap_cache "$SwapCached"
    apps=$((apps - SwapCached))
  fi
  if [ -n "$PageTables" ]; then
    mem_kb_value page_tables "$PageTables"
    apps=$((apps - PageTables))
  fi
  if [ -n "$VmallocUsed" ]; then
    mem_kb_value vmalloc_used "$VmallocUsed"
    apps=$((apps - VmallocUsed))
  fi

  # Overlay lines; they do not affect "apps".
  if [ -n "$Committed_AS" ]; then mem_kb_value committed "$Committed_AS"; fi
  if [ -n "$Mapped" ]; then mem_kb_value mapped "$Mapped"; fi
  # The former test compared the literal word "Active" and was always true.
  if [ -n "$Active" ]; then mem_kb_value active "$Active"; fi
  if [ -n "$Activeanon" ]; then mem_kb_value active_anon "$Activeanon"; fi
  if [ -n "$Activefile" ]; then mem_kb_value active_file "$Activefile"; fi
  if [ -n "$Inactive" ]; then mem_kb_value inactive "$Inactive"; fi
  if [ -n "$Inactivefile" ]; then mem_kb_value inactfile "$Inactivefile"; fi
  if [ -n "$Inactiveanon" ]; then mem_kb_value inactanon "$Inactiveanon"; fi

  mem_kb_value apps "$apps"
}
375
# Print the Munin "config" reply for the processes plugin (constant text).
# The trailing blank in the graph_args line is kept as it was.
config_processes() {
  printf '%s\n' \
    'graph_title Number of Processes' \
    'graph_args --base 1000 -l 0 ' \
    'graph_vlabel number of processes' \
    'graph_category processes' \
    'graph_info This graph shows the number of processes in the system.' \
    'processes.label processes' \
    'processes.draw LINE2' \
    'processes.info The current number of processes.'
}
386
# Print the number of processes: the count of /proc entries whose name starts
# with a digit. The glob is expanded into the function's own positional
# parameters, so counting needs no external process. As before, a glob that
# matches nothing counts as one word.
fetch_processes() {
  set -- /proc/[0-9]*
  echo "processes.value $#"
}
390
# Print the Munin "config" reply for the netstat plugin (constant text):
# four DERIVE counters (active, passive, failed, resets) and one GAUGE
# (established).
config_netstat() {
  printf '%s\n' \
    'graph_title Netstat' \
    'graph_args -l 0 --base 1000' \
    'graph_vlabel active connections' \
    'graph_category network' \
    'graph_period second' \
    'graph_info This graph shows the TCP activity of all the network interfaces combined.' \
    'active.label active' \
    'active.type DERIVE' \
    'active.max 50000' \
    'active.min 0' \
    'active.info The number of active TCP openings per second.' \
    'passive.label passive' \
    'passive.type DERIVE' \
    'passive.max 50000' \
    'passive.min 0' \
    'passive.info The number of passive TCP openings per second.' \
    'failed.label failed' \
    'failed.type DERIVE' \
    'failed.max 50000' \
    'failed.min 0' \
    'failed.info The number of failed TCP connection attempts per second.' \
    'resets.label resets' \
    'resets.type DERIVE' \
    'resets.max 50000' \
    'resets.min 0' \
    'resets.info The number of TCP connection resets.' \
    'established.label established' \
    'established.type GAUGE' \
    'established.max 50000' \
    'established.info The number of currently open connections.'
}
423
# Print the current values for the netstat plugin.
# Globals: netstat (read) - path of a netstat that understands -s; may be
#          empty, in which case nothing is executed.
# Runs `netstat -s` once and lets a single awk pick the first line matching
# each counter description; the leading number of that line is the value.
# "active connection" matches both the "active connections openings" and the
# "active connection openings" spelling. A counter that is not found is
# reported as "U".
fetch_netstat() {
  {
    if [ -n "$netstat" ]; then
      "$netstat" -s 2>/dev/null
    fi
  } | awk '
    function val(v) { return v == "" ? "U" : v }
    /active connection/       && active == ""      { active = $1 }
    /passive connection/      && passive == ""     { passive = $1 }
    /failed connection/       && failed == ""      { failed = $1 }
    /connection resets/       && resets == ""      { resets = $1 }
    /connections established/ && established == "" { established = $1 }
    END {
      print "active.value " val(active)
      print "passive.value " val(passive)
      print "failed.value " val(failed)
      print "resets.value " val(resets)
      print "established.value " val(established)
    }'
}
432
# Print the Munin "config" reply for the uptime plugin (constant text).
# The cdef divides the reported seconds by 86400 so the graph shows days.
config_uptime() {
  printf '%s\n' \
    'graph_title Uptime' \
    'graph_args --base 1000 -l 0 ' \
    'graph_vlabel uptime in days' \
    'uptime.label uptime' \
    'uptime.draw AREA' \
    'uptime.cdef uptime,86400,/'
}
441
# Print the uptime value: the first blank-separated field of /proc/uptime,
# read with the shell builtin instead of an external `cut`.
fetch_uptime() {
  local up rest
  up=""
  read -r up rest < /proc/uptime
  echo "uptime.value $up"
}
445
# Print the Munin "config" reply for the interrupts plugin (constant text).
# The here-document delimiter is quoted, so ${graph_period} is passed through
# literally without needing a backslash.
config_interrupts() {
  cat <<'EOF'
graph_title Interrupts & context switches
graph_args --base 1000 -l 0
graph_vlabel interrupts & ctx switches / ${graph_period}
graph_category system
graph_info This graph shows the number of interrupts and context switches on the system. These are typically high on a busy system.
intr.info Interrupts are events that alter sequence of instructions executed by a processor. They can come from either hardware (exceptions, NMI, IRQ) or software.
ctx.info A context switch occurs when a multitasking operatings system suspends the currently running process, and starts executing another.
intr.label interrupts
ctx.label context switches
intr.type DERIVE
ctx.type DERIVE
intr.max 100000
ctx.max 100000
intr.min 0
ctx.min 0
EOF
}
465
# Print context-switch and interrupt totals from /proc/stat.
# A single awk pass remembers the second field of the "ctxt" and "intr"
# lines and prints them in the fixed order ctx, intr.
fetch_interrupts() {
  awk '
    /^ctxt/ { ctx = $2 }
    /^intr/ { irqs = $2 }
    END {
      print "ctx.value " ctx
      print "intr.value " irqs
    }' /proc/stat
}
470
# Print the Munin "config" reply for the irqstats plugin.
#
# /proc/interrupts is parsed in one awk pass (no eval, no per-line re-grep):
#  - the first line lists the CPU columns; its field count is the CPU count;
#  - every later line with a ':' after at least one character is an entry
#    whose ID is the text before the ':';
#  - the description is everything from field 3+CPUs onward, i.e. the text
#    after the per-CPU counters and the word following them.
# Entries without a description are labelled with their ID.
config_irqstats() {
  echo "graph_title Individual interrupts
graph_args --base 1000 -l 0
graph_vlabel interrupts / \${graph_period}
graph_category system"
  awk '
    NR == 1 { ncpu = NF; next }
    {
      colon = index($0, ":")
      if (colon < 2) next
      id = substr($0, 1, colon - 1)
      gsub(/[ \t]/, "", id)
      if (id == "") next
      info = ""
      for (i = 3 + ncpu; i <= NF; i++)
        info = (info == "" ? $i : info " " $i)
      if (info == "") {
        print "i" id ".label " id
      } else {
        print "i" id ".label " info
        print "i" id ".info Interrupt " id ", for device(s): " info
      }
      print "i" id ".type DERIVE"
      print "i" id ".min 0"
    }' /proc/interrupts
}
492
# Print the current value of every entry in /proc/interrupts.
#
# One awk pass: the field count of the first line is the number of CPU
# columns; for each later line with a ':' after at least one character, the
# ID is the text before the ':' and the value is the sum of the per-CPU
# counters (fields 2 .. 1+CPUs, as far as present). %.0f keeps large sums in
# plain decimal notation.
fetch_irqstats() {
  awk '
    NR == 1 { ncpu = NF; next }
    {
      colon = index($0, ":")
      if (colon < 2) next
      id = substr($0, 1, colon - 1)
      gsub(/[ \t]/, "", id)
      if (id == "") next
      total = 0
      for (i = 2; i <= 1 + ncpu && i <= NF; i++)
        total += $i
      printf "i%s.value %.0f\n", id, total
    }' /proc/interrupts
}
508
509# ===== NODE CODE =====
# Answer the "list" command: print all plugin names on one line.
# $PLUGINS is split on purpose (it may start with a blank and is a plain
# space-delimited list); the words are re-joined with single blanks.
do_list() {
  set -- $PLUGINS
  echo "$*"
}
513
514
# Answer the "nodes" command: the host name followed by the "." line that
# ends a multi-line reply.
# Globals: HOSTNAME (read)
do_nodes() {
  printf '%s\n.\n' "$HOSTNAME"
}
519
# Answer the "config <plugin>" command.
# $1 - plugin name as sent by the client (untrusted)
# Globals: PLUGINS (read) - space-delimited list of enabled plugins
# The name must be exactly equal to one entry of $PLUGINS; it is compared as
# a plain string, never used as a regular expression. Only then is the
# matching config_<plugin> function called. The reply always ends with ".".
do_config() {
  local candidate
  for candidate in $PLUGINS; do
    if [ "$candidate" = "$1" ]; then
      "config_$1"
      echo "."
      return
    fi
  done
  echo "# Unknown service"
  echo "."
}
528
# Answer the "fetch <plugin>" command.
# $1 - plugin name as sent by the client (untrusted)
# Globals: PLUGINS (read) - space-delimited list of enabled plugins
# The name must be exactly equal to one entry of $PLUGINS; it is compared as
# a plain string, never used as a regular expression. Only then is the
# matching fetch_<plugin> function called. The reply always ends with ".".
do_fetch() {
  local candidate
  for candidate in $PLUGINS; do
    if [ "$candidate" = "$1" ]; then
      "fetch_$1"
      echo "."
      return
    fi
  done
  echo "# Unknown service"
  echo "."
}
537
# Answer the "version" command with a single line.
# Globals: HOSTNAME, VERSION (read)
do_version() {
  printf '%s\n' "munins node on $HOSTNAME version: $VERSION (munin-lite)"
}
541
# Answer the "quit" command: terminate the node with exit status 0.
do_quit() {
  exit 0
}
545
# ===== Runtime config =====

# Print the names of the network interfaces to graph: entries of
# /proc/net/dev called ppp/eth/wlan/ath/ra followed by a number.
# Uses an extended regular expression instead of the GNU-only "\|" in a BRE.
list_net_interfaces() {
  sed -e 's/:.*//' -e 's/ //g' /proc/net/dev 2>/dev/null |
    grep -E '^(ppp|eth|wlan|ath|ra)[0-9]+$'
}

# Expand the template entries of $PLUGINS into concrete plugins.
#   if_, if_err_ - one plugin per interface from list_net_interfaces; wrapper
#                  functions fetch_/config_<plugin><iface> call the generic
#                  fetch_if / fetch_if_err / config_if / config_if_err.
#   netstat      - kept only if "$netstat -s" actually works.
#   df_          - one plugin per device in $df_extra and per /dev/* device in
#                  /proc/mounts. The plugin name uses the device's basename
#                  with every character outside [A-Za-z0-9_] replaced by '_',
#                  so that it is a valid function name in every shell.
#   anything else is kept unchanged.
# Globals: PLUGINS (read and replaced), df_extra, netstat (read)
# The wrappers take no arguments of their own; the values are baked in as
# single-quoted words when the wrapper is defined.
expand_plugins() {
  local res plug inter dev basedev
  res=""
  for plug in $PLUGINS; do
    case "$plug" in
      if_|if_err_)
        for inter in $(list_net_interfaces); do
          res="$res ${plug}${inter}"
          eval "fetch_${plug}${inter}() { fetch_${plug%_} '${inter}'; }"
          eval "config_${plug}${inter}() { config_${plug%_} '${inter}'; }"
        done
        ;;
      netstat)
        if [ -n "$netstat" ] && "$netstat" -s >/dev/null 2>&1; then
          res="$res netstat"
        fi
        ;;
      df_)
        for dev in $df_extra $(awk '/^\/dev/ {print $1}' /proc/mounts 2>/dev/null | sort -u); do
          # A quote inside the name could not be embedded safely below.
          case "$dev" in *\'*) continue ;; esac
          basedev=$(basename "$dev" | tr -c 'A-Za-z0-9_\n' '_')
          res="$res df_$basedev"
          eval "fetch_df_${basedev}() { fetch_df '${basedev}' '${dev}'; }"
          eval "config_df_${basedev}() { config_df '${basedev}' '${dev}'; }"
        done
        ;;
      *)
        res="$res $plug"
        ;;
    esac
  done
  PLUGINS=$res
}
expand_plugins
580
# ===== MAIN LOOP =====
# Reads one command per line from stdin ("<command> [<plugin>]") and calls
# the matching do_<command> function with the plugin name as its argument.
FUNCTIONS="list nodes config fetch version quit"
HOSTNAME=$(/sbin/uci get "system.@system[0].hostname" 2>/dev/null || cat /proc/sys/kernel/hostname)

# Hint shown for unknown commands, e.g. "list, nodes, ... version or quit".
# It only depends on $FUNCTIONS, so it is built once, outside the loop.
COMMAND_HINT=$(echo "$FUNCTIONS" | sed -e 's/\( [[:alpha:]]\{1,\}\)/,\1/g' -e 's/,\( [[:alpha:]]\{1,\}\)$/ or\1/')
CR=$(printf '\r')

echo "# munin node at $HOSTNAME"
# -r keeps backslashes literal; only the first two words of a line are used.
while read -r arg0 arg1 rest
do
  # Clients that end lines with CR LF leave a CR on the last word.
  arg0=${arg0%"$CR"}
  arg1=${arg1%"$CR"}
  # The command must be exactly one of the words in $FUNCTIONS. It is matched
  # as a literal string, so an empty line or "." is rejected instead of being
  # turned into a call of "do_" or "do_.".
  case " $FUNCTIONS " in
    *" $arg0 "*)
      "do_$arg0" "$arg1"
      ;;
    *)
      echo "# Unknown command. Try $COMMAND_HINT"
      ;;
  esac
done
595