#!/bin/bash
# vim: cindent:shiftwidth=4:tabstop=4:smarttab:textwidth=100

# Shell options for the whole plugin: POSIX mode, abort on an unhandled failing
# command, fail a pipeline when any of its stages fails, and treat the
# expansion of an unset variable as an error.
set -o posix -o errexit -o pipefail -o nounset
# uncomment to trace every command while debugging
#set -o xtrace

#$title$ Linux kernel and Xen hypervisor ring buffer analysis
#$check$ suspicious dmesg entries
#$ref$ dmesg(1), xl(1)
#$author$ Rafal Rzeczkowski
#$version$ 1.3.3

# NOTE(review): level_check is not defined in this file; presumably the calling
# framework provides it and uses it to gate this plugin on the 'short' check
# level (cf. changelog 0.51) -- confirm against the supervisor.
level_check short

#CHANGELOG
#0.50	initial
#0.51	level=short
#0.52	add check for critical-level errors
#0.53	add check for mptscsih driver bad status
#0.54	add check for Xen vif disconnect
#0.60	unified Xen/Linux dmesg check
#0.61	applications crashing via segfault
#0.62	ignore segfaults on LTSP
#0.63	check [3]
#0.64	added help messages
#0.65	mawk-compatible value assignments
#0.66	errexit compatible status check
#0.67	added check for EXTx-fs error
#0.68	added check for "trap divide error"
#0.69	generalized "page allocation failure" check
#0.70	check [6] I/O errors
#0.71	additional help messages
#0.72	sample [7]: correctable memory errors on Intel E7520 under Xen
#0.73	sample [8]: nf_conntrack: table full, dropping packet
#0.74	compatibility with enhanced dmesg command
#0.75	compatibility with Xen 4.4
#0.76	sample [9]: xen:balloon:
#0.77	remove sample 9 - appears even when not enabled
#0.78	sample [10]: WARNING: CPU:
#0.79	sample [11]: Invalidating snapshot:
#0.80	split patterns into multiple lines
#0.81	sample [12]: general protection (WARN)
#0.82	change awk logic to always select a critical error (over a warning)
#0.83	set special case of sample [12] CRIT when target is libresolv [CVE-2015-7547]
#0.84	added IGNORE framework, integrated LTSP handler
#0.85	add sample [13]: extends beyond EOD, truncated
#0.86	add sample [14]: IPv6 duplicate address
#0.87	merge samples [6] and [15]
#0.88	introduce caution category
#0.89	add sample [16]: task blocked
#0.90	add sample [17] and [18] (Linux 4.19.0-6-amd64)
#0.91	generalize sample [15] for Linux 4.19
#0.92	separate samples [6] and [15] again
#0.93	suppress spurious terminal output
#0.94	add sample [19] (Linux 4.19.0-9-cloud-amd64)
#0.95	integrate with the FIX API
#0.96	add samples [20] and [21] (Linux 3.16.0-11-amd64)
#0.9.7	add sample [22]
#0.9.8	add sample [23]
#0.9.9	add samples [24] and [25]
#1.0.0	add sample [26]
#1.0.1	add sample [27]
#1.0.2	reduce severity on DRBD samples that only result in connection flaps
#1.0.3	add sample [28]
#1.1.0	support dmesg retrieval via systemd journalctl
#1.2.0	save boot dmesg buffer to log file under systemd
#1.2.1	add sample [29]
#1.2.2	save previous dmesg log under systemd
#1.2.3	add sample [30]
#1.3.0	retrieve messages from the kernel ring buffer on SOE5 (drop journalctl)
#1.3.1	add sample [31]
#1.3.2	add sample [32]
#1.3.3	source /etc/os-release from syscheck supervisor

# Plugin flavour: the name this file is run under, with any leading path
# stripped.  The selection logic below recognizes 'dmesg' and 'xen-dmesg'.
declare -r SYS_TYPE="${BASH_SOURCE[0]##*/}"

# Problem identifiers passed to the `problem` helper further down.
declare -ra PROBLEMS=(
	DMESG_ANOMALY
	PLUGIN_FAILURE
)
# Always true: the element count never expands to an empty string.
# NOTE(review): presumably only here to reference PROBLEMS -- confirm.
[[ -n ${#PROBLEMS[@]} ]]

# Select the pattern lists and the retrieval command for this plugin flavour.
#
# CRIT, WARN, CAUTION and IGNORE each hold one extended regular expression,
# written as '|'-separated alternatives with one alternative per line; the
# newlines are stripped before the expression is handed to awk.  Every list
# ends with a PLACEHOLDER alternative, so each real alternative can keep its
# trailing '|' and a list without real entries is still a non-empty regex.
# CMD is the command line whose output gets scanned; it is expanded unquoted
# later on, so it must not contain words that need quoting.
if [[ $SYS_TYPE = 'dmesg' ]]; then
	# Linux kernel
	declare -r CRIT='
fatal error; disabling device$|
^EXT[2-4]-fs error|
^(end_request|print_req_error): .+ error, dev .+, sector|
^Buffer I/O error on dev|
^nf_conntrack: table full|
general protection .+ in libresolv|
^print_req_error: I/O error, dev .+, sector|
: rejecting I/O to dead device|
error count since last fsck:|
IPVS: wrr: TCP .+ - no destination available|
IPv6: .+: IPv6 duplicate address .+ used by .+ detected|
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	declare -r WARN='
^Out of memory|
^[a-z]+: page allocation failure[.]|
^rpcbind|
^mptscsih: ioc[0-9]: attempting task abort|
: segfault at |
^WARNING: CPU: [0-9]+ PID: [0-9]+ at |
general protection |
IPv6 duplicate address |
task .+ blocked for more than [0-9]+ seconds|
swiotlb buffer is full|
warning: mounting fs with errors, running e2fsck is recommended|
block drbd[0-9]+: Online verify found [0-9]+ [0-9]+k block out of sync!|
megaraid_sas .+ Controller encountered a fatal error and was reset|
nf_conntrack: table full, dropping packet|
trap (divide error|invalid opcode) ip:[[:xdigit:]]+ sp:[[:xdigit:]]+ error:0 in|
CIFS: VFS: directory entry name would overflow frame end of buf|
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	declare -r CAUTION='
Non-Fatal Error DRAM Controller|
^Clocksource tsc unstable|
^device-mapper: snapshots: Invalidating snapshot:|
extends beyond EOD, truncated$|
^systemd-udevd.+ Invalid key/value pair,|
block drbd[0-9]+: Digest integrity check FAILED:|
block drbd[0-9]+: Digest mismatch, buffer modified by upper layers during write:|
clearing Tx timestamp hang|
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	# lines matching IGNORE are skipped before any other list is consulted
	declare -r IGNORE='
(gtk-gnash|kodi.bin).*: segfault at |
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'

	if [[ $VERSION_ID -ge 12 ]]; then
		# SOE5
		declare -r CMD='dmesg --notime --color=never'
		declare -r DMESG_LOG='/var/log/dmesg'
		# (Re)create the saved log when it is missing or older than /proc.
		# NOTE(review): /proc's timestamp presumably tracks boot time, so an
		# older log would stem from a previous boot -- confirm.
		if [[ ! -e $DMESG_LOG ]] || [[ '/proc' -nt $DMESG_LOG ]]; then
			# Rotate only when there is something to rotate: mv on a missing
			# file fails and, under errexit, would abort the plugin on the
			# very first run.
			if [[ -e $DMESG_LOG ]]; then
				mv --force -- "$DMESG_LOG" "$DMESG_LOG.0"
			fi
			dmesg --kernel > "$DMESG_LOG"
			echodebug "saved current dmesg buffer to $DMESG_LOG"
		fi
	elif [[ $VERSION_ID -ge 8 ]]; then
		# SOE4.2/4.3
		# new-style dmesg command requires fancy options turned off
		declare -r CMD='dmesg --notime --color=never'
	else
		# SOE4.0/4.1
		# plain dmesg
		declare -r CMD='dmesg'
	fi
elif [[ $SYS_TYPE = 'xen-dmesg' ]]; then
	# Xen hypervisor
	declare -r CRIT='
Platform timer appears to have unexpectedly wrapped|
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	# no warning/caution/ignore patterns for the hypervisor log yet
	declare -r WARN='
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	declare -r CAUTION='
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	declare -r IGNORE='
PLACEHOLDER:AWiSIx6CyIAtlvoodVzSR1wi0v9OQAmi'
	declare -r XEN_TOOLSTACK='xl'
	declare -r CMD="$XEN_TOOLSTACK dmesg"
fi

# Capture the ring buffer into a temporary file for the scan that follows.
# $CMD is deliberately expanded unquoted: it holds the command plus its options.
TMP_DMESG=$(mktemp)
if $CMD > "$TMP_DMESG"; then
	echodebug "extracted dmesg log via {$CMD}"
else
	# Remove the capture file first so this early exit does not leak it;
	# doing so before the report keeps the exit status that of `problem`.
	rm -f -- "$TMP_DMESG"
	fail critical "failed to run dmesg command {$CMD}"
	helpmsg 'general plugin failure'
	problem PLUGIN_FAILURE "$CMD"
	exit
fi

# Classify the captured log in a single awk pass.  The four pattern lists are
# passed in as awk variables with their embedded newlines removed.
# Per record: an IGNORE match is skipped; the first CRIT match ends the scan;
# a WARN match is remembered (a later WARN match replaces it); a CAUTION match
# is remembered only while nothing else has been recorded.
# Result: the remembered line with exit status 3 (CRIT), 2 (WARN) or
# 1 (CAUTION); otherwise the number of records read with exit status 0.
TMP_AWK=$(mktemp)
# `cmd || var=$?` records a non-zero status without tripping errexit
AWK_ERROR=0
awk -v DEBUG="$DEBUG" \
	-v CRIT="${CRIT//$'\n'/}" \
	-v WARN="${WARN//$'\n'/}" \
	-v CAUTION="${CAUTION//$'\n'/}" \
	-v IGNORE="${IGNORE//$'\n'/}" '
$0 ~ IGNORE { next }
$0 ~ CRIT { msg = $0; err = 3; exit }
$0 ~ WARN { msg = $0; err = 2; next }
$0 ~ CAUTION && !err { msg = $0; err = 1 }
END {
	if (!err) {
		print NR
	} else {
		print msg
		exit err
	}
}
' "$TMP_DMESG" > "$TMP_AWK" || AWK_ERROR=$?
rm "$TMP_DMESG"

# Turn the awk exit status into the plugin verdict:
#   0      nothing matched; the awk output is the number of records scanned
#   1..3   a caution/warning/critical line was found; the awk output is that line
#   other  unexpected status, reported as a plugin failure
case $AWK_ERROR in
	0)
		ring_buffer_entry_count=$(<"$TMP_AWK")
		entry_plural=$(plural_text $ring_buffer_entry_count 'entry' 'entries')
		echodebug "$ring_buffer_entry_count ring buffer $entry_plural scanned"
		ok
		;;
	1|2|3)
		if [[ $AWK_ERROR -eq 1 ]]; then
			LEVEL='caution'
		elif [[ $AWK_ERROR -eq 2 ]]; then
			LEVEL='warning'
		else
			LEVEL='critical'
		fi
		fail $LEVEL "$(<"$TMP_AWK")"
		helpmsg 'investigate the underlying problem'
		problem DMESG_ANOMALY $LEVEL
		;;
	*)
		fail warning "invalid return code $AWK_ERROR"
		helpmsg 'general plugin failure'
		problem PLUGIN_FAILURE $AWK_ERROR
		;;
esac

# the scan output is no longer needed
rm "$TMP_AWK"

# EXAMPLES:

# [0] http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=701744
# Mar  3 18:03:10 xen2 kernel: vif3.0: Frag is bigger than frame.
# Mar  3 18:03:10 xen2 kernel: vif3.0: fatal error; disabling device
# Mar  3 18:03:10 xen2 kernel: xenbr0: port 3(vif3.0) entering disabled state

# [1] http://xen.1045712.n5.nabble.com/Xen-4-TSC-problems-td3396848.html
# (XEN) Platform timer appears to have unexpectedly wrapped 10 or more times.
# Clocksource tsc unstable (delta = -2999660318481 ns)

# [2]
# perl[28600]: segfault at 95 ip 00007fec58b1aff5 sp 00007fffe8a46230 error 4 in libperl.so.5.10.0[7fec58a7e000+166000]

# [3]
# swapper: page allocation failure. order:0, mode:0x4020
# Pid: 0, comm: swapper Not tainted 2.6.32.60.2 #2

# [4]
# entropy_broker[7870] trap divide error ip:52e81f sp:7fffa709b6a0 error:0 in entropy_broker[400000+32d000]

# [5] (sparc64)
# awk: page allocation failure. order:4, mode:0x40d0

# [6]
# Buffer I/O error on device xvdb1, logical block 130080
# Buffer I/O error on dev sdc, logical block 55786, async page read

# [7]
# EDAC e752x: Non-Fatal Error DRAM Controller
# EDAC MC0: CE page 0xcdae9, offset 0xc0, grain 4096, syndrome 0x3f6, row 6, channel 0, label "": e752x CE

# [8]
# nf_conntrack: table full, dropping packet.

# [9]
# xen:balloon: Cannot add additional memory (-17)

# [10]
# WARNING: CPU: 0 PID: 2586 at /build/linux-xkTWug/linux-3.16.7-ckt11/drivers/gpu/drm/i915/intel_display.c:3324 intel_crtc_wait_for_pending_flips+0x165/0x170 [i915]()

# [11]
# device-mapper: snapshots: Invalidating snapshot: Unable to allocate exception.

# [12]
# traps: wget[5007] general protection ip:7f27de618c55 sp:7ffe6633b580 error:0 in libresolv-2.19.so[7f27de611000+14000]

# [13]
# sdb: p1 size 62325760 extends beyond EOD, truncated

# [14]
# eth0: IPv6 duplicate address 2604:1500:0:c8::26 detected!

# [15]
# end_request: critical medium error, dev sda, sector 965465
# end_request: I/O error, dev xvdb1, sector 1044456
# print_req_error: critical medium error, dev sdc, sector 55786

# [16]
# kernel: INFO: task find:10579 blocked for more than 120 seconds.

# [17]
# print_req_error: I/O error, dev sdc, sector 100746368

# [18]
# sd 8:0:0:0: rejecting I/O to dead device

# [19]
# ixgbevf 0000:00:01.2: swiotlb buffer is full (sz: 8192 bytes)

# [20]
# EXT4-fs (sdc): warning: mounting fs with errors, running e2fsck is recommended

# [21]
# EXT4-fs (sdc): error count since last fsck: 6

# [22]
# IPVS: wrr: TCP [2604:6400:0:1a::20]:143 - no destination available

# [23]
# IPv6: eth0: IPv6 duplicate address 2620:120:8000:1f4:216:3eff:fe91:4 used by 00:16:3e:91:97:47 detected!

# [24]
# kernel: block drbd1: Online verify found 17 4k block out of sync!

# [25]
# kernel: block drbd19: Digest integrity check FAILED: 2359280s +4096

# [26]
# systemd-udevd[411]: /etc/udev/rules.d/91-permissions.rules:93: Invalid key/value pair, starting at character 37 (',')

# [27]
# block drbd17: Digest mismatch, buffer modified by upper layers during write: 6731088s +4096

# [28]
# megaraid_sas 0000:82:00.0: 65155 (695766703s/0x0020/CRIT) - Controller encountered a fatal error and was reset

# [29]
# nf_conntrack: nf_conntrack: table full, dropping packet

# [30]
# kernel: traps: sudo[13436] trap invalid opcode ip:448030 sp:bfc4a80c error:0 in sudo[447000+2e000]

# [31]
# CIFS: VFS: directory entry name would overflow frame end of buf 00000000282d1ea0

# [32]
# igb 0000:01:00.0: clearing Tx timestamp hang
