#!/bin/bash
# vim: cindent:shiftwidth=4:tabstop=4:smarttab:textwidth=100

#$title$ Intelligent Platform Management Interface
#$check$ suspicious entries in the System Event Log (SEL)
#$ref$ man ipmitool, /var/log/ipmi
#$author$ Rafal Rzeczkowski
#$version$ 0.55

# NOTE(review): level_check is not defined in this file; presumably the surrounding check
# framework uses it to restrict this check to runs at the "long" level -- confirm.
level_check long

#CHANGELOG
#0.50	initial
#0.51	restyle according to https://kb.clearcable.ca/KB/ProgrammingStyleStandards
#0.52	increase reporting verbosity
#0.53	detect Processor|IERR (Processor Internal Error)
#0.54	streamline message level priority filtering
#0.55	integrate with the FIX API

# Directory that holds the saved SEL snapshot and its dated archive copies.
declare -r LOG_DIR='/var/log/ipmi'
# Saved output of `ipmitool sel elist`; this is the file the check parses further down.
declare -r LOG="$LOG_DIR/sel.log"
# Human-readable name used as the prefix of the debug messages.
declare -r LOG_NAME='IPMI System Event Log'
# Problem identifiers of this check; SEL_ANOMALY is the one passed to `problem` on failure.
# NOTE(review): the array itself is never read in this file -- presumably the framework
# consumes it; confirm.
declare -r -a PROBLEMS=(SEL_ANOMALY)

# Refresh the saved SEL snapshot at $LOG from a fresh `ipmitool sel elist` dump.
# Outcomes: initial save, unchanged, grown (snapshot replaced), cleared (old snapshot archived
# first), or -- when ipmitool fails -- the previous snapshot is kept untouched.

# -p also creates missing parent directories and does not fail if the directory already exists.
if [[ ! -d "$LOG_DIR" ]]; then
	mkdir -p "$LOG_DIR"
fi

TMP=$(mktemp)
# Remember whether the SEL could actually be read: the empty or truncated dump left behind by a
# failed ipmitool run must not be mistaken for a cleared SEL further down.
if ipmitool sel elist > "$TMP"; then
	SEL_READ='ok'
else
	SEL_READ='failed'
	echodebug "$LOG_NAME could not be read by ipmitool"
fi
chmod 444 "$TMP"

if [[ ! -f "$LOG" ]]; then
	# Nothing saved yet: keep whatever was obtained so the rest of the check has a file to parse.
	mv "$TMP" "$LOG"
	echodebug "$LOG_NAME initial save"
elif [[ "$SEL_READ" != 'ok' ]]; then
	# Keep analysing the previous snapshot instead of archiving it and replacing it with a
	# bogus dump. -f: the temp file is read-only, plain rm may ask for confirmation.
	rm -f "$TMP"
	echodebug "$LOG_NAME previous copy kept"
elif cmp --silent "$TMP" "$LOG"; then
	rm -f "$TMP"
	echodebug "$LOG_NAME is unchanged"
elif [[ $(stat --format=%s "$TMP") -gt $(stat --format=%s "$LOG") ]]; then
	mv "$TMP" "$LOG"
	echodebug "$LOG_NAME has grown"
else
	# The new dump differs but is not larger than the saved one, which is taken to mean that
	# the SEL was cleared. Archive the old copy under today's date; if an archive with that
	# name already exists, use a to-the-second stamp so the earlier archive is not overwritten.
	SEL_ARCHIVE="$LOG.$(date --iso-8601)"
	if [[ -e "$SEL_ARCHIVE" ]]; then
		SEL_ARCHIVE="$LOG.$(date --iso-8601=seconds)"
	fi
	mv "$LOG" "$SEL_ARCHIVE"
	mv "$TMP" "$LOG"
	echodebug "$LOG_NAME has been previously cleared"
fi
LOG_SIZE=$(wc --lines < "$LOG")
echodebug "$LOG_NAME at $LOG has $LOG_SIZE entry(ies)"

# Most severe event registered so far and its message; both stay empty until an event arrives.
ERROR_MESSAGE=''
ERROR_LEVEL=''

# register_event LEVEL MSG
# Record MSG as the message to report if LEVEL outranks the level recorded so far.
# The first event is always recorded; after that 'caution' is replaced by any other level and
# 'warning' only by 'critical'. An event of equal or lower rank leaves the record untouched.
# Globals: ERROR_LEVEL, ERROR_MESSAGE (read and written)
function register_event () {
	local severity=$1
	local text=$2
	echodebug "register_event($severity,{$text})"

	# Each arm bails out when the incoming event must not replace the recorded one.
	case "$ERROR_LEVEL" in
		'')
			# nothing recorded yet: accept whatever arrives
			;;
		'caution')
			if [[ "$severity" = 'caution' ]]; then
				return 0
			fi
			;;
		'warning')
			if [[ "$severity" != 'critical' ]]; then
				return 0
			fi
			;;
		*)
			# any other recorded level is never replaced
			return 0
			;;
	esac

	ERROR_LEVEL=$severity
	ERROR_MESSAGE="$text"
}

# classify_sel_event SOURCE STATUS MSG
# Map one asserted SEL entry to a severity and pass it to register_event.
# Arguments: $1 - sensor type and name (4th column of the SEL listing)
#            $2 - event description (5th column), may be empty
#            $3 - trailing reading/threshold text (7th column), may be empty
# Entries that match none of the rules below are ignored.
function classify_sel_event () {
	local SOURCE=$1
	local STATUS=$2
	local MSG=$3

	case "$SOURCE" in
		'Physical Security Intrusion')
			register_event warning "$SOURCE: $STATUS"
			;;
		Temperature*|Current*|Voltage*|Fan*)
			case "$STATUS" in
				*Non-critical*)
					register_event warning "$SOURCE: $MSG"
					;;
				# Non-recoverable is the most severe threshold state and must not go unreported.
				*Critical*|*Non-recoverable*)
					register_event critical "$SOURCE: $MSG"
					;;
			esac
			;;
		Processor*)
			case "$STATUS" in
				'Limit Exceeded'|'IERR')
					register_event warning "$SOURCE: $STATUS"
					;;
			esac
			;;
		Platform*)
			case "$STATUS" in
				'Predictive Failure Asserted')
					register_event warning "$SOURCE: $STATUS"
					;;
			esac
			;;
		Power\ Supply*)
			case "$STATUS" in
				*\ [Ll]ost|'Predictive Failure Asserted'|*\ going\ *)
					register_event warning "$SOURCE: $STATUS"
					;;
			esac
			;;
		Memory*)
			case "$STATUS" in
				# some entries carry no description at all; report the sensor alone
				'')
					register_event warning "$SOURCE"
					;;
				*Non-critical*|*Disabled*)
					register_event warning "$SOURCE: $STATUS"
					;;
				*Critical*|*Non-recoverable*|*Uncorrectable*)
					register_event critical "$SOURCE: $STATUS"
					;;
			esac
			;;
		Drive*)
			case "$STATUS" in
				'Drive Fault')
					register_event warning "$SOURCE: $STATUS"
					;;
			esac
			;;
		'System Boot'*)
			#register_event caution "$SOURCE: $STATUS"
			;;

		'System Event SEL_FULLNESS')
			register_event warning "$SOURCE: $MSG"
			;;
	esac
}

# Walk the saved SEL and classify every entry whose state column is exactly 'Asserted'.
# sed strips the ' | ' column padding so the fields can be split on a bare '|'.
# tac feeds the log in reverse order, so among events of equal severity the one nearest the end
# of the log is the one register_event keeps.
# read -r keeps any backslash in an entry literal.
while IFS='|' read -r ID DATE TIME SOURCE STATUS STATE MSG; do
	if [[ "$STATE" != 'Asserted' ]]; then
		continue
	fi
	classify_sel_event "$SOURCE" "$STATUS" "$MSG"
done < <( tac "$LOG" | sed 's/ | /|/g' )

# Report the outcome: ok when no event was registered, otherwise fail at the recorded level
# with the recorded message, add a hint, and pass SEL_ANOMALY and the log path to `problem`.
if [[ -z "$ERROR_LEVEL" ]]; then
	ok
else
	fail "$ERROR_LEVEL" "$ERROR_MESSAGE"
	helpmsg 'examine/confirm the hardware fault in the LOM web GUI'
	problem SEL_ANOMALY "$LOG"
fi

# [EXAMPLES]
#  2 | 04/12/2016 | 15:30:34 | Physical Security Intrusion | General Chassis intrusion () | Asserted
#  3 | 04/12/2016 | 15:30:39 | Physical Security Intrusion | General Chassis intrusion () | Deasserted
# 14 | 04/13/2016 | 15:00:27 | Temperature Inlet Temp | Upper Non-critical going high | Asserted | Reading 42 > Threshold 42 degrees C
# 15 | 04/14/2016 | 08:47:52 | Temperature Inlet Temp | Upper Critical going high | Asserted | Reading 47 > Threshold 47 degrees C
#220 | 05/10/2017 | 03:03:05 | Memory Mem ECC Warning | Transition to Non-critical from OK | Asserted
#221 | 05/10/2017 | 03:03:58 | Memory Mem ECC Warning | Transition to Critical from less severe | Asserted
#1da | 05/06/2016 | 12:35:08 | Power Supply PS Redundancy | Redundancy Lost | Asserted
#1db | 05/06/2016 | 12:35:12 | Power Supply Status | Power Supply AC lost | Asserted
# 46 | 02/05/2016 | 00:56:10 | Power Supply PS_RDNDNT_MODE | Redundancy Lost | Asserted
# 47 | 02/05/2016 | 00:56:11 | Power Supply PSU1_STATUS | Power Supply AC lost | Asserted
#5c2 | 04/17/2017 | 03:15:50 | Memory DDR4_P2_G1_ECC |  | Asserted
#20a | 08/31/2012 | 02:54:32 | Current PSU1_IOUT | Upper Critical going high | Asserted | Reading 199 > Threshold 58 Amps
#113 | 06/16/2011 | 18:57:16 | Voltage P1V5_DDR3_CPU2 | Lower Critical going low  | Asserted | Reading 1.35 < Threshold 1.43 Volts
#131 | 06/03/2017 | 05:35:56 | Fan F0/TACH | Lower Non-critical going low  | Asserted | Reading 480 < Threshold 1920 RPM
#968 | 08/09/2013 | 22:05:07 | System Event SEL_FULLNESS | Upper Critical going high | Asserted | Reading 80 > Threshold 80 unspecified
#968 | 01/16/2017 | 09:28:38 | System Event SEL_FULLNESS | Upper Non-critical going high | Asserted | Reading 80 > Threshold 80 unspecified
#1ae | 05/23/2013 | 13:50:34 | Processor IRQ_P2_RDIM_EVNT | Limit Exceeded | Asserted
# 22 | 10/20/2016 | 20:52:22 | Power Supply PMBUS_ALERT | Predictive Failure Asserted | Asserted
#3f9 | 05/01/2015 | 05:26:01 | Power Supply PSU2_PIN | Upper Critical going high | Deasserted | Reading 36 > Threshold 680 Watts
#3fa | 05/01/2015 | 05:26:01 | Power Supply PSU2_PIN | Upper Non-critical going high | Deasserted | Reading 36 > Threshold 652 Watts
#13b | 01/01/2009 | 00:05:58 | Memory #0x02 | Uncorrectable ECC | Asserted
#104 | 08/20/2012 | 14:30:12 | Memory #0x12 | Device Disabled | Asserted
# 9f | 08/15/2020 | 12:52:29 | Processor | IERR | Asserted
