#!/bin/bash
# vim: cindent:shiftwidth=4:tabstop=4:smarttab:textwidth=100

#$title$ Error Detection And Correction (EDAC)
#$check$ EDAC counters in sysfs for memory and PCI bus
#$ref$ edac-util(1)
#$author$ Rafal Rzeczkowski
#$version$ 0.57

# NOTE(review): level_check is not defined in this file; presumably the plugin framework uses it
# to restrict this check to the "short" run level -- confirm against the framework.
level_check short

#CHANGELOG
#0.50	initial
#0.51	use the correct variable to generate a CE failure report
#0.52	dispense with (faulty) calculations of CE and UE per csrow
#0.53	restyle according to https://kb.clearcable.ca/KB/ProgrammingStyleStandards
#0.54	ignore pci_nonparity_count incremented by 1 when setting check_pci_errors
#0.55	wait for error count to increment after enabling PCI checks
#0.56	skip memory controller scan if mc0 is not present
#0.57	skip CSROWs on memory controllers that are not in use (DIMM not installed)

# https://www.kernel.org/doc/Documentation/edac.txt
# https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-devices-edac

# Root of the EDAC tree in sysfs; every path used by this check is built from it.
declare -r SYSFS_BASE='/sys/devices/system/edac'

# Globs that match nothing must expand to nothing, so loops over absent entries run zero times.
shopt -s nullglob

# Without the EDAC directory there is nothing to inspect: report and stop.
if ! cd "$SYSFS_BASE"; then
	fail warning "missing sysfs directory $SYSFS_BASE"
	exit
fi
echodebug 'EDAC - Error Detection And Correction'

# Make sure the kernel is counting PCI parity errors.
#
# Must be called with the EDAC "pci" class directory as the working directory:
# check_pci_errors and pci_nonparity_count are read relative to it.
#
# If checking is already on, nothing happens. Otherwise the flag is written, the
# function waits for the counter to settle (switching the check on increments
# pci_nonparity_count by itself), and the settled count is passed to state_save
# as the baseline of spurious errors.
#
# Returns: 0 when checking was already on; the status of state_save after
#          enabling; 1 when check_pci_errors could not be written.
enable_pci() {
	local -r enabled_flag=1
	local -r settle_delay=5	# seconds to wait for the spurious increment to show up
	local current

	current=$(<check_pci_errors)
	[[ $current -ne $enabled_flag ]] || return 0

	echodebug 'enabling PCI error check...'
	if ! echo "$enabled_flag" > check_pci_errors; then
		# The check was not switched on, so there is nothing to wait for and the stored
		# baseline must not be overwritten.
		echodebug 'unable to enable PCI error check'
		return 1
	fi
	sleep "$settle_delay"
	echodebug "PCI error check activated with initial $(<pci_nonparity_count) spurious error(s)"
	state_save < pci_nonparity_count
}

# Report PCI parity errors counted by EDAC.
#
# Must be called with the EDAC "pci" class directory as the working directory,
# and with the globals SYSFS_BASE and CLASS naming that directory.
#
# The class-wide non-parity count is reduced by the baseline returned by
# state_load before it is judged; the parity count is used as is. Afterwards
# every pci<N> entry is checked on its own counters. The first non-zero result
# raises a warning through fail/helpmsg and terminates the script with exit.
scan_pci() {
	local npe npe_init pe
	local bus npe_count pe_count

	npe=$(<pci_nonparity_count)
	npe_init=$(state_load)
	npe_init=${npe_init:-0}	# needed for plugin upgrade
	npe=$(( npe - npe_init ))
	pe=$(<pci_parity_count)

	if [[ $npe -gt 0 || $pe -gt 0 ]]; then
		fail warning "$npe/$pe PCI parity errors"
		helpmsg 'motherboard/PSU failure?'
		exit
	fi
	echodebug "pci: NPE/PE=$npe/$pe"

	# The glob is expanded once, relative to the class directory we were called in.
	for bus in pci[0-9]*; do
		# Never judge a bus with counters read from some other directory.
		if ! cd "$SYSFS_BASE/$CLASS/$bus"; then
			echodebug "$bus: cannot enter directory, skipped"
			continue
		fi
		npe_count=$(<npe_count)
		pe_count=$(<pe_count)
		echodebug "$bus: NPE/PE=$npe_count/$pe_count"
		if [[ $npe_count -gt 0 || $pe_count -gt 0 ]]; then
			fail warning "$bus: $npe_count/$pe_count PCI parity errors"
			helpmsg 'motherboard/PSU failure?'
			exit
		fi
	done
}

# Report memory errors counted by EDAC for every memory controller.
#
# Must be called with the EDAC "mc" class directory as the working directory,
# and with the globals SYSFS_BASE and CLASS naming that directory.
#
# For each mc* entry the controller-level counters are read, then the csrow*
# and dimm* entries are listed through echodebug (informational only). The
# verdict is based solely on the controller-level counters: uncorrectable
# errors raise a critical failure, correctable errors a warning. The first
# failure terminates the script with exit.
scan_mc() {
	local controller controller_dir mc_name size_mb ue_count ce_count
	local csrow cs_size_mb cs_mem_type cs_edac_mode cs_ue_count cs_ce_count
	local channel channel_string
	local dimm dimm_mem_type dimm_size

	# The glob is expanded once, relative to the class directory we were called in.
	for controller in mc*; do
		controller_dir=$SYSFS_BASE/$CLASS/$controller
		# Never judge a controller with counters read from some other directory.
		if ! cd "$controller_dir"; then
			echodebug "$controller: cannot enter directory, skipped"
			continue
		fi
		mc_name=$(<mc_name)
		size_mb=$(<size_mb)
		ue_count=$(<ue_count)
		ce_count=$(<ce_count)
		echodebug "$controller: $mc_name (${size_mb}MB) UE/CE=$ue_count/$ce_count"

		# "Chip select" hierarchy
		for csrow in csrow*; do
			cd "$controller_dir/$csrow" || continue
			cs_size_mb=$(<size_mb)
			cs_mem_type=$(<mem_type)
			cs_edac_mode=$(<edac_mode)
			cs_ue_count=$(<ue_count)
			cs_ce_count=$(<ce_count)
			# Build "0+1+..." from the ch<N>_ce_count file names.
			channel_string=''
			for channel in ch*_ce_count; do
				channel=${channel#ch}
				channel=${channel%_ce_count}
				test -n "$channel_string" && channel_string+='+'
				channel_string+=$channel
			done
			echodebug "- $csrow: $cs_mem_type ${cs_size_mb}MB ECC=$cs_edac_mode channels=$channel_string UE/CE=$cs_ue_count/$cs_ce_count"
		done

		# info display only on integrated controllers
		# The dimm* glob has to be expanded from the controller directory again.
		if cd "$controller_dir"; then
			for dimm in dimm*; do
				cd "$controller_dir/$dimm" || continue
				dimm_mem_type=$(<dimm_mem_type)
				dimm_size=$(<size)
				echodebug "-- $dimm: $dimm_mem_type ${dimm_size}MB"
			done
		fi

		if [[ $ue_count -gt 0 ]]; then
			fail critical "$ue_count Uncorrectable Errors on memory controller $mc_name"
			helpmsg 'replace the faulty memory module immediately'
			exit
		elif [[ $ce_count -gt 0 ]]; then
			fail warning "$ce_count Correctable Errors on memory controller $mc_name"
			helpmsg 'replace the faulty memory module soon'
			exit
		fi
	done
}

# Walk the entries of the EDAC directory and hand each known class to its scanner.
# CLASS is deliberately global: scan_mc and scan_pci build their paths from it.
SCAN_COUNT=0	# number of classes that were actually scanned
for CLASS in *; do
	# Only directories are EDAC classes. If one cannot be entered, skip it so that no scanner
	# runs against the working directory left over from the previous iteration.
	[[ -d "$SYSFS_BASE/$CLASS" ]] || continue
	cd "$SYSFS_BASE/$CLASS" || continue
	case "$CLASS" in
		mc)
			test -d 'mc0' || continue # at least one entry needs to be present
			echodebug '[RAM controllers]'
			scan_mc
			SCAN_COUNT=$(( SCAN_COUNT + 1 ))
			;;
		pci)
			echodebug '[PCI busses]'
			enable_pci
			scan_pci
			SCAN_COUNT=$(( SCAN_COUNT + 1 ))
			;;
		power)
			# no useful data
			;;
		*)
			echodebug "unknown class {$CLASS}"
			;;
	esac
done

# The scanners exit on their own when they find errors; reaching this point means nothing
# was reported. With no class scanned at all the result is unknown rather than ok.
if [[ $SCAN_COUNT -gt 0 ]]; then
	ok
else
	unknown
fi

exit
