#!/bin/bash
# vim: cindent:shiftwidth=4:tabstop=4:smarttab:textwidth=100

#$title$ Network Time Protocol
#$check$ list of associations shows synchronization to multiple peers
#$ref$ http://doc.ntp.org/4.2.6p2/ntpq.html
#$author$ Rafal Rzeczkowski
#$version$ 0.9.2

# Strict mode: abort on the first unhandled command failure, on a failure in any
# pipeline stage, and on expansion of an unset variable.
set -o errexit -o pipefail -o nounset
# Uncomment to trace every command while debugging:
#set -o xtrace

# Register the plugin flags with the framework.
# NOTE(review): per the changelog below, "short" appears to select the short run interval and
# "settling" appears to mark the plugin as needing settling time after boot - confirm against
# the level_check API.
level_check short settling

#CHANGELOG
#0.50	initial
#0.51	accept single peer on single xen configuration
#0.60	accept working PTP_SLAVE as a substitute for NTP sync
#0.61	recognize PTP_PASSIVE Port_state
#0.62	verify NTP GNSS operation under PTP_MASTER status
#0.62	corrected exit logic when encountering error during associations scan
#0.63	no need to test if IPv6 is disabled - ntpq will use 127.0.0.1 if needed
#0.64	restyle according to https://kb.clearcable.ca/KB/ProgrammingStyleStandards
#0.65	use bash "here string" instead of temporary file
#0.66	increase allowed PPS offset to 100us (observed 50us on a synchronized system under load)
#0.67	allow NMEA source status of both "x" and "+"
#0.68	ignore PPS sources that are not "serial" based (eg. "ptp" type on APU2)
#0.69	allow NMEA source status of "-"
#0.70	run this plugin under short interval
#0.72	select IPv4 localhost target when system has IPv6 disabled
#0.73	downgrade failure report level to caution
#0.74	remove redundant API error message
#0.75	support linuxptp software instead of ptpd
#0.76	flag with "requires_settling_time" keyword for on_boot skip
#0.77	synchronize flag presentation with updated level_check API
#0.7.8	parse ntpsec peers output for GNSS systems
#0.7.9	improve robustness of peers output parser for tally character code
#0.7.10	skip redundant associations parse with GNSS hardware
#0.7.11	recognize PTP_PASSIVE Port_state (again?)
#0.8.0	move PTP tests to a separate plugin
#0.9.0	move NTPsec tests to a separate plugin
#0.9.1	ignore unsynchronized status when PTP daemon is running
#0.9.2	detect the absence of all peers explicitly

# Choose the host name handed to ntpq and store it in LOCALHOST.
#
# Globals:
#   DISABLE_IPV6 (written) - value of net.ipv6.conf.all.disable_ipv6, or 1 when the key
#                            cannot be read
#   LOCALHOST    (written) - 'localhost' when IPv6 is disabled, 'ip6-localhost' otherwise
select_ntpq_target() {
	# On a kernel running without IPv6 support (e.g. booted with ipv6.disable=1) the sysctl
	# key does not exist and sysctl fails. A bare assignment would then abort the plugin under
	# errexit, so a failed read is handled the same way as "IPv6 disabled".
	if ! DISABLE_IPV6=$(sysctl -n net.ipv6.conf.all.disable_ipv6 2>/dev/null); then
		echodebug 'cannot read net.ipv6.conf.all.disable_ipv6; assuming IPv6 is disabled'
		DISABLE_IPV6=1
	fi

	if [[ $DISABLE_IPV6 -gt 0 ]]; then
		LOCALHOST='localhost'
	else
		LOCALHOST='ip6-localhost'
	fi
	echodebug "selected $LOCALHOST as the query target"
}

select_ntpq_target

# Fetch the association table from the local daemon (-n: numeric addresses).
NTPQ_AS_OUTPUT=$(ntpq -n -c associations "$LOCALHOST")

# When the whole output is this one sentence there is no table to parse:
# report the missing peers and stop here.
case $NTPQ_AS_OUTPUT in
	"No association ID's returned")
		fail warning 'not connected to any peers'
		exit
		;;
esac

# Classify the association table held in NTPQ_AS_OUTPUT.
#
# The awk program reads the table as a small state machine (header line, then the
# "=====" ruler, then one row per association) and reports its verdict twice: as a
# one-line message on stdout (captured in NTPQ_AS_MSG) and as its exit status
# (captured in NTPQ_AS_CODE):
#   0  synchronized: either the only association is the system peer, or there is a
#      system peer plus at least one candidate
#   1  synchronized, but no candidate peers
#   2  no system peer
#   3  the table does not have the expected layout
# The "|| ..." form records a non-zero awk status without tripping errexit.
NTPQ_AS_CODE=0
NTPQ_AS_MSG=$(awk '
BEGIN { state = "header" }

# Blank lines are skipped in every state.
$0 == "" { next }

# First non-blank line: the 7th column title must be "condition".
state == "header" {
	state = "hline"
	if ($7 != "condition") {
		problem = "ntpq API failure: expected {condition}, received {" $7 "} header"
		exit
	}
	next
}

# Next non-blank line: the ruler consisting only of "=" characters.
state == "hline" && /^=+$/ {
	state = "list"
	next
}

# Every following line is one association; tally it by its condition column.
# "candidat" is also accepted (presumably a truncated column value - TODO confirm).
state == "list" {
	rows++
	if ($7 == "sys.peer")
		selected++
	else if ($7 == "candidate" || $7 == "candidat")
		spares++
	next
}

# Any other line means the layout is not what this parser expects.
{
	problem = "ntpq API failure: unknown parser state"
	exit
}

END {
	if (problem != "") {
		print problem
		exit 3
	}
	if (selected == 1 && rows == 1) {
		print "synchronized with a single peer"
		exit 0
	}
	if (selected > 0 && spares > 0) {
		print "synchronized, and with backup peers"
		exit 0
	}
	if (selected > 0) {
		print "synchronized, but without valid backup peers"
		exit 1
	}
	print "unsynchronized"
	exit 2
}
' <<< "$NTPQ_AS_OUTPUT") || NTPQ_AS_CODE=$?

# Turn the parser verdict (NTPQ_AS_CODE / NTPQ_AS_MSG) into the plugin result.

# Codes 3 and above: the association table could not be interpreted.
if (( NTPQ_AS_CODE >= 3 )); then
	fail warning "$NTPQ_AS_MSG"
	helpmsg 'internal error - contact the plugin maintainer'
	exit 1
fi

case $(( NTPQ_AS_CODE )) in
	0)
		# synchronized
		echodebug "$NTPQ_AS_MSG"
		ok
		;;
	1)
		# synchronized, but nothing to fall back on
		fail caution "$NTPQ_AS_MSG"
		helpmsg 'ensure the NTP daemon is running; check output of {ntpq -c peers} and {ntpq -c associations}'
		;;
	2)
		# unsynchronized: accepted when the ptp4l socket exists, reported otherwise
		declare -r PTP4L_UDS_ADDRESS='/run/ptp4l'
		if [[ ! -S $PTP4L_UDS_ADDRESS ]]; then
			fail caution "$NTPQ_AS_MSG"
			helpmsg 'ensure configured peers are providing NTP service'
		else
			echodebug 'unsynchronized, but PTP daemon is running'
			ok
		fi
		;;
esac

# Sample output of {ntpq -n -c associations} (the table layout parsed above):
#ind assid status  conf reach auth condition  last_event cnt
#===========================================================
#  1 25673  8011   yes    no  none    reject    mobilize  1
#  2 25674  8011   yes    no  none    reject    mobilize  1
#  3 25675  941a   yes   yes  none candidate    sys_peer  1
#  4 25676  8011   yes    no  none    reject    mobilize  1
#  5 25677  961a   yes   yes  none  sys.peer    sys_peer  1

# Sample output of {ntpq -n -c peers}, kept for reference:
#     remote           refid      st t when poll reach   delay   offset  jitter
#==============================================================================
# 208.80.96.96    .STEP.          16 u    - 1024    0    0.000    0.000   0.000
# 162.244.25.186  .STEP.          16 u    - 1024    0    0.000    0.000   0.000
#*2001:4900:1:392 209.51.161.238   2 u  584 1024  377    6.403    0.384   0.821
# 199.182.221.110 .STEP.          16 u    - 1024    0    0.000    0.000   0.000
#+2001:470:1d:34b 200.98.196.212   2 u  300 1024  377    6.447   -0.675   0.536
