#! /bin/bash
# vim: cindent:shiftwidth=4:tabstop=4:expandtab

# file-sync-cluster
# An alternative version of file-sync for multi-prov cluster NOMS deployments

# Troubleshooting aid: when the first positional argument is "debug", stdout
# and stderr of the whole run are duplicated (via tee, appending) into
# ${LOG_FILE}, and a banner marking the start of the run is emitted.
LOG_FILE="/var/log/file-sync-cluster.debug.log"
case "${1}" in
    debug)
        exec &> >( tee -a "$LOG_FILE" )
        echo "--------------------------------------------------------------"
        echo "Starting file-sync-cluster run at $(date)"
        echo "--------------------------------------------------------------"
        ;;
esac

# Strict mode plus command tracing. These are enabled only after the argument
# check above, which reads ${1} even when no argument was given.
set -o errexit -o pipefail -o nounset -o xtrace

#CHANGELOG
#0.50    initial based on existing file-sync
#0.60    support for DHCPv6 on provs
#0.70    Added rsync exclude for tempfiles created in /srv/tftp by cm-gen process at AC
#0.80    Changed inline exclude to exclude-from file for /srv/tftp syncs (Phase 1a)


# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# File that rsync_password_for searches for "user:password" entries.
SECRETS='/etc/rsyncd.secrets'

# rsync source host: noms.<dns domain>, unless overridden by the contents of
# /usr/local/etc/file-sync.host.
SRC_HOST="noms.$( hostname -d )"
if [[ -f /usr/local/etc/file-sync.host ]]; then
    SRC_HOST="$( cat /usr/local/etc/file-sync.host )"
fi
# Host that dhcp6_failover probes and pulls the DHCPv6 leases from.
PRIMARY_DHCP_HOST="prov1.$( hostname -d )"

DHCP_DIR='/etc/dhcp'
DHCP6_DIR='/etc/dhcp6'
LEASES_DIR='/var/lib/dhcp'

# Name handed to "syscheck restrict" / "syscheck release"; unique per process.
JOB_NAME="file-sync-$$"

TFTP_DIR='/srv/tftp'
# Holds the epoch timestamp of the previous run (used for time-skew detection).
RUNSTAMP='/run/file-sync.timestamp'
LOCK_BASE='/run/lock/file-sync'

LOCK_DHCP="${LOCK_BASE}.dhcp"

# Seconds: rsync I/O timeout, and how long to wait for the flock.
SYNC_TIMEOUT=90
LOCK_TIMEOUT=60
# Kept as a plain string on purpose: it is expanded unquoted so that it
# word-splits into the rsync command and its options.
RSYNC="rsync --delete --delete-after --archive --itemize-changes --timeout=${SYNC_TIMEOUT}"

# Derive cluster name and cluster type from the host name by stripping the
# "provN." prefix and the optional "cm." / "cpe." component.
HOSTNAME="$( hostname )"
DOMAIN="$( sed "s/prov[0-9].//" <<<"${HOSTNAME}" )"
CLUSTER="$( sed "s/prov[0-9].//; s/cm.//; s/cpe.//;" <<<"${HOSTNAME}" )"
CTYPE="$( sed "s/prov[0-9].//; s/.${CLUSTER}//;" <<<"${HOSTNAME}" )"
# Both sides are quoted so this is a literal string comparison; an unquoted
# right-hand side inside [[ ]] would be interpreted as a glob pattern.
if [[ "${CTYPE}" == "${CLUSTER}" ]]; then
    # The removal of /./${CLUSTER} in CTYPE didn't succeed, so that these two are identical,
    # which means that there is no sub "cluster type" of cm/cpe here. Call this "all".
    CTYPE="all"
fi
# Digit found in the short host name, or "0" when it contains none.
# NOTE(review): grep --only-matching prints every digit on its own line, so a
# short host name with more than one digit yields a multi-line value here.
PROV_ID="$( grep --extended-regexp --only-matching '[0-9]' <<<"$( hostname --short )" || echo "0" )"

#######################################
# Print the password stored for a user in the rsync secrets file.
# Globals:
#   SECRETS (read) - path of a colon-separated file, "user:password" per line
# Arguments:
#   $1 - user name to look up (compared against the first field)
# Outputs:
#   The second field of every line whose first field equals $1, on stdout.
#   Nothing is printed when the user is not present.
# Returns:
#   Exit status of awk.
#######################################
function rsync_password_for {
    local user_query="$1"
    # All expansions are quoted so user names and paths containing whitespace
    # or glob characters reach awk as single arguments.
    awk -F ':' -v USER="${user_query}" '$1 == USER { print $2 }' "${SECRETS}"
}

#######################################
# Read the DHCP failover state of this server from the dhcpd leases database.
# Globals:
#   LEASES_DIR      (read)    - directory containing dhcpd.leases
#   STATES          (written) - array: "<my state>:<age>" "<partner state>:<age>" [mclt]
#   STATE           (written) - this server's state, or "unknown" when the
#                               leases file holds no usable "my state" entry
#   STATE_LIFETIME  (written) - seconds since that state was entered, or -1
# Arguments:
#   None
# Returns:
#   0 when this server's failover state is "normal", 1 otherwise.
#######################################
function check_failover_state {
    local awk_output

    # State check from dhcpd syscheck
    # A failing awk (e.g. missing leases file) is tolerated here; it is handled
    # below through the "unknown" fallback.
    awk_output="$( TZ=UTC awk '
#failover peer "dhcp" state {
#  my state normal at 4 2014/07/24 16:38:16;
#  partner state normal at 4 2014/07/24 16:37:24;
#}
{
        sub(";","",$0);
        if ($1 == "failover" && $2 == "peer" && $4 == "state") {
                failover_stanza=1
                next;
        } else if ($1 == "}") {
                failover_stanza=""
                next;
        }

        if (failover_stanza) {
                if ($1 == "mclt") {
                        mclt=$2;
                }
                if ($2 == "state" && ($1 == "my" || $1 == "partner")) {
                        #dhcpd.leases(5)
                        #The date is specified in two ways, depending on the configuration value for the
                        #db-time-format parameter. If it was set to default, then the date fields appear
                        #as follows: weekday year/month/day hour:minute:second
                        weekday=$5
                        split($6,ymd,"/");
                        split($7,hms,":");
                        state_value[$1]=$3;
                        state_time[$1]=systime()-mktime(ymd[1] " " ymd[2] " " ymd[3] " " hms[1] " " hms[2] " " hms[3]);
                }
        }
}
END {
        print state_value["my"] ":" state_time["my"] "\t" state_value["partner"] ":" state_time["partner"] "\t" mclt;
}' "${LEASES_DIR}/dhcpd.leases" )" || true

    # Split on whitespace without subjecting the words to pathname expansion.
    STATES=()
    read -r -a STATES <<<"${awk_output}" || true
    if [[ ${#STATES[@]} -eq 0 ]]; then
        STATES=('unknown:-1')
    fi
    STATE="${STATES[0]%%:*}"
    STATE_LIFETIME="${STATES[0]##*:}"

    # The END block always prints the ":" separators, so a leases file without
    # a failover stanza produces ":" rather than empty output. Map that case to
    # the same "unknown" result as the empty-output fallback above.
    if [[ -z "${STATE}" ]]; then
        STATE='unknown'
        STATE_LIFETIME='-1'
    fi

    if [[ "${STATE}" == 'normal' ]]; then
        return 0
    else
        return 1
    fi
}

# Begin flock section
# Everything up to the closing ") 200>${LOCK_DHCP}" runs in one subshell whose
# file descriptor 200 is the lock file.
(

# Take the exclusive lock, giving up (and aborting via errexit) after
# ${LOCK_TIMEOUT} seconds.
flock --exclusive --wait ${LOCK_TIMEOUT} 200    # also used by dhcpd syscheck plugin

syscheck restrict ${JOB_NAME} $(( 2 * ${SYNC_TIMEOUT} ))

USER='prov'
PASSWORD_FILE=$( mktemp )
# The password file holds a copy of the rsync secret. Remove it on every exit
# path of this subshell, including aborts triggered by errexit; the explicit
# removals later in the script then simply find nothing left to do.
trap 'rm -f -- "${PASSWORD_FILE}"' EXIT
rsync_password_for "${USER}" > "${PASSWORD_FILE}"

# Expanded unquoted by the rsync calls so that it splits into three options.
DHCP_RSYNC_OPTIONS="--password-file=${PASSWORD_FILE} --no-owner --no-group"

#
# Phase 1a: TFTP sync
#
# Runs only when ${TFTP_DIR} exists and is writable. The listed directories and
# file patterns are excluded from the transfer.
if [[ -d ${TFTP_DIR} && -w ${TFTP_DIR} ]]; then
    TFTP_EXCLUDE_FILE=$( mktemp )
    cat > "${TFTP_EXCLUDE_FILE}" << 'EOF'
etc/
device-firmware/
termsys-firmware/
tempfile-*
*cm.txt
EOF
    # Capture the rsync status instead of letting errexit abort immediately,
    # so the temporary exclude file is removed on failure as well. The script
    # then stops with rsync's exit status, as it did before.
    tftp_sync_status=0
    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --exclude-from="${TFTP_EXCLUDE_FILE}" --no-itemize-changes \
        "${USER}@${SRC_HOST}::tftpboot/" "${TFTP_DIR}/" || tftp_sync_status=$?
    rm -f -- "${TFTP_EXCLUDE_FILE}"
    if [[ ${tftp_sync_status} -ne 0 ]]; then
        exit "${tftp_sync_status}"
    fi
fi

#
# Phase 1b: Firmware sync (fwprov-only)
#
# Runs only when ${TFTP_DIR}/device-firmware exists and is writable.
if [[ -d ${TFTP_DIR}/device-firmware && -w ${TFTP_DIR}/device-firmware ]]; then
    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --no-itemize-changes \
        "${USER}@${SRC_HOST}::tftpboot/device-firmware/" "${TFTP_DIR}/device-firmware/"
fi

#
# Phase 2: DHCPv4 sync
#
# Runs only when ${DHCP_DIR} exists. The "if" opened here is closed at the end
# of the Phase 2 restart decision further down.
if [[ -d ${DHCP_DIR} ]]; then
    # LOG collects the itemized change lines printed by the logged rsync runs;
    # its line count is used later to decide whether dhcpd must be restarted.
    LOG=$( mktemp )

    # synchronize main configuration without customer provisioning data
    # (the names below are left untouched on this host)
    EXCLUDE_FILE=$( mktemp )
    cat > ${EXCLUDE_FILE} << 'EOF'
dhclient*
dhcpd.conf
dhcpd6.conf
CableLabs
global
time-offset
failover
boot
subnet
subnet6
subnets/
server
modems/
*.local
*~
*.swp
*.bak
EOF

    # NOTE(review): if this rsync fails, errexit aborts before the exclude file
    # and LOG are removed.
    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --chmod='go-w' \
        --exclude-from=${EXCLUDE_FILE} ${USER}@${SRC_HOST}::dhcp/ ${DHCP_DIR}/ > ${LOG}
    rm ${EXCLUDE_FILE}

    # Synchronize the modems directory (changes are appended to LOG)
    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --chmod=Fugo=r \
        ${USER}@${SRC_HOST}::dhcp/modems/ ${DHCP_DIR}/modems/ >> ${LOG}

    # Synchronize the subnets directory without logging
    # We only care about changes here if they trigger /etc/dhcp/subnet to regenerate
    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --chmod=Fugo=r --delete-excluded --no-itemize-changes \
        ${USER}@${SRC_HOST}::dhcp/subnets/ ${DHCP_DIR}/subnets/

    # Regenerate the cluster-specific subnet file
    #
    # Two temporary files are produced:
    #   TMP_SUBNET_HEADER  - fixed warning banner plus optional subnet.header
    #   TMP_SUBNET_SUBNETS - contents of every include of this cluster whose
    #                        path ends in "${CTYPE}.conf"
    TMP_SUBNET_HEADER=$( mktemp )
    TMP_SUBNET_SUBNETS=$( mktemp )
    {
        echo "############"
        echo "### STOP ###"
        echo "############"
        echo "# This file is automatically generated by 'file-sync-cluster'"
        echo "# DO NOT EDIT THIS FILE OR IT WILL BE OVERWRITTEN IMMEDIATELY"
        echo "# All edits must be made in NOMS without exception"
        echo ""
        if [[ -f "${DHCP_DIR}/subnet.header" ]]; then
            cat "${DHCP_DIR}/subnet.header"
        fi
    } >> "${TMP_SUBNET_HEADER}"

    cluster_conf="${DHCP_DIR}/subnets/clusters/${CLUSTER}.conf"
    if [[ -f "${cluster_conf}" && $( wc -l < "${cluster_conf}" ) -gt 0 ]]; then
        # Read the quoted include paths line by line so that a path is never
        # word-split or glob-expanded by the shell.
        while IFS= read -r include; do
            [[ -n "${include}" ]] || continue
            # Handle differing DHCP_DIR for testing
            if [[ "${include}" == /etc/dhcp/* && "${include}" != "${DHCP_DIR}"/* ]]; then
                include="${DHCP_DIR}/${include#/etc/dhcp/}"
            fi
            if [[ "${include}" =~ .*${CTYPE}\.conf$ ]]; then
                cat "${include}"                                            >> "${TMP_SUBNET_SUBNETS}"
                echo                                                        >> "${TMP_SUBNET_SUBNETS}"
            fi
        done < <( grep 'include' "${cluster_conf}" | awk -F '"' '{ print $2 }' )
    fi

    # Concatenate the header and subnets together if there are subnets
    if [[ -s ${TMP_SUBNET_SUBNETS} ]]; then
        TMP_SUBNET=$( mktemp )
        cat ${TMP_SUBNET_HEADER} ${TMP_SUBNET_SUBNETS} > ${TMP_SUBNET}
        rm -f ${TMP_SUBNET_HEADER}
        rm -f ${TMP_SUBNET_SUBNETS}
    else
        # No subnets for this cluster/type: retire any existing subnet file and
        # end the run here.
        rm -f ${TMP_SUBNET_HEADER}
        rm -f ${TMP_SUBNET_SUBNETS}
        # Clean up a running DHCP process (e.g. if all CMTSes were removed from this cluster)
        if [[ -f ${DHCP_DIR}/subnet ]]; then
            # Each step is best-effort; a failure must not abort the cleanup.
            service isc-dhcp-server stop || true
            mv ${DHCP_DIR}/subnet ${DHCP_DIR}/subnet.store_$(date +%s) || true
            syscheck check dhcpd || true
        fi
        syscheck release ${JOB_NAME}
        # Before exiting remove left over files (RT#61859)
        if [ -f "$PASSWORD_FILE" ]; then
            rm "$PASSWORD_FILE"
        fi
        if [ -f "$LOG" ]; then
            rm "$LOG"
        fi
        # Leaves the flock subshell, so the DHCPv6 phase is not run either.
        exit
    fi

    # Install the generated file only when it differs from the current one.
    # cmp also fails when ${DHCP_DIR}/subnet does not exist yet; in that case
    # the --reference calls fail too and are ignored.
    SUBNET_CHANGE="n"
    if ! cmp --silent ${TMP_SUBNET} ${DHCP_DIR}/subnet; then
        chmod --reference ${DHCP_DIR}/subnet ${TMP_SUBNET} || true
        chown --reference ${DHCP_DIR}/subnet ${TMP_SUBNET} || true
        # Do it this way to avoid regenerating timestamps on the /etc/dhcp directory
        mv ${TMP_SUBNET} ${DHCP_DIR}/subnet
        SUBNET_CHANGE="y"
    else
        rm -f ${TMP_SUBNET}
    fi

    #######################################
    # Validate the DHCPv4 configuration and restart isc-dhcp-server, retrying
    # a failed restart up to five times. Progress is reported through logger.
    # errexit is switched off while the function runs and on again before it
    # returns.
    # Globals:
    #   PROV_ID (read) - number of this prov host; staggers the restart
    # Arguments:
    #   $1 - reason for the restart (logged)
    # Returns:
    #   0 after a successful restart or when validation fails (no restart is
    #   attempted in that case). Calls "exit 1" after five failed restarts.
    #######################################
    function dhcp_restart {
        set +o errexit
        local restart_reason="$1"
        local tag wait_secs count res_count
        local tstart tend ttot dhcp_restart_output dhcp_restart_ret
        tag="$( basename "$0" )"
        logger -p lpr.notice -t "${tag}" "${restart_reason}"

        # Check DHCP syntax or fail
        if ! dhcpd -t -cf /etc/dhcp/dhcpd.conf &>/dev/null; then
            logger -p lpr.error -t "${tag}" "Failed to validate DHCPv4 config! Not restarting DHCP."
            # Restore errexit on this early return as well.
            set -o errexit
            return 0
        fi

        # Wait for (${PROV_ID}-1)*5 seconds then begin checking for failover state
        logger -p lpr.notice -t "${tag}" "Waiting for DHCPv4 peer restart (max 30s)"
        wait_secs=$(( ( PROV_ID - 1 ) * 5 ))
        # PROV_ID is 0 when the host name contains no digit; never hand sleep
        # a negative duration.
        if (( wait_secs < 0 )); then
            wait_secs=0
        fi
        sleep "${wait_secs}"

        # Poll the failover state every 2 seconds; give up after 16 failed checks.
        count=0
        while ! check_failover_state; do
            sleep 2
            count=$(( count + 1 ))
            if [[ ${count} -gt 15 ]]; then
                break
            fi
        done

        res_count=0
        while true; do
            logger -p lpr.notice -t "${tag}" "Running DHCPv4 restart"
            tstart=$(date +%s)
            # fd 200 (the lock file) is pointed at /dev/null for the service
            # command so the started daemon does not inherit it.
            dhcp_restart_output=$( service isc-dhcp-server restart 2>&1 200>/dev/null )
            dhcp_restart_ret=$?
            tend=$(date +%s)
            ttot=$(( tend - tstart ))
            res_count=$(( res_count + 1 ))
            if [[ ${dhcp_restart_ret} -ne 0 ]]; then
                logger -p lpr.error -t "${tag}" "DHCPv4 restart failed (code ${dhcp_restart_ret}) in ${ttot}s"
                logger -p lpr.error -t "${tag}" "${dhcp_restart_output}"
                if [[ ${res_count} -ge 5 ]]; then
                    logger -p lpr.error -t "${tag}" "Failed to restart DHCPv4 5 times, aborting"
                    exit 1
                fi
                logger -p lpr.info -t "${tag}" "Retrying DHCPv4 restart in 5 seconds"
                sleep 5
            else
                logger -p lpr.notice -t "${tag}" "DHCPv4 restart succeeded in ${ttot}s"
                break
            fi
        done
        set -o errexit
    }

    # detect daylight saving time changes
    # and automatically restart DHCP to force time-offset update
    #
    # TIMEDIFF is the number of seconds between the previous run (as recorded
    # in ${RUNSTAMP}) and now; it is 0 when no previous timestamp exists.
    # NOTE(review): the comparison uses "date --utc +%s" values, i.e. it reacts
    # to a long gap between runs or a clock jump - confirm that this covers the
    # daylight-saving case described above.
    DATE_CUR=$( date --utc +%s )
    TIMEDIFF_MAX=1800
    if [[ -s ${RUNSTAMP} ]]; then
        # subsequent run (since reboot)
        DATE_LAST=$(<${RUNSTAMP})
        TIMEDIFF=$(( ${DATE_CUR} - ${DATE_LAST} ))
    else
        TIMEDIFF=0
    fi
    echo ${DATE_CUR} > ${RUNSTAMP}

    # Number of logged rsync lines that do not end in "./".
    # "|| true" keeps pipefail/errexit quiet when grep selects no line.
    CHANGE_COUNT=$( cat ${LOG} | grep --invert-match --extended-regexp '\./$' | wc --lines || true )

    # Restart dhcpd in the background for the first matching reason:
    # regenerated subnet file, synced changes, or time skew.
    if [[ -x /etc/init.d/isc-dhcp-server ]]; then
        if [[ ${SUBNET_CHANGE} == y ]]; then
            dhcp_restart "generated ${DHCP_DIR}/subnet file has changed" &
        elif [[ ${CHANGE_COUNT} -gt 0 ]]; then
            dhcp_restart "${CHANGE_COUNT} change(s) in ${DHCP_DIR} directory" &
        elif [[ ${TIMEDIFF} -gt ${TIMEDIFF_MAX} || ${TIMEDIFF} -lt 0 ]]; then
            dhcp_restart "time skew of ${TIMEDIFF} seconds" &
        fi
    fi

    rm ${LOG}
fi

#
# Phase 3 DHCPv6 sync
#
# Runs only when ${DHCP6_DIR} exists and contains a file named "enabled". The
# "if" opened here is closed at the end of the Phase 3 restart decision.
if [[ -d ${DHCP6_DIR} && -f ${DHCP6_DIR}/enabled ]]; then
    # LOG collects the itemized change lines printed by the logged rsync runs.
    LOG=$( mktemp )

    # synchronize main configuration without customer provisioning data
    # (the names below are left untouched on this host)
    EXCLUDE_FILE=$( mktemp )
    cat > ${EXCLUDE_FILE} << 'EOF'
dhclient*
dhcpd.conf
dhcpd6.conf
CableLabs
enabled
global
group
time-offset
failover
boot
subnet
subnet6
subnets/
modems/
modems
server
*.local
option.*
*~
*.swp
*.bak
EOF

    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --chmod='go-w' \
        --exclude-from=${EXCLUDE_FILE} ${USER}@${SRC_HOST}::dhcp/ ${DHCP6_DIR}/ > ${LOG}
    rm ${EXCLUDE_FILE}

    # Link the group file (group -> group6), unless a symlink is already there
    if [[ ! -L ${DHCP6_DIR}/group ]]; then
        ln -sf ${DHCP6_DIR}/group6 ${DHCP6_DIR}/group
    fi

    # Link the modems directory
    # When the DHCPv4 modems directory exists it is shared through a symlink;
    # otherwise the modems are synchronized into ${DHCP6_DIR} directly.
    if [[ -d ${DHCP_DIR}/modems ]]; then
        if [[ ! -L ${DHCP6_DIR}/modems ]]; then
            ln -sf ${DHCP_DIR}/modems ${DHCP6_DIR}/modems
        fi
    else
        ${RSYNC} ${DHCP_RSYNC_OPTIONS} --chmod=Fugo=r \
            ${USER}@${SRC_HOST}::dhcp/modems/ ${DHCP6_DIR}/modems/ >> ${LOG}
    fi

    # Synchronize the subnets directory without logging
    # We only care about changes here if they trigger the ${DHCP6_DIR}/subnet file to regenerate
    ${RSYNC} ${DHCP_RSYNC_OPTIONS} --chmod=Fugo=r --delete-excluded --no-itemize-changes \
        ${USER}@${SRC_HOST}::dhcp/subnets/ ${DHCP6_DIR}/subnets/

    # Ensure local links are present
    # Each listed ${DHCP_DIR} file gets a symlink at the same relative path
    # below ${DHCP6_DIR}.
    for linkfile in ${DHCP_DIR}/time-offset; do
        if [[ ! -L $( sed "s|${DHCP_DIR}|${DHCP6_DIR}|g" <<<"${linkfile}" ) ]]; then
            ln -sf ${linkfile} $( sed "s|${DHCP_DIR}|${DHCP6_DIR}|g" <<<"${linkfile}" )
        fi
    done

    # Regenerate the cluster-specific subnet file
    #
    # Two temporary files are produced:
    #   TMP_SUBNET_HEADER  - fixed warning banner plus optional subnet6.header
    #   TMP_SUBNET_SUBNETS - contents of every include of this cluster whose
    #                        path ends in "${CTYPE}6.conf"
    TMP_SUBNET_HEADER=$( mktemp )
    TMP_SUBNET_SUBNETS=$( mktemp )
    {
        echo "############"
        echo "### STOP ###"
        echo "############"
        echo "# This file is automatically generated by 'file-sync-cluster'"
        echo "# DO NOT EDIT THIS FILE OR IT WILL BE OVERWRITTEN IMMEDIATELY"
        echo "# All edits must be made in NOMS without exception"
        echo ""
        if [[ -f "${DHCP6_DIR}/subnet6.header" ]]; then
            cat "${DHCP6_DIR}/subnet6.header"
        fi
    } >> "${TMP_SUBNET_HEADER}"

    cluster6_conf="${DHCP6_DIR}/subnets/clusters6/${CLUSTER}.conf"
    if [[ -f "${cluster6_conf}" && $( wc -l < "${cluster6_conf}" ) -gt 0 ]]; then
        # Read the quoted include paths line by line so that a path is never
        # word-split or glob-expanded by the shell.
        while IFS= read -r include; do
            [[ -n "${include}" ]] || continue
            # Adjust include paths to dhcp6
            include="$( sed "s|/etc/dhcp/|${DHCP6_DIR}/|" <<<"${include}" )"
            if [[ "${include}" =~ .*${CTYPE}6\.conf$ ]]; then
                cat "${include}"                                            >> "${TMP_SUBNET_SUBNETS}"
                echo                                                        >> "${TMP_SUBNET_SUBNETS}"
            fi
        done < <( grep 'include' "${cluster6_conf}" | awk -F '"' '{ print $2 }' )
    fi

    # Concatenate the header and subnets together if there are subnets
    if [[ -s ${TMP_SUBNET_SUBNETS} ]]; then
        TMP_SUBNET=$( mktemp )
        cat ${TMP_SUBNET_HEADER} ${TMP_SUBNET_SUBNETS} > ${TMP_SUBNET}
        rm -f ${TMP_SUBNET_HEADER}
        rm -f ${TMP_SUBNET_SUBNETS}
    else
        # No DHCPv6 subnets for this cluster/type: retire any existing subnet
        # file and end the run here.
        rm -f ${TMP_SUBNET_HEADER}
        rm -f ${TMP_SUBNET_SUBNETS}
        # Clean up a running DHCP process (e.g. if all CMTSes were removed from this cluster)
        if [[ -f ${DHCP6_DIR}/subnet ]]; then
            # Each step is best-effort; a failure must not abort the cleanup.
            service isc-dhcp6-server stop || true
            mv ${DHCP6_DIR}/subnet ${DHCP6_DIR}/subnet.store_$(date +%s) || true
            syscheck check dhcpd || true
        fi
        syscheck release ${JOB_NAME}
        # Before exiting remove left over files (RT#61859)
        if [ -f "$PASSWORD_FILE" ]; then
            rm "$PASSWORD_FILE"
        fi
        if [ -f "$LOG" ]; then
            rm "$LOG"
        fi
        # NOTE(review): this exit does not wait for a DHCPv4 restart that may
        # still be running in the background.
        exit
    fi

    # Install the generated file only when it differs from the current one.
    # cmp also fails when ${DHCP6_DIR}/subnet does not exist; in that case the
    # --reference calls fail too and are ignored.
    SUBNET_CHANGE="n"
    if ! cmp --silent ${TMP_SUBNET} ${DHCP6_DIR}/subnet; then
        chmod --reference ${DHCP6_DIR}/subnet ${TMP_SUBNET} || true
        chown --reference ${DHCP6_DIR}/subnet ${TMP_SUBNET} || true
        # Do it this way to avoid regenerating timestamps on the ${DHCP6_DIR} directory
        mv ${TMP_SUBNET} ${DHCP6_DIR}/subnet
        SUBNET_CHANGE="y"
    else
        # Noop, we might need this later
        # (TMP_SUBNET is kept; dhcp6_failover moves it into place and the
        # Phase 3 cleanup removes whatever is left)
        true
    fi

    #######################################
    # Validate the DHCPv6 configuration and restart isc-dhcp6-server once.
    # Progress is reported through logger. errexit is switched off while the
    # function runs and on again before it returns.
    # Globals:
    #   LEASES_DIR (read) - directory in which dhcpd6.leases must exist
    # Arguments:
    #   $1 - reason for the restart (logged)
    # Returns:
    #   0; a failed validation or restart is only logged.
    #######################################
    function dhcp6_restart {
        set +o errexit
        local restart_reason="$1"
        local tag
        tag="$( basename "$0" )"
        logger -p lpr.notice -t "${tag}" "${restart_reason}"

        # Ensure a leases database exists (not autocreated)
        if [[ ! -f "${LEASES_DIR}/dhcpd6.leases" ]]; then
            touch "${LEASES_DIR}/dhcpd6.leases"
        fi

        # Check DHCP syntax or fail
        if ! dhcpd -t -6 -cf /etc/dhcp6/dhcpd.conf &>/dev/null; then
            logger -p lpr.error -t "${tag}" "Failed to validate DHCPv6 config! Not restarting DHCP."
            # Restore errexit on this early return as well.
            set -o errexit
            return 0
        fi

        # Wait for DHCPv6 to restart on peer
        logger -p lpr.notice -t "${tag}" "Waiting 5 seconds for DHCPv6 peer sync"
        sleep 5

        logger -p lpr.notice -t "${tag}" "Running DHCPv6 restart"
        # fd 200 (the lock file) is pointed at /dev/null for the service
        # command so the started daemon does not inherit it.
        if ! service isc-dhcp6-server restart >/dev/null 200>/dev/null; then
            logger -p lpr.error -t "${tag}" "Failed to restart DHCPv6!"
        fi
        set -o errexit
    }

    #######################################
    # DHCPv6 handling for hosts other than prov 1.
    # When ${PRIMARY_DHCP_HOST} answers both fping6 and a UDP probe of its
    # dhcpv6-server port, the local DHCPv6 server is stopped, the generated
    # subnet file is removed and the primary's dhcpd6.leases is copied here.
    # Otherwise the generated subnet file is installed (if still present), the
    # configuration is validated and the local DHCPv6 server is restarted.
    # errexit is switched off while the function runs and on again before it
    # returns.
    # Globals (read):
    #   PRIMARY_DHCP_HOST, DHCP6_DIR, LEASES_DIR, TMP_SUBNET, RSYNC,
    #   DHCP_RSYNC_OPTIONS, USER
    # Arguments:
    #   None
    # Returns:
    #   0; failures of the individual steps are not propagated.
    #######################################
    function dhcp6_failover {
        set +o errexit
        local tag
        tag="$( basename "$0" )"

        if fping6 "${PRIMARY_DHCP_HOST}" && nc -6 -vzu -w 1 "${PRIMARY_DHCP_HOST}" dhcpv6-server; then
            service isc-dhcp6-server stop >/dev/null 200>/dev/null
            rm -f -- "${DHCP6_DIR}/subnet"
            sleep 1
            ${RSYNC} ${DHCP_RSYNC_OPTIONS} --no-itemize-changes \
                "${USER}@${PRIMARY_DHCP_HOST}::leases/dhcpd6.leases" "${LEASES_DIR}/"
        else
            # The generated file may already have been moved into place by the
            # caller; only install it when it still exists.
            if [[ -f "${TMP_SUBNET}" ]]; then
                mv -- "${TMP_SUBNET}" "${DHCP6_DIR}/subnet"
            fi
            # Check DHCP syntax or fail
            if ! dhcpd -t -6 -cf /etc/dhcp6/dhcpd.conf &>/dev/null; then
                logger -p lpr.error -t "${tag}" "Failed to validate DHCPv6 config! Not starting failover DHCPv6."
                # Restore errexit on this early return as well.
                set -o errexit
                return 0
            fi

            service isc-dhcp6-server restart >/dev/null 200>/dev/null
        fi
        set -o errexit
    }

    # detect daylight saving time changes
    # and automatically restart DHCP to force time-offset update
    #
    # TIMEDIFF is the number of seconds between the timestamp recorded in
    # ${RUNSTAMP} and now; it is 0 when no timestamp exists.
    # NOTE(review): when the DHCPv4 phase ran in this same invocation it has
    # already rewritten ${RUNSTAMP} with the current time, so TIMEDIFF is then
    # (close to) 0 here and the time-skew branch below cannot fire. Confirm
    # whether DHCPv6 is meant to restart on time skew in that setup.
    DATE_CUR=$( date --utc +%s )
    TIMEDIFF_MAX=1800
    if [[ -s ${RUNSTAMP} ]]; then
        # subsequent run (since reboot)
        DATE_LAST=$(<${RUNSTAMP})
        TIMEDIFF=$(( ${DATE_CUR} - ${DATE_LAST} ))
    else
        TIMEDIFF=0
    fi
    echo ${DATE_CUR} > ${RUNSTAMP}

    # Number of logged rsync lines that do not end in "./".
    # "|| true" keeps pipefail/errexit quiet when grep selects no line.
    CHANGE_COUNT=$( cat ${LOG} | grep --invert-match --extended-regexp '\./$' | wc --lines || true )

    # Prov 1 restarts DHCPv6 in the background for the first matching reason;
    # every other prov runs the failover handling instead.
    if [[ -x /etc/init.d/isc-dhcp6-server ]]; then
        if [[ ${PROV_ID} -eq 1 ]]; then
            if [[ ${SUBNET_CHANGE} == y ]]; then
                dhcp6_restart "generated ${DHCP6_DIR}/subnet file has changed" &
            elif [[ ${CHANGE_COUNT} -gt 0 ]]; then
                dhcp6_restart "${CHANGE_COUNT} change(s) in ${DHCP6_DIR} directory" &
            elif [[ ${TIMEDIFF} -gt ${TIMEDIFF_MAX} || ${TIMEDIFF} -lt 0 ]]; then
                dhcp6_restart "time skew of ${TIMEDIFF} seconds" &
            fi
        else
            dhcp6_failover &
        fi
    fi

    # dhcp6_failover may still have to move ${TMP_SUBNET} into place. Let the
    # background jobs finish before the leftover file is removed, otherwise
    # the removal below races with that move.
    wait

    if [[ -f ${TMP_SUBNET} ]]; then
        rm ${TMP_SUBNET}
    fi

    rm ${LOG}
fi

# Let the background restart/failover jobs finish before cleaning up; the
# failover job still needs the password file for its rsync call.
wait

rm ${PASSWORD_FILE}

syscheck release ${JOB_NAME}

# End of the flock section: fd 200 of the subshell is the lock file.
) 200>${LOCK_DHCP}
