Sun Cluster 3.0 U1 Data Services Developer's Guide

PROBE Program Code Listing

The PROBE program checks the availability of the data service using nslookup(1M) commands. The MONITOR_START callback method launches this program and the MONITOR_STOP callback method stops it.


Example B-5 dns_probe Program

#!/bin/ksh
#pragma ident	"@(#)dns_probe	1.1	00/04/19 SMI"
#
# Probe method for HA-DNS.
#
# This program checks the availability of the data service using nslookup,
# which queries the DNS server to look for the DNS server itself. If the
# server does not respond or if the query is replied to by some other server,
# then the probe concludes that there is some problem with the data service
# and fails the service over to another node in the cluster. Probing is done
# at a specific interval set by THOROUGH_PROBE_INTERVAL in the RTR file.
  
#pragma ident	"@(#)dns_probe	1.1	00/05/24 SMI"


###############################################################################
# Parse program arguments.
#
# Recognized options (each takes a value):
#   -R name   name of the DNS resource            -> RESOURCE_NAME
#   -G name   name of the resource group in which
#             the resource is configured          -> RESOURCEGROUP_NAME
#   -T name   name of the resource type           -> RESOURCETYPE_NAME
#
# Globals read: SYSLOG_FACILITY (only when reporting an error)
# Globals set:  RESOURCE_NAME, RESOURCEGROUP_NAME, RESOURCETYPE_NAME
# Exits:        with status 1, after logging an error, when getopts reports
#               anything other than the three options above.
#
function parse_args # [args ...]
{
        typeset opt

        while getopts 'R:G:T:' opt
        do
                case "$opt" in
                R)
                        # Name of the DNS resource.
                        RESOURCE_NAME=$OPTARG
                        ;;
                G)
                        # Name of the resource group in which the resource
                        # is configured.
                        RESOURCEGROUP_NAME=$OPTARG
                        ;;
                T)
                        # Name of the resource type.
                        RESOURCETYPE_NAME=$OPTARG
                        ;;
                *)
                        # The tag is quoted so that the shell never treats
                        # the [...] as a filename pattern.
                        logger -p "${SYSLOG_FACILITY}.err" \
                            -t "[$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME]" \
                            "ERROR: Option $OPTARG unknown"
                        exit 1
                        ;;
                esac
        done
}


###############################################################################
# restart_service ()
#
# This function tries to restart the data service by calling the STOP method
# followed by the START method of the data service. If the data service has
# already died and no tag is registered for it under PMF, this function
# instead requests that the resource group be given over to another node in
# the cluster.
#
# Globals read: PMF_TAG, RESOURCE_NAME, RESOURCEGROUP_NAME, RESOURCETYPE_NAME,
#               RT_BASEDIR, SYSLOG_FACILITY, SYSLOG_TAG, ARGV0
# Globals set:  STOP_TIMEOUT, STOP_METHOD, START_TIMEOUT, START_METHOD
# Returns:      1 if the STOP or START method fails (a message is logged);
#               0 otherwise, including after the GIVEOVER request, whose
#               exit status is not examined here.
#
function restart_service
{
        # To restart the data service, first verify that the
        # data service itself is still registered under PMF.
        if pmfadm -q "$PMF_TAG"; then
                # Since the TAG for the data service is still registered
                # under PMF, first stop the data service and then start it
                # back up again.

                # Obtain the STOP method name and the STOP_TIMEOUT value
                # for this resource.
                STOP_TIMEOUT=$(scha_resource_get -O STOP_TIMEOUT \
                        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
                STOP_METHOD=$(scha_resource_get -O STOP \
                        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

                if ! hatimerun -t "$STOP_TIMEOUT" "$RT_BASEDIR/$STOP_METHOD" \
                        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
                        -T "$RESOURCETYPE_NAME"
                then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                                "${ARGV0} Stop method failed."
                        return 1
                fi

                # Obtain the START method name and the START_TIMEOUT value
                # for this resource.
                START_TIMEOUT=$(scha_resource_get -O START_TIMEOUT \
                        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
                START_METHOD=$(scha_resource_get -O START \
                        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

                if ! hatimerun -t "$START_TIMEOUT" "$RT_BASEDIR/$START_METHOD" \
                        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
                        -T "$RESOURCETYPE_NAME"
                then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                                "${ARGV0} Start method failed."
                        return 1
                fi
        else
                # The absence of the TAG for the data service
                # implies that the data service has already
                # exceeded the maximum retries allowed under PMF.
                # Therefore, do not attempt to restart the
                # data service again, but try to fail over
                # to another node in the cluster.
                scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
                        -R "$RESOURCE_NAME"
        fi

        return 0
}




###############################################################################
# decide_restart_or_failover ()
#
# This function decides the action to be taken upon the failure of a probe:
# restart the data service locally or fail over to another node in the
# cluster.
#
# The decision uses a sliding time window of RETRY_INTERVAL (in the units
# returned by $RT_BASEDIR/gettime) and a counter of restarts made within it:
#   - first failure, or window elapsed : (re)start the window and restart
#   - inside window, retries >= RETRY_COUNT : reset the counter and give over
#   - inside window otherwise               : count the retry and restart
#
# Globals read:    RT_BASEDIR, RETRY_INTERVAL, RETRY_COUNT, RESOURCE_NAME,
#                  RESOURCEGROUP_NAME, SYSLOG_FACILITY, SYSLOG_TAG, ARGV0
# Globals updated: retries, start_time (also sets current_time, time_diff)
# Exits:           with status 1, after logging, when restart_service or the
#                  GIVEOVER request fails.
#
function decide_restart_or_failover
{
	# Check if this is the first restart attempt.
	if [ "$retries" -eq 0 ]; then
		# This is the first failure. Note the time of
		# this first attempt.
		start_time=$("$RT_BASEDIR"/gettime)
		retries=$((retries + 1))
		# Because this is the first failure, attempt to restart
		# the data service.
		if ! restart_service; then
			logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
			    "${ARGV0} Failed to restart data service."
			exit 1
		fi
	else
		# This is not the first failure.
		current_time=$("$RT_BASEDIR"/gettime)
		time_diff=$((current_time - start_time))
		if [ "$time_diff" -ge "$RETRY_INTERVAL" ]; then
			# This failure happened after the time window
			# elapsed, so reset the retries counter,
			# slide the window, and do a retry.
			retries=1
			start_time=$current_time
			# Because the previous failure occurred more than
			# Retry_interval ago, attempt to restart the data service.
			if ! restart_service; then
				logger -p "${SYSLOG_FACILITY}.err" \
				    -t "[$SYSLOG_TAG]" \
				    "${ARGV0} Failed to restart HA-DNS."
				exit 1
			fi
		elif [ "$retries" -ge "$RETRY_COUNT" ]; then
			# Still within the time window,
			# and the retry counter expired, so fail over.
			retries=0
			if ! scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
			    -R "$RESOURCE_NAME"
			then
				logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				    "${ARGV0} Failover attempt failed."
				exit 1
			fi
		else
			# Still within the time window,
			# and the retry counter has not expired,
			# so do another retry.
			retries=$((retries + 1))
			if ! restart_service; then
				logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
				    "${ARGV0} Failed to restart HA-DNS."
				exit 1
			fi
		fi
	fi
}


###############################################################################
# MAIN
###############################################################################

export PATH=/bin:/usr/bin:/usr/cluster/bin:/usr/sbin:/usr/proc/bin:$PATH

# Name of this program; it prefixes the syslog messages written below and
# by the helper functions.
ARGV0=$(basename "$0")

# Obtain the syslog facility to use to log messages.
SYSLOG_FACILITY=$(scha_cluster_get -O SYSLOG_FACILITY)

# Parse the arguments that have been passed to this method
parse_args "$@"

PMF_TAG=$RESOURCE_NAME.named
SYSLOG_TAG=$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME

# The interval at which probing is to be done is set in the system defined
# property THOROUGH_PROBE_INTERVAL. Obtain the value of this property with
# scha_resource_get
PROBE_INTERVAL=$(scha_resource_get -O THOROUGH_PROBE_INTERVAL \
        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

# Obtain the timeout value allowed for the probe, which is set in the
# PROBE_TIMEOUT extension property in the RTR file. The default timeout for
# nslookup is 1.5 minutes.
probe_timeout_info=$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
        -G "$RESOURCEGROUP_NAME" Probe_timeout)
# $probe_timeout_info is deliberately left unquoted: word splitting joins
# the reply into one line so that awk can pick its second field.
PROBE_TIMEOUT=$(echo $probe_timeout_info | awk '{print $2}')

# Identify the server on which DNS is serving by obtaining the value
# of the NETWORK_RESOURCES_USED property of the resource.
DNS_HOST=$(scha_resource_get -O NETWORK_RESOURCES_USED \
        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

# Get the retry count value from the system defined property Retry_count
RETRY_COUNT=$(scha_resource_get -O RETRY_COUNT -R "$RESOURCE_NAME" \
        -G "$RESOURCEGROUP_NAME")

# Get the retry interval value from the system defined property
# Retry_interval
RETRY_INTERVAL=$(scha_resource_get -O RETRY_INTERVAL \
        -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

# Obtain the full path for the gettime utility from the
# RT_basedir property of the resource type.
RT_BASEDIR=$(scha_resource_get -O RT_BASEDIR -R "$RESOURCE_NAME" \
        -G "$RESOURCEGROUP_NAME")

# The probe runs in an infinite loop, trying nslookup commands.
# Set up a temporary file for the nslookup replies.
# NOTE(review): the file name under /tmp is predictable; confirm that this
# is acceptable for the deployment before reusing this pattern.
DNSPROBEFILE=/tmp/.$RESOURCE_NAME.probe
probefail=0
retries=0

while :
do
        # The interval at which the probe needs to run is specified in the
        # property THOROUGH_PROBE_INTERVAL. Therefore, set the probe to
        # sleep for a duration of <THOROUGH_PROBE_INTERVAL>
        sleep "$PROBE_INTERVAL"

        # Every probe starts with a clean slate; otherwise a single failed
        # probe would make all later probes count as failures too.
        probefail=0

        # Run the probe, which queries the IP address on
        # which DNS is serving.
        hatimerun -t "$PROBE_TIMEOUT" /usr/sbin/nslookup "$DNS_HOST" "$DNS_HOST" \
                > "$DNSPROBEFILE" 2>&1

        retcode=$?
        if [ "$retcode" -ne 0 ]; then
                probefail=1
        fi

        # Make sure that the reply to nslookup command comes from the HA-DNS
        # server and not from another name server listed in the
        # /etc/resolv.conf file.
        if [ "$probefail" -eq 0 ]; then
                # Get the name of the server that replied to the nslookup
                # query, keeping only the part before the first dot.
                SERVER=$(awk '$1 == "Server:" { print $2 }' "$DNSPROBEFILE" |
                        awk -F. '{ print $1 }')
                if [ -z "$SERVER" ]; then
                        probefail=1
                elif [ "$SERVER" != "$DNS_HOST" ]; then
                        probefail=1
                fi
        fi

        # If the probefail variable is not set to 0, either the nslookup
        # command timed out or the reply to the query came from another
        # server (specified in the /etc/resolv.conf file). In either case,
        # the DNS server is not responding and the method calls
        # decide_restart_or_failover, which evaluates whether to restart the
        # data service or to fail it over to another node.
        if [ "$probefail" -ne 0 ]; then
                decide_restart_or_failover
        else
                logger -p "${SYSLOG_FACILITY}.info" -t "[$SYSLOG_TAG]" \
                    "${ARGV0} Probe for resource HA-DNS successful"
        fi
done