PROBE Program Code Listing - Developing Data Services

Language:
`PROBE` Program Code Listing

The PROBE program checks the availability of the data service by using nslookup commands (see the nslookup(1) man page). The Monitor_start callback method starts this program, and the Monitor_stop callback method stops it.
Example 9 dns_probe Program
#!/bin/ksh
#pragma ident   "@(#)dns_probe   1.1   12/01/19"
#
# Probe method for HA-DNS.
#
# This program checks the availability of the data service using nslookup, which
# queries the DNS server to look for the DNS server itself. If the server
# does not respond or if the query is replied to by some other server,
# then the probe concludes that there is some problem with the data service
# and fails the service over to another node in the cluster. Probing is done
# at a specific interval set by THOROUGH_PROBE_INTERVAL in the RTR file.

#pragma ident   "@(#)dns_probe   1.1   00/05/24 SMI"

###############################################################################
# Parse program arguments.
function parse_args # [args ...]
{
typeset opt

while getopts `R:G:T:' opt
do
case "$opt" in
R)
# Name of the DNS resource.
RESOURCE_NAME=$OPTARG
;;
G)
# Name of the resource group in which the resource is
# configured.
RESOURCEGROUP_NAME=$OPTARG
;;
T)
# Name of the resource type.
RESOURCETYPE_NAME=$OPTARG
;;
*)
logger -p ${SYSLOG_FACILITY}.err \
-t [$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME] \
"ERROR: Option $OPTARG unknown"
exit 1
;;
esac
done
}

###############################################################################
# restart_service ()
#
# This function tries to restart the data service by calling the Stop method
# followed by the Start method of the dataservice. If the dataservice has
# already died and no tag is registered for the dataservice under PMF,
# then this function fails the service over to another node in the cluster.
#
function restart_service
{
# To restart the dataservice, first, verify that the
# dataservice itself is still registered under PMF.
pmfadm -q $PMF_TAG
if [[ $? -eq 0 ]]; then
# Since the TAG for the dataservice is still registered under
# PMF, first stop the dataservice and start it back up again.
# Obtain the Stop method name and the STOP_TIMEOUT value for
# this resource.
STOP_TIMEOUT=`scha_resource_get -O STOP_TIMEOUT \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME
STOP_METHOD=`scha_resource_get -O STOP \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME
/usr/cluster/bin/hatimerun -t $STOP_TIMEOUT $RT_BASEDIR/$STOP_METHOD \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME \
-T $RESOURCETYPE_NAME`

if [[ $? -ne 0 ]]; then
logger-p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
"${ARGV0} Stop method failed."
return 1
fi

# Obtain the Start method name and the START_TIMEOUT value for
# this resource.
START_TIMEOUT=`scha_resource_get -O START_TIMEOUT \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME
START_METHOD=`scha_resource_get -O START \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME
/usr/cluster/bin/hatimerun -t $START_TIMEOUT $RT_BASEDIR/$START_METHOD \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME \
-T $RESOURCETYPE_NAME`

if [[ $? -ne 0 ]]; then
logger-p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
"${ARGV0} Start method failed."
return 1
fi

else
# The absence of the TAG for the dataservice
# implies that the dataservice has already
# exceeded the maximum retries allowed under PMF.
# Therefore, do not attempt to restart the
# dataservice again, but try to failover
# to another node in the cluster.
scha_control -O GIVEOVER -G $RESOURCEGROUP_NAME \
-R $RESOURCE_NAME
fi

return 0
}

###############################################################################
# decide_restart_or_failover ()
#
# This function decides the action to be taken upon the failure of a probe:
# restart the data service locally or fail over to another node in the cluster.
#
function decide_restart_or_failover
{

# Check if this is the first restart attempt.
if [ $retries -eq 0 ]; then
# This is the first failure. Note the time of
# this first attempt.
start_time=`$RT_BASEDIR/gettim`
retries=`expr $retries + 1`
# Because this is the first failure, attempt to restart
# the data service.
restart_service
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
"${ARGV0} Failed to restart data service."
exit 1
fi
else
# This is not the first failure
current_time=`$RT_BASEDIR/gettime`
time_diff=`expr $current_time - $start_time`
if [ $time_diff -ge $RETRY_INTRVAL ]; then
# This failure happened after the time window
# elapsed, so reset the retries counter,
# slide the window, and do a retry.
retries=1
start_time=$current_time
# Because the previous failure occurred more than
# Retry_interval ago, attempt to restart the data service.
restart_service
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG \
"${ARGV0} Failed to restart HA-DNS."
exit 1
fi
elif [ $retries -ge $RETRY_COUNT ]; then
# Still within the time window,
# and the retry counter expired, so fail over.
retries=0
scha_control -O GIVEOVER -G $RESOURCEGROUP_NAME \
-R $RESOURCE_NAME
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
"${ARGV0} Failover attempt failed."
exit 1
fi
else
# Still within the time window,
# and the retry counter has not expired,
# so do another retry.
retries=`expr $retries + 1`
restart_service
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
"${ARGV0} Failed to restart HA-DNS."
exit 1
fi
fi
fi
}

###############################################################################
# MAIN
###############################################################################

export PATH=/bin:/usr/bin:/usr/cluster/bin:/usr/sbin:/usr/proc/bin:$PATH

# Obtain the syslog facility to use to log messages.
SYSLOG_FACILITY=`scha_cluster_get -O SYSLOG_FACILITY`

# Parse the arguments that have been passed to this method
parse_args "$@"

PMF_TAG=$RESOURCE_NAME.named
SYSLOG_TAG=$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME

# The interval at which probing is to be done is set in the system defined
# property THOROUGH_PROBE_INTERVAL. Obtain the value of this property with
# scha_resource_get
PROBE_INTERVAL=scha_resource_get -O THOROUGH_PROBE_INTERVAL \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME

# Obtain the timeout value allowed for the probe, which is set in the
# PROBE_TIMEOUT extension property in the RTR file. The default timeout for
# nslookup is 1.5 minutes.
probe_timeout_info=`scha_resource_get -O Extension -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAME Probe_timeout`
PROBE_TIMEOUT=`echo $probe_timeout_info | awk `{print $2}'`

# Identify the server on which DNS is serving by obtaining the value
# of the NETWORK_RESOURCES_USED property of the resource.
DNS_HOST=`scha_resource_get -O NETWORK_RESOURCES_USED -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAME`

# Get the retry count value from the system defined property Retry_count
RETRY_COUNT =`scha_resource_get -O RETRY_COUNT -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAME`

# Get the retry interval value from the system defined property
Retry_interval
RETRY_INTRVAL=scha_resource_get -O RETRY_INTRVAL -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAME`

# Obtain the full path for the gettime utility from the
# RT_basedir property of the resource type.
RT_BASEDIR=scha_resource_get -O RT_basedir -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAME`

# The probe runs in an infinite loop, trying nslookup commands.
# Set up a temporary file for the nslookup replies.
DNSPROBEFILE=/tmp/.$RESOURCE_NAME.probe
probefail=0
retries=0

while :
do
# The interval at which the probe needs to run is specified in the
# property THOROUGH_PROBE_INTERVAL. Therefore, set the probe to sleep for a
# duration of <THOROUGH_PROBE_INTERVAL>
sleep $PROBE_INTERVAL

# Run the probe, which queries the IP address on
# which DNS is serving.
/usr/cluster/bin/hatimerun -t $PROBE_TIMEOUT /usr/sbin/nslookup $DNS_HOST $DNS_HOST \
> $DNSPROBEFILE 2>&1

retcode=$?
if [ retcode -ne 0 ]; then
probefail=1
fi

# Make sure that the reply to nslookup command comes from the HA-DNS
# server and not from another name server listed in the
# /etc/resolv.conf file.
if [ $probefail -eq 0 ]; then
# Get the name of the server that replied to the nslookup query.
SERVER=` awk ` $1=="Server:" {print $2 }' \
$DNSPROBEFILE | awk -F. ` { print $1 } ` `
if [ -z "$SERVER" ];
then
probefail=1
else
if [ $SERVER != $DNS_HOST ]; then
probefail=1
fi
fi
fi

# If the probefail variable is not set to 0, either the nslookup command
# timed out or the reply to the query was came from another server
# (specified in the /etc/resolv.conf file). In either case, the DNS server is
# not responding and the method calls decide_restart_or_failover,
# which evaluates whether to restart the data service or to fail it over
# to another node.

if [ $probefail -ne 0 ]; then
decide_restart_or_failover
else
logger -p ${SYSLOG_FACILITY}.info -t [$SYSLOG_TAG] \
"${ARGV0} Probe for resource HA-DNS successful"
fi
done