The PROBE program checks the availability of the data service by using nslookup(1M) commands. The Monitor_start callback method launches this program, and the Monitor_stop callback method stops it.
#!/bin/ksh
#pragma ident   "@(#)dns_probe   1.1   00/04/19 SMI"
#
# Probe method for HA-DNS.
#
# This program checks the availability of the data service using nslookup,
# which queries the DNS server to look for the DNS server itself. If the
# server does not respond or if the query is replied to by some other server,
# then the probe concludes that there is some problem with the data service
# and fails the service over to another node in the cluster. Probing is done
# at a specific interval set by THOROUGH_PROBE_INTERVAL in the RTR file.
#
# Usage: dns_probe -R <resource> -G <resource group> -T <resource type>
#
#pragma ident   "@(#)dns_probe   1.1   00/05/24 SMI"

###############################################################################
# parse_args [args ...]
#
# Parse program arguments.
#
# Globals written: RESOURCE_NAME, RESOURCEGROUP_NAME, RESOURCETYPE_NAME
# Globals read:    SYSLOG_FACILITY
# Exits with status 1 (after logging) when an unknown option is seen.
#
function parse_args # [args ...]
{
        typeset opt

        while getopts 'R:G:T:' opt
        do
                case "$opt" in
                R)
                        # Name of the DNS resource.
                        RESOURCE_NAME=$OPTARG
                        ;;
                G)
                        # Name of the resource group in which the resource is
                        # configured.
                        RESOURCEGROUP_NAME=$OPTARG
                        ;;
                T)
                        # Name of the resource type.
                        RESOURCETYPE_NAME=$OPTARG
                        ;;
                *)
                        # The tag is quoted so that the shell does not treat
                        # the [...] as a file name pattern.
                        logger -p "${SYSLOG_FACILITY}.err" \
                            -t "[$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME]" \
                            "ERROR: Option $OPTARG unknown"
                        exit 1
                        ;;
                esac
        done
}

###############################################################################
# restart_service ()
#
# This function tries to restart the data service by calling the Stop method
# followed by the Start method of the dataservice. If the dataservice has
# already died and no tag is registered for the dataservice under PMF,
# then this function fails the service over to another node in the cluster.
#
# Globals read: PMF_TAG, RESOURCE_NAME, RESOURCEGROUP_NAME, RESOURCETYPE_NAME,
#               RT_BASEDIR, SYSLOG_FACILITY, SYSLOG_TAG, ARGV0
# Returns: 0 when both methods succeeded or a giveover was requested,
#          1 when the Stop or the Start method failed.
#
function restart_service
{
        typeset stop_timeout stop_method start_timeout start_method

        # To restart the dataservice, first, verify that the
        # dataservice itself is still registered under PMF.
        if pmfadm -q "$PMF_TAG"; then
                # Since the TAG for the dataservice is still registered under
                # PMF, first stop the dataservice and start it back up again.

                # Obtain the Stop method name and the STOP_TIMEOUT value for
                # this resource.
                stop_timeout=$(scha_resource_get -O STOP_TIMEOUT \
                    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
                stop_method=$(scha_resource_get -O STOP \
                    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
                hatimerun -t "$stop_timeout" "$RT_BASEDIR/$stop_method" \
                    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
                    -T "$RESOURCETYPE_NAME"
                if [[ $? -ne 0 ]]; then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                            "${ARGV0} Stop method failed."
                        return 1
                fi

                # Obtain the Start method name and the START_TIMEOUT value for
                # this resource.
                start_timeout=$(scha_resource_get -O START_TIMEOUT \
                    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
                start_method=$(scha_resource_get -O START \
                    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")
                hatimerun -t "$start_timeout" "$RT_BASEDIR/$start_method" \
                    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME" \
                    -T "$RESOURCETYPE_NAME"
                if [[ $? -ne 0 ]]; then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                            "${ARGV0} Start method failed."
                        return 1
                fi
        else
                # The absence of the TAG for the dataservice
                # implies that the dataservice has already
                # exceeded the maximum retries allowed under PMF.
                # Therefore, do not attempt to restart the
                # dataservice again, but try to failover
                # to another node in the cluster.
                scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
                    -R "$RESOURCE_NAME"
        fi

        return 0
}

###############################################################################
# decide_restart_or_failover ()
#
# This function decides the action to be taken upon the failure of a probe:
# restart the data service locally or fail over to another node in the cluster.
#
# Globals read:    RETRY_COUNT, RETRY_INTERVAL, RT_BASEDIR, RESOURCE_NAME,
#                  RESOURCEGROUP_NAME, SYSLOG_FACILITY, SYSLOG_TAG, ARGV0
# Globals written: retries (failures counted in the current window),
#                  start_time (gettime value at the start of the window)
# Exits with status 1 when a restart or a failover attempt fails.
#
function decide_restart_or_failover
{
        typeset current_time time_diff

        # Check if this is the first restart attempt.
        if [ "$retries" -eq 0 ]; then
                # This is the first failure. Note the time of
                # this first attempt.
                start_time=$("$RT_BASEDIR/gettime")
                retries=$((retries + 1))
                # Because this is the first failure, attempt to restart
                # the data service.
                if ! restart_service; then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                            "${ARGV0} Failed to restart data service."
                        exit 1
                fi
                return
        fi

        # This is not the first failure
        current_time=$("$RT_BASEDIR/gettime")
        time_diff=$((current_time - start_time))
        if [ "$time_diff" -ge "$RETRY_INTERVAL" ]; then
                # This failure happened after the time window
                # elapsed, so reset the retries counter,
                # slide the window, and do a retry.
                retries=1
                start_time=$current_time
                # Because the previous failure occurred more than
                # Retry_interval ago, attempt to restart the data service.
                if ! restart_service; then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                            "${ARGV0} Failed to restart HA-DNS."
                        exit 1
                fi
        elif [ "$retries" -ge "$RETRY_COUNT" ]; then
                # Still within the time window,
                # and the retry counter expired, so fail over.
                retries=0
                scha_control -O GIVEOVER -G "$RESOURCEGROUP_NAME" \
                    -R "$RESOURCE_NAME"
                if [ $? -ne 0 ]; then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                            "${ARGV0} Failover attempt failed."
                        exit 1
                fi
        else
                # Still within the time window,
                # and the retry counter has not expired,
                # so do another retry.
                retries=$((retries + 1))
                if ! restart_service; then
                        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
                            "${ARGV0} Failed to restart HA-DNS."
                        exit 1
                fi
        fi
}

###############################################################################
# MAIN
###############################################################################

export PATH=/bin:/usr/bin:/usr/cluster/bin:/usr/sbin:/usr/proc/bin:$PATH

# Program name used as the prefix of every log message. The original script
# referenced ARGV0 without ever setting it.
ARGV0=$(basename "$0")

# Obtain the syslog facility to use to log messages.
SYSLOG_FACILITY=$(scha_cluster_get -O SYSLOG_FACILITY)

# Parse the arguments that have been passed to this method
parse_args "$@"

PMF_TAG=$RESOURCE_NAME.named
SYSLOG_TAG=$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME

# The interval at which probing is to be done is set in the system defined
# property THOROUGH_PROBE_INTERVAL. Obtain the value of this property with
# scha_resource_get
PROBE_INTERVAL=$(scha_resource_get -O THOROUGH_PROBE_INTERVAL \
    -R "$RESOURCE_NAME" -G "$RESOURCEGROUP_NAME")

# Obtain the timeout value allowed for the probe, which is set in the
# PROBE_TIMEOUT extension property in the RTR file. The default timeout for
# nslookup is 1.5 minutes.
# The extension query prints the property type followed by its value, so the
# value is the second field.
probe_timeout_info=$(scha_resource_get -O Extension -R "$RESOURCE_NAME" \
    -G "$RESOURCEGROUP_NAME" Probe_timeout)
PROBE_TIMEOUT=$(echo $probe_timeout_info | awk '{print $2}')

# Identify the server on which DNS is serving by obtaining the value
# of the NETWORK_RESOURCES_USED property of the resource.
DNS_HOST=$(scha_resource_get -O NETWORK_RESOURCES_USED -R "$RESOURCE_NAME" \
    -G "$RESOURCEGROUP_NAME")

# Get the retry count value from the system defined property Retry_count
RETRY_COUNT=$(scha_resource_get -O RETRY_COUNT -R "$RESOURCE_NAME" \
    -G "$RESOURCEGROUP_NAME")

# Get the retry interval value from the system defined property Retry_interval
RETRY_INTERVAL=$(scha_resource_get -O RETRY_INTERVAL -R "$RESOURCE_NAME" \
    -G "$RESOURCEGROUP_NAME")

# Obtain the full path for the gettime utility from the
# RT_basedir property of the resource type.
RT_BASEDIR=$(scha_resource_get -O RT_BASEDIR -R "$RESOURCE_NAME" \
    -G "$RESOURCEGROUP_NAME")

# Without these values the loop below would spin without sleeping and probe
# nothing, so give up early instead of flooding syslog.
if [[ -z "$PROBE_INTERVAL" || -z "$PROBE_TIMEOUT" || -z "$DNS_HOST" ||
    -z "$RETRY_COUNT" || -z "$RETRY_INTERVAL" || -z "$RT_BASEDIR" ]]; then
        logger -p "${SYSLOG_FACILITY}.err" -t "[$SYSLOG_TAG]" \
            "${ARGV0} Unable to obtain the probe properties of the resource."
        exit 1
fi

# The probe runs in an infinite loop, trying nslookup commands.
# Set up a temporary file for the nslookup replies.
DNSPROBEFILE=/tmp/.$RESOURCE_NAME.probe
retries=0

while :
do
        # The interval at which the probe needs to run is specified in the
        # property THOROUGH_PROBE_INTERVAL. Therefore, set the probe to sleep
        # for a duration of <THOROUGH_PROBE_INTERVAL>
        sleep "$PROBE_INTERVAL"

        # Every iteration starts from a clean state; otherwise a single
        # failed probe would mark all later probes as failed as well.
        probefail=0

        # Run the probe, which queries the IP address on
        # which DNS is serving.
        hatimerun -t "$PROBE_TIMEOUT" /usr/sbin/nslookup "$DNS_HOST" "$DNS_HOST" \
            > "$DNSPROBEFILE" 2>&1

        retcode=$?
        if [ "$retcode" -ne 0 ]; then
                probefail=1
        fi

        # Make sure that the reply to nslookup command comes from the HA-DNS
        # server and not from another name server listed in the
        # /etc/resolv.conf file.
        if [ "$probefail" -eq 0 ]; then
                # Get the name of the server that replied to the nslookup
                # query, reduced to its first dot-separated component.
                SERVER=$(awk '$1 == "Server:" { print $2 }' "$DNSPROBEFILE" |
                    awk -F. '{ print $1 }')
                if [ -z "$SERVER" ]; then
                        probefail=1
                elif [ "$SERVER" != "$DNS_HOST" ]; then
                        probefail=1
                fi
        fi

        # If the probefail variable is not set to 0, either the nslookup
        # command timed out or the reply to the query came from another server
        # (specified in the /etc/resolv.conf file). In either case, the DNS
        # server is not responding and the method calls
        # decide_restart_or_failover, which evaluates whether to restart the
        # data service or to fail it over to another node.
        if [ "$probefail" -ne 0 ]; then
                decide_restart_or_failover
        else
                logger -p "${SYSLOG_FACILITY}.info" -t "[$SYSLOG_TAG]" \
                    "${ARGV0} Probe for resource HA-DNS successful"
        fi
done