| Skip Navigation Links | |
| Exit Print View | |
|
Oracle Solaris Cluster Data Services Developer's Guide Oracle Solaris Cluster 3.3 3/13 |
1. Overview of Resource Management
3. Resource Management API Reference
6. Data Service Development Library
8. Sample DSDL Resource Type Implementation
9. Oracle Solaris Cluster Agent Builder
12. Cluster Reconfiguration Notification Protocol
13. Security for Data Services
A. Sample Data Service Code Listings
Resource Type Registration File Listing
Monitor_start Method Code Listing
Monitor_stop Method Code Listing
Monitor_check Method Code Listing
B. DSDL Sample Resource Type Code Listings
C. Requirements for Non-Cluster Aware Applications
D. Document Type Definitions for the CRNP
The PROBE program checks the availability of the data service by using nslookup commands (see the nslookup(1M) man page). The Monitor_start callback method starts this program, and the Monitor_stop callback method stops it.
Example A-5 dns_probe Program
#!/bin/ksh
#pragma ident “@(#)dns_probe 1.1 00/04/19”
#
# Probe method for HA-DNS.
#
# This program checks the availability of the data service using nslookup, which
# queries the DNS server to look for the DNS server itself. If the server
# does not respond or if the query is replied to by some other server,
# then the probe concludes that there is some problem with the data service
# and fails the service over to another node in the cluster. Probing is done
# at a specific interval set by THOROUGH_PROBE_INTERVAL in the RTR file.
#pragma ident “@(#)dns_probe 1.1 00/05/24”
###############################################################################
# Parse program arguments.
function parse_args # [args ...]
{
typeset opt
while getopts `R:G:T:' opt
do
case “$opt” in
R)
# Name of the DNS resource.
RESOURCE_NAME=$OPTARG
;;
G)
# Name of the resource group in which the resource is
# configured.
RESOURCEGROUP_NAME=$OPTARG
;;
T)
# Name of the resource type.
RESOURCETYPE_NAME=$OPTARG
;;
*)
logger -p ${SYSLOG_FACILITY}.err \
-t [$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME] \
“ERROR: Option $OPTARG unknown”
exit 1
;;
esac
done
}
###############################################################################
# restart_service ()
#
# This function tries to restart the data service by calling the Stop method
# followed by the Start method of the dataservice. If the dataservice has
# already died and no tag is registered for the dataservice under PMF,
# then this function fails the service over to another node in the cluster.
#
function restart_service
{
# To restart the dataservice, first, verify that the
# dataservice itself is still registered under PMF.
pmfadm -q $PMF_TAG
if [[ $? -eq 0 ]]; then
# Since the TAG for the dataservice is still registered under
# PMF, first stop the dataservice and start it back up again.
# Obtain the Stop method name and the STOP_TIMEOUT value for
# this resource.
STOP_TIMEOUT=`scha_resource_get -O STOP_TIMEOUT \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAMÈ
STOP_METHOD=`scha_resource_get -O STOP \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAMÈ
hatimerun -t $STOP_TIMEOUT $RT_BASEDIR/$STOP_METHOD \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME \
-T $RESOURCETYPE_NAME
if [[ $? -ne 0 ]]; then
logger-p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
“${ARGV0} Stop method failed.”
return 1
fi
# Obtain the Start method name and the START_TIMEOUT value for
# this resource.
START_TIMEOUT=`scha_resource_get -O START_TIMEOUT \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAMÈ
START_METHOD=`scha_resource_get -O START \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAMÈ
hatimerun -t $START_TIMEOUT $RT_BASEDIR/$START_METHOD \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAME \
-T $RESOURCETYPE_NAME
if [[ $? -ne 0 ]]; then
logger-p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
“${ARGV0} Start method failed.”
return 1
fi
else
# The absence of the TAG for the dataservice
# implies that the dataservice has already
# exceeded the maximum retries allowed under PMF.
# Therefore, do not attempt to restart the
# dataservice again, but try to failover
# to another node in the cluster.
scha_control -O GIVEOVER -G $RESOURCEGROUP_NAME \
-R $RESOURCE_NAME
fi
return 0
}
###############################################################################
# decide_restart_or_failover ()
#
# This function decides the action to be taken upon the failure of a probe:
# restart the data service locally or fail over to another node in the cluster.
#
function decide_restart_or_failover
{
# Check if this is the first restart attempt.
if [ $retries -eq 0 ]; then
# This is the first failure. Note the time of
# this first attempt.
start_time=`$RT_BASEDIR/gettimè
retries=`expr $retries + 1`
# Because this is the first failure, attempt to restart
# the data service.
restart_service
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
“${ARGV0} Failed to restart data service.”
exit 1
fi
else
# This is not the first failure
current_time=`$RT_BASEDIR/gettimè
time_diff=`expr $current_time - $start_timè
if [ $time_diff -ge $RETRY_INTERVAL ]; then
# This failure happened after the time window
# elapsed, so reset the retries counter,
# slide the window, and do a retry.
retries=1
start_time=$current_time
# Because the previous failure occurred more than
# Retry_interval ago, attempt to restart the data service.
restart_service
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG \
“${ARGV0} Failed to restart HA-DNS.”
exit 1
fi
elif [ $retries -ge $RETRY_COUNT ]; then
# Still within the time window,
# and the retry counter expired, so fail over.
retries=0
scha_control -O GIVEOVER -G $RESOURCEGROUP_NAME \
-R $RESOURCE_NAME
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
“${ARGV0} Failover attempt failed.”
exit 1
fi
else
# Still within the time window,
# and the retry counter has not expired,
# so do another retry.
retries=`expr $retries + 1`
restart_service
if [ $? -ne 0 ]; then
logger -p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \
“${ARGV0} Failed to restart HA-DNS.”
exit 1
fi
fi
fi
}
###############################################################################
# MAIN
###############################################################################
export PATH=/bin:/usr/bin:/usr/cluster/bin:/usr/sbin:/usr/proc/bin:$PATH
# Obtain the syslog facility to use to log messages.
SYSLOG_FACILITY=`scha_cluster_get -O SYSLOG_FACILITY`
# Parse the arguments that have been passed to this method
parse_args “$@”
PMF_TAG=$RESOURCE_NAME.named
SYSLOG_TAG=$RESOURCETYPE_NAME,$RESOURCEGROUP_NAME,$RESOURCE_NAME
# The interval at which probing is to be done is set in the system defined
# property THOROUGH_PROBE_INTERVAL. Obtain the value of this property with
# scha_resource_get
PROBE_INTERVAL=scha_resource_get -O THOROUGH_PROBE_INTERVAL \
-R $RESOURCE_NAME -G $RESOURCEGROUP_NAMÈ
# Obtain the timeout value allowed for the probe, which is set in the
# PROBE_TIMEOUT extension property in the RTR file. The default timeout for
# nslookup is 1.5 minutes.
probe_timeout_info=`scha_resource_get -O Extension -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAME Probe_timeout`
PROBE_TIMEOUT=`echo $probe_timeout_info | awk `{print $2}'`
# Identify the server on which DNS is serving by obtaining the value
# of the NETWORK_RESOURCES_USED property of the resource.
DNS_HOST=`scha_resource_get -O NETWORK_RESOURCES_USED -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAMÈ
# Get the retry count value from the system defined property Retry_count
RETRY_COUNT =`scha_resource_get -O RETRY_COUNT -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAMÈ
# Get the retry interval value from the system defined property
Retry_interval
RETRY_INTERVAL=scha_resource_get -O RETRY_INTERVAL -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAMÈ
# Obtain the full path for the gettime utility from the
# RT_basedir property of the resource type.
RT_BASEDIR=scha_resource_get -O RT_basedir -R $RESOURCE_NAME \
-G $RESOURCEGROUP_NAMÈ
# The probe runs in an infinite loop, trying nslookup commands.
# Set up a temporary file for the nslookup replies.
DNSPROBEFILE=/tmp/.$RESOURCE_NAME.probe
probefail=0
retries=0
while :
do
# The interval at which the probe needs to run is specified in the
# property THOROUGH_PROBE_INTERVAL. Therefore, set the probe to sleep for a
# duration of <THOROUGH_PROBE_INTERVAL>
sleep $PROBE_INTERVAL
# Run the probe, which queries the IP address on
# which DNS is serving.
hatimerun -t $PROBE_TIMEOUT /usr/sbin/nslookup $DNS_HOST $DNS_HOST \
> $DNSPROBEFILE 2>&1
retcode=$?
if [ retcode -ne 0 ]; then
probefail=1
fi
# Make sure that the reply to nslookup command comes from the HA-DNS
# server and not from another name server listed in the
# /etc/resolv.conf file.
if [ $probefail -eq 0 ]; then
# Get the name of the server that replied to the nslookup query.
SERVER=` awk ` $1==”Server:” {print $2 }' \
$DNSPROBEFILE | awk -F. ` { print $1 } ` `
if [ -z “$SERVER” ];
then
probefail=1
else
if [ $SERVER != $DNS_HOST ]; then
probefail=1
fi
fi
fi
# If the probefail variable is not set to 0, either the nslookup command
# timed out or the reply to the query was came from another server
# (specified in the /etc/resolv.conf file). In either case, the DNS server is
# not responding and the method calls decide_restart_or_failover,
# which evaluates whether to restart the data service or to fail it over
# to another node.
if [ $probefail -ne 0 ]; then
decide_restart_or_failover
else
logger -p ${SYSLOG_FACILITY}.info -t [$SYSLOG_TAG] \
“${ARGV0} Probe for resource HA-DNS successful”
fi
done