Sun Cluster 3.0 12/01 Data Services Developer's Guide

The svc_probe Function

The svc_probe function makes a simple socket connection to the specified port by calling scds_fm_tcp_connect. If the connection fails, svc_probe returns a value of 100, indicating a complete failure. If the connection succeeds but the disconnect fails, svc_probe returns a value of 50, indicating a partial failure. If the connection and disconnection both succeed, svc_probe uses the time remaining in the probe timeout to run the fsinfo command against the server; it returns a value of 50 if no time remains or if the command fails, and a value of 0, indicating success, otherwise.

The code for svc_probe is as follows.


Example 7-18

/*
 * svc_probe - probe the health of the XFS data service.
 *
 * Parameters:
 *	scds_handle	handle passed through to the scds_* calls
 *	hostname	host on which the service is expected to listen
 *	port		TCP port to connect to
 *	timeout		total time budget for the probe, in seconds
 *
 * Returns:
 *	0				connect, disconnect and fsinfo all
 *					succeeded
 *	SCDS_PROBE_COMPLETE_FAILURE/2	partial failure: the disconnect
 *					failed, the time budget ran out, or
 *					the fsinfo check failed
 *	SCDS_PROBE_COMPLETE_FAILURE	complete failure: the connect failed
 */
int
svc_probe(scds_handle_t scds_handle, char *hostname, int port, int timeout)
{
	int		rc;
	int		cmdlen;
	hrtime_t	t1, t2;
	int		sock;
	char		testcmd[2048];
	int		time_used, time_remaining;
	time_t		connect_timeout;

	/*
	 * Probe the data service by doing a socket connection to the port
	 * specified in the port_list property to the host that is
	 * serving the XFS data service. If the XFS service, which is
	 * configured to listen on the specified port, replies to the
	 * connection, then the probe is successful. Else we will wait for
	 * a time period set in the probe_timeout property before concluding
	 * that the probe failed.
	 */

	/*
	 * Use the SVC_CONNECT_TIMEOUT_PCT percentage of timeout
	 * to connect to the port.
	 */
	connect_timeout = (SVC_CONNECT_TIMEOUT_PCT * timeout)/100;

	/* gethrtime() is divided by 1E9 to get whole seconds. */
	t1 = (hrtime_t)(gethrtime()/1E9);

	/*
	 * The probe makes a connection to the specified hostname and port.
	 * The connection is limited to SVC_CONNECT_TIMEOUT_PCT percent of
	 * the actual probe_timeout.
	 */
	rc = scds_fm_tcp_connect(scds_handle, &sock, hostname, port,
	    connect_timeout);
	if (rc != SCHA_ERR_NOERR) {
		scds_syslog(LOG_ERR,
		    "Failed to connect to port <%d> of resource <%s>.",
		    port, scds_get_resource_name(scds_handle));
		/* this is a complete failure */
		return (SCDS_PROBE_COMPLETE_FAILURE);
	}

	t2 = (hrtime_t)(gethrtime()/1E9);

	/*
	 * Compute the actual time it took to connect. This should be less
	 * than or equal to connect_timeout, the time allocated to connect.
	 * If the connect uses all the time that is allocated for it,
	 * then the remaining value from the probe_timeout that is passed
	 * to this function will be used as disconnect timeout. Otherwise,
	 * the remaining time from the connect call will also be added
	 * to the disconnect timeout.
	 */
	time_used = (int)(t2 - t1);

	/*
	 * Use the remaining time (timeout - time_took_to_connect) to
	 * disconnect.
	 */
	time_remaining = timeout - time_used;

	/*
	 * If all the time is used up, use a small hardcoded timeout
	 * to still try to disconnect. This will avoid the fd leak.
	 */
	if (time_remaining <= 0) {
		/*
		 * This branch is taken when time_used >= timeout, so the
		 * budget reported is the full timeout and the overrun is
		 * time_used - timeout. Both arguments are ints, matching %d.
		 */
		scds_syslog_debug(DBG_LEVEL_LOW,
		    "svc_probe used entire timeout of "
		    "%d seconds during connect operation and exceeded the "
		    "timeout by %d seconds. Attempting disconnect with timeout"
		    " %d ",
		    timeout,
		    time_used - timeout,
		    SVC_DISCONNECT_TIMEOUT_SECONDS);

		time_remaining = SVC_DISCONNECT_TIMEOUT_SECONDS;
	}

	/*
	 * Return partial failure in case of disconnection failure.
	 * Reason: The connect call is successful, which means
	 * the application is alive. A disconnection failure
	 * could happen due to a hung application or heavy load.
	 * If it is the latter case, don't declare the application
	 * as dead by returning complete failure. Instead, declare
	 * it as partial failure. If this situation persists, the
	 * disconnect call will fail again and the application will be
	 * restarted.
	 */
	rc = scds_fm_tcp_disconnect(scds_handle, sock, time_remaining);
	if (rc != SCHA_ERR_NOERR) {
		scds_syslog(LOG_ERR,
		    "Failed to disconnect to port %d of resource %s.",
		    port, scds_get_resource_name(scds_handle));
		/* this is a partial failure */
		return (SCDS_PROBE_COMPLETE_FAILURE/2);
	}

	/* Recompute what is left of the overall budget since t1. */
	t2 = (hrtime_t)(gethrtime()/1E9);
	time_used = (int)(t2 - t1);
	time_remaining = timeout - time_used;

	/*
	 * If there is no time left, don't do the full test with
	 * fsinfo. Return SCDS_PROBE_COMPLETE_FAILURE/2
	 * instead. This will make sure that if this timeout
	 * persists, server will be restarted.
	 */
	if (time_remaining <= 0) {
		scds_syslog(LOG_ERR, "Probe timed out.");
		return (SCDS_PROBE_COMPLETE_FAILURE/2);
	}

	/*
	 * The connection and disconnection to port is successful.
	 * Run the fsinfo command to perform a full check of
	 * server health.
	 * Redirect stdout, otherwise the output from fsinfo
	 * ends up on the console.
	 *
	 * The command is built with snprintf so that an overlong hostname
	 * cannot overrun testcmd; a truncated command is never executed.
	 */
	cmdlen = snprintf(testcmd, sizeof (testcmd),
	    "/usr/openwin/bin/fsinfo -server %s:%d > /dev/null",
	    hostname, port);
	if (cmdlen < 0 || (size_t)cmdlen >= sizeof (testcmd)) {
		scds_syslog(LOG_ERR,
		    "Failed to build the server status command for "
		    "port %d of resource %s.",
		    port, scds_get_resource_name(scds_handle));
		return (SCDS_PROBE_COMPLETE_FAILURE/2);
	}

	scds_syslog_debug(DBG_LEVEL_HIGH,
	    "Checking the server status with %s.", testcmd);

	/*
	 * The check fails if scds_timerun itself reports an error or if
	 * the value it stores in rc is nonzero.
	 */
	if (scds_timerun(scds_handle, testcmd, time_remaining,
	    SIGKILL, &rc) != SCHA_ERR_NOERR || rc != 0) {
		scds_syslog(LOG_ERR,
		    "Failed to check server status with command <%s>",
		    testcmd);
		return (SCDS_PROBE_COMPLETE_FAILURE/2);
	}

	return (0);
}

When finished, svc_probe returns a success (0), partial failure (50), or complete failure (100) value. The xfnts_probe method passes this value to scds_fm_action.