decide_restart_or_failover() 调用 restart_service() 函数,以尝试在同一节点上重新启动数据服务。 该函数执行以下逻辑:
确定数据服务是否仍在 PMF 之下注册。如果服务仍处于注册状态,函数将执行以下操作:
为数据服务获取 Stop 方法名称和 Stop_timeout 值
使用 hatimerun 为数据服务启动 Stop 方法,传递 Stop_timeout 值
如果数据服务成功停止,则为数据服务获取 Start 方法名称和 Start_timeout 值
使用 hatimerun 为数据服务启动 Start 方法,传递 Start_timeout 值
如果数据服务已不在 PMF 之下注册,则表示数据服务已超过了 PMF 下允许的最大重试次数。系统将使用 GIVEOVER 选项调用 scha_control() 函数,以将数据服务故障转移到其他节点。
function restart_service { # To restart the data service, first verify that the # data service itself is still registered under PMF. pmfadm -q $PMF_TAG if [[ $? -eq 0 ]]; then # Since the TAG for the data service is still registered under # PMF, first stop the data service and start it back up again. # Obtain the Stop method name and the STOP_TIMEOUT value for # this resource. STOP_TIMEOUT=`scha_resource_get -O STOP_TIMEOUT \ -R $RESOURCE_NAME -G $RESOURCEGROUP_NAM? STOP_METHOD=`scha_resource_get -O STOP \ -R $RESOURCE_NAME -G $RESOURCEGROUP_NAM? hatimerun -t $STOP_TIMEOUT $RT_BASEDIR/$STOP_METHOD \ -R $RESOURCE_NAME -G $RESOURCEGROUP_NAME \ -T $RESOURCETYPE_NAME if [[ $? -ne 0 ]]; then logger-p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \ “${ARGV0} Stop method failed.” return 1 fi # Obtain the START method name and the START_TIMEOUT value for # this resource. START_TIMEOUT=`scha_resource_get -O START_TIMEOUT \ -R $RESOURCE_NAME -G $RESOURCEGROUP_NAM? START_METHOD=`scha_resource_get -O START \ -R $RESOURCE_NAME -G $RESOURCEGROUP_NAM? hatimerun -t $START_TIMEOUT $RT_BASEDIR/$START_METHOD \ -R $RESOURCE_NAME -G $RESOURCEGROUP_NAME \ -T $RESOURCETYPE_NAME if [[ $? -ne 0 ]]; then logger-p ${SYSLOG_FACILITY}.err -t [$SYSLOG_TAG] \ “${ARGV0} Start method failed.” return 1 fi else # The absence of the TAG for the dataservice # implies that the data service has already # exceeded the maximum retries allowed under PMF. # Therefore, do not attempt to restart the # data service again, but try to failover # to another node in the cluster. scha_control -O GIVEOVER -G $RESOURCEGROUP_NAME \ -R $RESOURCE_NAME fi return 0 }