[Pacemaker] Pacemaker on system with disk failure

Carsten Otto carsten.otto at andrena.de
Thu Sep 25 10:39:06 EDT 2014


Dear John,

On Thu, Sep 25, 2014 at 10:03:27AM -0400, John Lauro wrote:
> One of the reasons I like ksh is that true, echo, and sleep (among
> many others) are all builtin, so you don't need those commands on the
> filesystem, so the script is less likely to fail if the filesystem
> fails...  that said you probably don't have ksh installed by default.

Thanks for the hint! I just wrote a simple watchdog resource agent and
the corresponding shell script which successfully reboots a server when
the disk fails.

I provided my solution in the attachment.

Put crude-watchdog.sh in /root/, and put crude-watchdog in
/usr/lib/ocf/resource.d/heartbeat/.

In my two-node cluster I used these commands to run this watchdog on
both machines:
pcs resource create WATCHDOG ocf:heartbeat:crude-watchdog
pcs resource clone WATCHDOG

Best regards,
Carsten
-- 
andrena objects ag
Büro Frankfurt
Clemensstr. 8
60487 Frankfurt

Tel: +49 (0) 69 977 860 38
Fax: +49 (0) 69 977 860 39
http://www.andrena.de

Vorstand: Hagen Buchwald, Matthias Grund, Dr. Dieter Kuhn
Aufsichtsratsvorsitzender: Rolf Hetzelberger

Sitz der Gesellschaft: Karlsruhe
Amtsgericht Mannheim, HRB 109694
USt-IdNr. DE174314824

Bitte beachten Sie auch unsere anstehenden Veranstaltungen:
http://www.andrena.de/events
-------------- next part --------------
A non-text attachment was scrubbed...
Name: crude-watchdog.sh
Type: application/x-sh
Size: 189 bytes
Desc: not available
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20140925/b9ce7e6f/attachment-0003.sh>
-------------- next part --------------
#!/bin/sh
#
# crude-watchdog: OCF resource agent that keeps the helper script named in
# $SCRIPT running in the background. Per this agent's own metadata, the
# purpose is to reboot the system if the root file system stops working.

# Default OCF_FUNCTIONS_DIR to ${OCF_ROOT}/lib/heartbeat unless the caller
# has already set it, then source the shared OCF shell helpers from there.
# NOTE(review): ocf_log and the OCF_* return codes used below are presumably
# defined by ocf-shellfuncs -- it is not visible from this file.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# Path of the watchdog helper script that the start action launches and the
# validate-all action checks for executability.
SCRIPT=/root/crude-watchdog.sh

# Write the agent's OCF metadata document (XML) to stdout.
# Each printf argument is emitted as one line; '' produces a blank line.
meta_data() {
	printf '%s\n' \
		'<?xml version="1.0"?>' \
		'<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">' \
		'<resource-agent name="crude-watchdog" version="1.0">' \
		'<version>1.0</version>' \
		'' \
		'<longdesc lang="en">' \
		'This agent reboots the system if the root file system stops working.' \
		'</longdesc>' \
		'<shortdesc lang="en">' \
		'This agent reboots the system if the root file system stops working.' \
		'</shortdesc>' \
		'<parameters>' \
		'</parameters>' \
		'' \
		'<actions>' \
		'<action name="start"        timeout="20" />' \
		'<action name="stop"         timeout="20" />' \
		'<action name="monitor"      timeout="20" interval="10" depth="0" />' \
		'<action name="reload"       timeout="20" />' \
		'<action name="migrate_to"   timeout="20" />' \
		'<action name="migrate_from" timeout="20" />' \
		'<action name="meta-data"    timeout="5" />' \
		'<action name="validate-all" timeout="20" />' \
		'</actions>' \
		'</resource-agent>'
}

#######################################################################

# Print a short usage summary on stdout.
# The action list names every resource action the dispatcher at the bottom
# of this file accepts, including "reload", which was previously omitted
# even though the agent handles it and advertises it in its metadata.
watchdog_usage() {
	cat <<END
usage: $0 {start|stop|monitor|reload|migrate_to|migrate_from|validate-all|meta-data}

Expects to have a fully populated OCF RA-compliant environment set.
END
}

# Start action: launch the watchdog script in the background.
#
# Globals:  SCRIPT (read), OCF_SUCCESS (read)
# Returns:  OCF_SUCCESS if the watchdog was already running or has been
#           launched; otherwise the non-zero code from watchdog_validate
#           when $SCRIPT is not executable.
watchdog_start() {
    # Starting an already running resource must be a successful no-op.
    watchdog_monitor
    if [ $? -eq "$OCF_SUCCESS" ]; then
        return "$OCF_SUCCESS"
    fi

    # The exit status of "cmd &" is always 0, so without this check a
    # missing or non-executable script would be reported as a clean start.
    watchdog_validate
    _wd_start_rc=$?
    if [ "$_wd_start_rc" -ne "$OCF_SUCCESS" ]; then
        ocf_log err "Cannot start watchdog: $SCRIPT is not executable"
        return "$_wd_start_rc"
    fi

    # Detach stdin/stdout/stderr so the long-running watchdog does not keep
    # the agent's descriptors open after this action has returned, and so
    # nohup never drops a nohup.out into the current directory.
    nohup "$SCRIPT" </dev/null >/dev/null 2>&1 &
    return "$OCF_SUCCESS"
}

# Poll watchdog_monitor once per second until it no longer reports a running
# watchdog, or until the given number of seconds has been used up.
#
# Arguments: $1 - maximum number of one-second waits
# Returns:   0 once the watchdog is gone, 1 if it is still there afterwards
_watchdog_wait_gone() {
    _wd_left=$1
    while :; do
        watchdog_monitor
        [ $? -eq "$OCF_SUCCESS" ] || return 0
        [ "$_wd_left" -gt 0 ] || return 1
        sleep 1
        _wd_left=$((_wd_left - 1))
    done
}

# Stop action: terminate every crude-watchdog.sh process.
#
# Signals are delivered asynchronously, so checking for the process right
# after killall can still see it and wrongly report a failed stop. Give the
# process time to exit after SIGTERM and escalate to SIGKILL if it does not.
# The two waits (10s + 5s) stay below the 20s stop timeout advertised in
# this agent's metadata.
#
# Returns: OCF_SUCCESS when no watchdog is running (any more),
#          OCF_ERR_GENERIC when it survived both signals.
watchdog_stop() {
    # Stopping a resource that is not running is a success.
    watchdog_monitor
    if [ $? -ne "$OCF_SUCCESS" ]; then
        return "$OCF_SUCCESS"
    fi

    killall crude-watchdog.sh
    if _watchdog_wait_gone 10; then
        return "$OCF_SUCCESS"
    fi

    ocf_log warn "crude-watchdog.sh did not exit after SIGTERM, sending SIGKILL"
    killall -s KILL crude-watchdog.sh
    if _watchdog_wait_gone 5; then
        return "$OCF_SUCCESS"
    fi

    ocf_log err "crude-watchdog.sh is still running after SIGKILL"
    return "$OCF_ERR_GENERIC"
}

# Monitor action: report whether a crude-watchdog.sh process exists.
#
# Uses pgrep -f (match against the full command line) instead of
# "ps aux | grep ... | grep -v grep": pgrep never reports itself, no unused
# global is left behind, and the dot in the pattern is escaped so it only
# matches a literal ".".
# NOTE(review): any process whose command line contains "crude-watchdog.sh"
# (e.g. an editor with that file open) still counts as running.
#
# Returns: OCF_SUCCESS if at least one matching process exists,
#          OCF_NOT_RUNNING otherwise.
watchdog_monitor() {
	if pgrep -f 'crude-watchdog\.sh' >/dev/null 2>&1; then
		return "$OCF_SUCCESS"
	fi
	return "$OCF_NOT_RUNNING"
}

# Validate-all action: the agent is usable only if $SCRIPT is an executable
# file. Returns OCF_SUCCESS when it is, OCF_ERR_ARGS when it is not.
watchdog_validate() {
    [ -x "$SCRIPT" ] || return $OCF_ERR_ARGS
    return $OCF_SUCCESS
}

# Entry point: run the handler for the action named in $__OCF_ACTION.
# The informational actions (meta-data, usage, help) and unknown actions
# exit straight from their arm; every other action falls through to the
# common epilogue, which logs the handler's status and exits with it.
case "$__OCF_ACTION" in
    meta-data)
        meta_data
        exit $OCF_SUCCESS
        ;;
    usage|help)
        watchdog_usage
        exit $OCF_SUCCESS
        ;;
    start)
        watchdog_start
        ;;
    stop)
        watchdog_stop
        ;;
    monitor)
        watchdog_monitor
        ;;
    validate-all)
        watchdog_validate
        ;;
    reload)
        # Nothing to re-read: reload only leaves a log entry.
        ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..."
        ;;
    migrate_to)
        # Migrating away is implemented as a plain stop on this node.
        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}."
        watchdog_stop
        ;;
    migrate_from)
        # Migrating in is implemented as a plain start on this node.
        ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}."
        watchdog_start
        ;;
    *)
        watchdog_usage
        exit $OCF_ERR_UNIMPLEMENTED
        ;;
esac
# Status of the last command run inside the selected arm.
action_rc=$?
ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $action_rc"
exit $action_rc

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20140925/b9ce7e6f/attachment-0003.sig>


More information about the Pacemaker mailing list