[ClusterLabs] VirtualBox : an improved OCF agent

Ioannis Ioannou roryt at roryt.gr
Thu Oct 4 06:35:50 EDT 2018


Hello to all,

This is the first time I have used HA on Linux. I'm still learning, and I like
it a lot.

I was interested in an active/passive system that can support virtual
machines with live migration. For various reasons I prefer VirtualBox as
the hypervisor.

I used an existing agent that I found here: 
https://forums.virtualbox.org/viewtopic.php?f=7&t=35372, improved it and 
I would like to share it.

It is now more robust and OCF compliant, and it supports live migration (aka
teleportation in VBox's terminology). I have been testing it for one week now on
Debian Stretch and it seems stable enough for production.


The assumptions that must hold in order to use it:

1) Both nodes should mount the same filesystem at the same time (e.g. iSCSI,
a cluster filesystem, NFS, and so on). I used GlusterFS and it works like a
charm. I put the VM settings and images on it.

2) Both nodes should have reasonably similar hardware - close enough so
that teleportation can work - check VirtualBox's documentation regarding
teleportation for this.

3) Both VMs should have the same settings - if you put the settings on
the common filesystem as I did, then it is easy: create the VM on one
node, then just add it on the other.

4) SSH keys should be exchanged between nodes. You see, migrate_to
must be able to control both nodes in order for live migration
(teleporting) to work.

5) Teleportation works by putting the receiving node listening on a port 
- by default 6000. Ensure that the port is correct and accessible from 
the other node.


Any comments are more than welcome.


Cheers

John (aka Ioannis)

-------------- next part --------------
#!/bin/bash
#set -x
#
# OCF resource agent that manages an Oracle VirtualBox VM as a cluster
# resource (the <vbox> tag), including live migration ("teleporting").
# Heavily modified by roryt at roryt.gr 2018
#

# Pin the locale and the command search path for everything this agent runs;
# the state parsing further down matches literal VBoxManage output.
LC_ALL=C
LANG=C
PATH=/bin:/sbin:/usr/bin:/usr/sbin
export LC_ALL LANG PATH


# Keep a caller-supplied OCF_FUNCTIONS_DIR (even an empty one, since `=` is
# used rather than `:=`); otherwise derive it from OCF_ROOT.
: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
# The sourced file is expected to provide ocf_log and the OCF_* exit codes
# used throughout this script; neither is defined in this file.
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs

# Print the OCF resource-agent metadata (XML) on stdout.
#
# The advertised values mirror what the script actually does:
#   * shutdowntime and TelePort are optional and fall back to 30 and 6000,
#     so both are declared required="0" with a matching default attribute.
#   * the suggested stop timeout is larger than the default shutdowntime,
#     otherwise the forced power-off could never run before the operation
#     itself times out.
# The here-document delimiter is quoted so nothing inside is ever expanded.
meta_data()
{
    cat <<'EOT'
<?xml version="1.0" ?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="vbox" version="1.0">
    <version>1.0</version>
    <longdesc lang="en">
        This is an Oracle VirtualBox VM. All types of guests which
        VirtualBox supports, should work.
    </longdesc>
    <shortdesc lang="en">
        Oracle VirtualBox VM
    </shortdesc>

    <parameters>
        <parameter name="vmname" unique="1" required="1">
            <longdesc lang="en">
                The name or UUID for the VM, as passed to VBoxManage
            </longdesc>

            <shortdesc lang="en">
                VM name
            </shortdesc>
            <content type="string"/>
        </parameter>

        <parameter name="shutdowntime" required="0">
            <longdesc lang="en">
                The time in seconds a VM gets to shutdown after an ACPI event,
                before forceful shutdown is used. The maximum is 600 seconds
                (10 minutes).
            </longdesc>

            <shortdesc lang="en">
                Time for ACPI shutdown (seconds).
            </shortdesc>
            <content type="integer" default="30"/>
        </parameter>

        <parameter name="TelePort" required="0">
            <longdesc lang="en">
                The port the remote site listens for teleport
            </longdesc>

            <shortdesc lang="en">
                Teleport port
            </shortdesc>
            <content type="integer" default="6000"/>
        </parameter>

    </parameters>

    <actions>
        <action name="start" timeout="30"/>
        <action name="stop" timeout="90"/>
        <action name="status" depth="10" interval="60" timeout="20"/>
        <action name="monitor" depth="10" interval="60" timeout="20"/>
        <action name="meta-data" timeout="5"/>
        <action name="validate-all" timeout="20"/>
        <action name="migrate_to" timeout="60"/>
        <action name="migrate_from" timeout="10"/>
    </actions>
</resource-agent>
EOT
}

# Built-in fallbacks: applied when a parameter is unset or empty.
: "${OCF_RESKEY_shutdowntime:=30}"
: "${OCF_RESKEY_TelePort:=6000}"


# Run a VBoxManage command, locally or on another node.
#   $1  everything that follows "VBoxManage" on the command line, as ONE
#       string; it is deliberately word-split when the command is run
#   $2  optional host; when non-empty the command is run there through ssh
#       (password-less key authentication is assumed)
# Returns the exit status of VBoxManage (or of ssh when run remotely).
#
# The ssh prefix lives in a local array. Earlier versions stored it in the
# global SHELL variable, which silently replaced the user's login shell
# setting for the rest of the script and for every child process.
function vboxmanage_execute {
	ocf_log debug "vboxmanage_execute $1 $2"
	#set -x
	ocf_log debug "Running command: $1"
	local -a runner=()
	if [[ -n "$2" ]]; then
		ocf_log debug "will run on the remote $2"
		runner=(ssh "$2")
	fi
	# $1 is left unquoted on purpose: it carries a whole argument list.
	# shellcheck disable=SC2086
	"${runner[@]}" /usr/bin/VBoxManage $1
}


# Print the VMState value reported by "showvminfo --machinereadable".
#   $1  machine name
#   $2  optional host; forwarded to vboxmanage_execute as its second argument
# Nothing is printed when no VMState line is found.
vm_state() {
	ocf_log debug "vm_state $1 $2"
	#set -x
	ocf_log debug "Getting state for VM $1"
	# Take the text between the first pair of double quotes on the
	# line that starts with VMState=
	STATE=$(vboxmanage_execute " showvminfo  ${1} --machinereadable" ${2} \
		| awk -F '"' '/^VMState=/ { print $2 }')
	ocf_log debug "VM $1 is $STATE "
	echo $STATE
}


# Succeed (status 0) only when the machine's state is exactly "running";
# every other state, including an empty one, yields status 1.
#   $1  machine name
#   $2  optional host, passed on to vm_state
is_vm_on() {
	ocf_log debug "is_vm_on $1 $2"
	#set -x
	STATE=$(vm_state $1 $2)
	[[ $STATE == running ]]
}

# Poll once per second until the machine reports "running".
#   $1  machine name
#   $2  optional host, passed on to is_vm_on
#   $3  optional maximum number of polls before giving up (default 30)
# Returns 0 once the VM is running, 1 when the limit is exceeded.
#
# The limit used to be compared against $2, which is the remote host and is
# empty in every caller: the comparison errored out and the loop never ended.
function wait_for_start {
	ocf_log debug "wait_for_start $1 $2"
	local max_wait=${3:-30}
	local waited=0
	# Fall back to the default rather than evaluating garbage arithmetically
	[[ "$max_wait" =~ ^[0-9]+$ ]] || max_wait=30
	until is_vm_on "$1" "$2"; do
		ocf_log info "VM $1 is not yet started!"
		sleep 1
		waited=$((waited + 1))
		if (( waited > max_wait )); then
			ocf_log err "VM $1 was not started after $waited itterations!"
			return 1
		fi
	done
	ocf_log info "VM $1 has been started"
	return 0
}

# Ask the guest to shut itself down by pressing the virtual ACPI power button.
#   $1  machine name
#   $2  optional host, passed on to vm_state / vboxmanage_execute
# Returns 0 when the request was sent or the VM is already off, non-zero when
# VBoxManage failed or the VM is in a state this function cannot handle.
#
# State names follow VBoxManage's machine-readable output, where a paused VM
# is "paused"; the older spelling "pause" is still accepted. "aborted" means
# the VM process is gone, so it is grouped with the other off states.
function shutdown_vm_acpi {
	ocf_log debug "shutdown_vm_acpi $1 $2"
	local state
	state=$(vm_state "$1" "$2")
	case "$state" in
		poweroff|saved|teleported|aborted|stack*)
			ocf_log info "No shutdown neccessery, VM $1 is not really running"
			return 0
			;;
		pause|paused)
			# A paused guest cannot react to ACPI: resume it, then send
			# the event. No recursion, so a failed resume cannot loop.
			ocf_log info "Resuming $1 in order to shutdown it"
			vboxmanage_execute " controlvm ${1} resume" "$2" || return $?
			ocf_log info "Stoping $1 with ACPI"
			vboxmanage_execute " controlvm $1 acpipowerbutton" "$2"
			return $?
			;;
		running)
			ocf_log info "Stoping $1 with ACPI"
			vboxmanage_execute " controlvm $1 acpipowerbutton" "$2"
			return $?
			;;
		teleportingin)
			# A VM waiting for an incoming teleport has no guest running
			# yet, so the only way to end it is a hard power-off.
			ocf_log info "WTF is $1 teleporting ? don;t know how to stop it"
			vboxmanage_execute " controlvm $1 poweroff" "$2"
			return $?
			;;
		*)
			# Unknown or transitional state: report failure so the caller
			# can escalate to a forced power-off.
			ocf_log info "WTF is $1 doing ? Fail in order to power off"
			return 1
			;;
	esac
}

# Hard power-off: issues "controlvm <vm> poweroff" without asking the guest.
#   $1  machine name
#   $2  optional host, handed to vboxmanage_execute
# The function's status is whatever vboxmanage_execute returns.
force_poweroff_vm() {
	ocf_log debug "force_poweroff_vm $1 $2"
	ocf_log warning "Forcefully shutting down VM $1"
	vboxmanage_execute " controlvm $1 poweroff" ${2}
}

# Stop the VM: request an ACPI shutdown repeatedly, then power it off hard
# once OCF_RESKEY_shutdowntime seconds (default 30) have elapsed.
#   $1  machine name
#   $2  optional host, passed on to the helpers
# Returns 0 when the VM is off, otherwise the status of the forced power-off.
#
# The loop ends only when the VM is in a real "off" state. It used to end as
# soon as the state was anything other than "running", so a paused, starting
# or teleporting-in VM was reported as stopped while its process still held
# the disk images.
function stop_vm {
	ocf_log debug "stop_vm $1 $2"
	local waited=0
	local limit=${OCF_RESKEY_shutdowntime:-30}
	local state
	while :; do
		state=$(vm_state "$1" "$2")
		case "$state" in
			# An empty state (VM unknown / VBoxManage gave no answer)
			# counts as stopped, as it always has.
			""|poweroff|saved|teleported|aborted|stack*)
				break
				;;
		esac
		# Re-send the ACPI request on every pass: a guest that has not
		# finished booting ignores the first ones.
		shutdown_vm_acpi "$1" "$2"
		ocf_log debug "Waiting for $1 to shutdown (waited $waited seconds)"
		sleep 1
		waited=$((waited + 1))
		if (( waited > limit )); then
			# Shutdown timer expired.
			ocf_log warning "Shutdown timer for VM $1 expired! Shutting down forcefully!"
			force_poweroff_vm "$1" "$2"
			return $?
		fi
	done
	# Best effort: make sure the VM is not left configured as a teleport
	# target. The result is deliberately ignored.
	vboxmanage_execute " modifyvm $1  --teleporter off " "$2"
	return 0
}


# Bring the VM up from whatever state it is in.
#   $1  machine name
#   $2  optional host, passed on to every helper (including startvm, which
#       previously always ran locally even when a host was given)
# Returns 0 when the VM was started, resumed or already running; non-zero
# when VBoxManage failed or the state is not one this function handles.
#
# State names follow VBoxManage's machine-readable output ("paused"; the
# older spelling "pause" is still accepted; "aborted" counts as off).
function poweron_vm {
	ocf_log debug "poweron_vm $1 $2"
	ocf_log info "Starting VM $1"
	local state rc

	# Best effort: a VM left in teleporter mode would wait for an incoming
	# teleport instead of booting. The result is deliberately ignored.
	vboxmanage_execute " modifyvm $1  --teleporter off " "$2"

	state=$(vm_state "$1" "$2")
	if [[ "$state" == teleportingin ]]; then
		# A VM waiting for a teleport is not "running", so a graceful stop
		# has nothing to act on; the old stop-then-recurse approach never
		# terminated. Power it off once and look at the state again.
		ocf_log info "Stop teleporting on $1, then start it"
		force_poweroff_vm "$1" "$2" || return 1
		vboxmanage_execute " modifyvm $1  --teleporter off " "$2"
		state=$(vm_state "$1" "$2")
	fi

	case "$state" in
		poweroff|saved|teleported|aborted|stack*)
			# all cases considered as off
			vboxmanage_execute " startvm $1 --type headless" "$2"
			rc=$?
			ocf_log info " startvm returned $rc"
			return $rc
			;;
		pause|paused)
			# Really it is running, just resume it
			ocf_log info "Resuming $1 "
			vboxmanage_execute " controlvm ${1} resume" "$2"
			return $?
			;;
		running)
			ocf_log info "Already running $1 do nothing"
			return 0
			;;
	esac
	# Unknown, transitional, or still teleporting-in after the power-off
	ocf_log info "WTF is $1 doing ? Fail"
	return 1
}

# Live-migrate ("teleport") a VM from this node to another one.
#   $1  machine name
#   $2  target host (must be reachable with ssh, see vboxmanage_execute)
# Any further argument is ignored. The teleport port is taken from
# OCF_RESKEY_TelePort (default 6000).
# Returns 0 when the teleport succeeded or the local VM is not in a state
# that can be teleported; non-zero otherwise.
#
# Steps:
#   *) make sure the local VM is running or paused
#   *) bring the target into "teleportingin": stop it if it is running, then
#      switch it to teleporter mode and start it in the background
#   *) teleport the local VM to the target
# The target is polled a bounded number of times and the remote start is
# launched only once. The earlier recursive version restarted the remote VM
# on every pass and never gave up when the target failed to come up.
function migrate {
	ocf_log debug "migrate $1 $2"
	local vname=$1
	local target=$2
	local port=${OCF_RESKEY_TelePort:-6000}
	local -r max_polls=15
	local local_state remote_state rc
	local polls=0 prepared=0

	ocf_log info "Migrating ${vname} to ${target}."

	local_state=$(vm_state "$vname")
	case "$local_state" in
		running|pause|paused)
			# only in these states is there something to teleport
			;;
		*)
			ocf_log debug "No teleporting $vname to $target"
			return 0
			;;
	esac

	while (( polls < max_polls )); do
		polls=$((polls + 1))
		remote_state=$(vm_state "$vname" "$target")
		case "$remote_state" in
			teleportingin)
				# The target is waiting for us: teleport right now.
				ocf_log debug "teleporting $vname to $target"
				vboxmanage_execute " controlvm $vname teleport --host $target --port $port"
				rc=$?
				if (( rc == 0 )); then
					# After a successful teleport the local VM stays
					# in teleporter mode; switch that off again.
					vboxmanage_execute " modifyvm $vname  --teleporter off "
				fi
				return $rc
				;;
			running|pause|paused)
				# The target must not run its own copy: stop it first.
				ocf_log debug "Stoping $vname on $target in order to teleport"
				stop_vm "$vname" "$target"
				;;
			poweroff|saved|teleported|aborted|stack*)
				if (( ! prepared )); then
					# VBoxManage does not return while the VM waits
					# for the teleport, so run it in the background.
					{
						ocf_log debug "Preparing $vname on $target in order to teleport"
						vboxmanage_execute " modifyvm $vname --teleporter on --teleporterport $port " "$target" &&
							vboxmanage_execute " startvm $vname --type headless" "$target"
					} &
					prepared=1
				fi
				sleep 2
				;;
			starting)
				# Transitional state on the way to teleportingin
				sleep 2
				;;
			*)
				break
				;;
		esac
	done
	ocf_log warning "WTF just happened? ! Migration of $vname to $target will fail"
	return 1
}

# Entry point: run the action named by the first command-line argument and
# exit with the matching OCF status code.
#
# Two details are deliberate:
#   * validate-all no longer insists on OCF_RESKEY_CRM_meta_migrate_target.
#     Per the agent's own design that variable is only meaningful for
#     migrate_to, so requiring it made validation of a correct configuration
#     fail.
#   * the migrate target is checked where it is used. With an empty target
#     the "remote" state query would hit the local VM, and migrate would
#     stop the very machine it was asked to move.
case "$1" in
	start)
		poweron_vm "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
		wait_for_start "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
		exit $OCF_SUCCESS
		;;
	stop)
		stop_vm "${OCF_RESKEY_vmname}" && exit $OCF_SUCCESS
		exit $OCF_ERR_GENERIC
		;;
	status|monitor|migrate_from)
		# migrate_from only confirms that the teleported VM is running here
		is_vm_on "${OCF_RESKEY_vmname}" && exit $OCF_SUCCESS
		exit $OCF_NOT_RUNNING
		;;
	validate-all|verify_all)
		ocf_log debug "$1 called with environment: "
		ocf_log debug "$(env)"
		case "${OCF_RESKEY_vmname}" in
			"")		exit $OCF_ERR_CONFIGURED ;;
		esac
		case "${OCF_RESKEY_TelePort}" in
			"")		exit $OCF_ERR_CONFIGURED ;;
			*[!0-9]*)	exit $OCF_ERR_ARGS ;;
		esac
		# stop_vm compares against this value numerically
		case "${OCF_RESKEY_shutdowntime}" in
			"")		exit $OCF_ERR_CONFIGURED ;;
			*[!0-9]*)	exit $OCF_ERR_ARGS ;;
		esac
		exit $OCF_SUCCESS
		;;
	restart)
		stop_vm "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
		poweron_vm "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
		wait_for_start "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
		exit $OCF_SUCCESS
		;;
	meta-data)
		meta_data
		exit $OCF_SUCCESS
		;;
	migrate|migrate_to)
		if [[ -z "${OCF_RESKEY_CRM_meta_migrate_target}" ]]; then
			ocf_log err "$1 called without a migration target"
			exit $OCF_ERR_GENERIC
		fi
		migrate "${OCF_RESKEY_vmname}" "${OCF_RESKEY_CRM_meta_migrate_target}" && exit $OCF_SUCCESS
		exit $OCF_ERR_GENERIC
		;;
	*)
		echo "usage: $0 {start|stop|status|monitor|restart|meta-data|validate-all|migrate|migrate_to|migrate_from}"
		exit $OCF_ERR_UNIMPLEMENTED
		;;
esac



More information about the Users mailing list