[ClusterLabs] VirtualBox : an improved OCF agent
Ioannis Ioannou
roryt at roryt.gr
Thu Oct 4 06:35:50 EDT 2018
Hello to all,
This is the first time I have used HA on Linux. I'm still learning, and I
like it a lot.
I was interested in an active/passive system that can support virtual
machines with live migration. For various reasons I prefer VirtualBox as
the hypervisor.
I used an existing agent that I found here:
https://forums.virtualbox.org/viewtopic.php?f=7&t=35372, improved it and
I would like to share it.
It is now more robust and OCF compliant, and it supports live migration (aka
teleportation in VBox's terminology). I have been testing it for one week now
on Debian Stretch and it seems stable enough for production.
The assumptions that must hold in order to use it:
1) Both nodes should mount the same filesystem at the same time (i.e. iscsi,
cluster, nfs, and so on). I used glusterfs and it works like a charm. I put
the VM settings and images on it.
2) Both nodes should have reasonably similar hardware - close enough so
that teleportation can work - check VirtualBox's documentation regarding
teleportation for this.
3) Both VMs should have the same settings- if you put the settings on
the common filesystem as I did, then it is easy: create the VM on one
node, just add it on the other.
4) SSH keys should be exchanged between nodes. You see, migrate_to
should be able to handle both nodes in order for live migration
(teleporting) to work.
5) Teleportation works by putting the receiving node listening on a port
- by default 6000. Ensure that the port is correct and accessible from
the other node.
Any comments are more than welcome.
Cheers
John (aka Ioannis)
-------------- next part --------------
#!/bin/bash
#set -x
#
# OCF resource agent for the <vbox> tag: launches VirtualBox VMs
# as cluster resources.
# Heavily modified by roryt at roryt.gr 2018
#

# Pin the locale and the search path so every child process sees the same
# environment regardless of how the agent was invoked.
export LC_ALL=C
export LANG=C
export PATH=/bin:/sbin:/usr/bin:/usr/sbin

# Only fall back to the directory below OCF_ROOT when OCF_FUNCTIONS_DIR is
# not set at all; a value that is set (even to the empty string) is kept.
if [ -z "${OCF_FUNCTIONS_DIR+set}" ]; then
  OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat
fi

# Load the OCF shell function library (presumably the source of ocf_log and
# the OCF_* exit codes used further down - confirm against the installed file).
. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
# Print the OCF resource-agent meta-data (XML) on stdout.
#
# Arguments: none
# Outputs:   the XML description of this agent, its parameters and actions
#
# Notes on the advertised values:
#  * shutdowntime and TelePort are optional; the defaults advertised here
#    (30 and 6000) are the ones this script applies when they are unset.
#  * The advertised stop timeout has to be larger than the default
#    shutdowntime: the stop action waits up to shutdowntime seconds for an
#    ACPI shutdown and only then powers the VM off forcefully, so a stop
#    timeout equal to shutdowntime would always expire first.
meta_data()
{
cat <<EOT
<?xml version="1.0" ?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="vbox" version="1.0">
<version>1.0</version>
<longdesc lang="en">
This is a Oracle VirtualBox VM. All types of guests which
VirtualBox supports, should work.
</longdesc>
<shortdesc lang="en">
Oracle VirtualBox VM
</shortdesc>
<parameters>
<parameter name="vmname" unique="1" required="1">
<longdesc lang="en">
The name or UUID for the VM, as passed to VBoxManage
</longdesc>
<shortdesc lang="en">
VM name
</shortdesc>
<content type="string"/>
</parameter>
<parameter name="shutdowntime" required="0">
<longdesc lang="en">
The time in seconds a VM gets to shutdown after an ACPI event,
before forceful shutdown is used. The maximum is 600 seconds
(10 minutes).
</longdesc>
<shortdesc lang="en">
Time for ACPI shutdown (seconds).
</shortdesc>
<content type="integer" default="30"/>
</parameter>
<parameter name="TelePort" required="0">
<longdesc lang="en">
The port the remote site listens for teleport
</longdesc>
<shortdesc lang="en">
Teleport port
</shortdesc>
<content type="integer" default="6000"/>
</parameter>
</parameters>
<actions>
<action name="start" timeout="30"/>
<action name="stop" timeout="120"/>
<action name="status" depth="10" interval="60" timeout="20"/>
<action name="monitor" depth="10" interval="60" timeout="20"/>
<action name="meta-data" timeout="5"/>
<action name="validate-all" timeout="20"/>
<action name="migrate_to" timeout="60" />
<action name="migrate_from" timeout="10" />
</actions>
</resource-agent>
EOT
}
# Default values, applied when a parameter is unset or empty.
: "${OCF_RESKEY_shutdowntime:=30}"   # seconds granted to an ACPI shutdown
: "${OCF_RESKEY_TelePort:=6000}"     # port used for teleporting
# Run a VBoxManage command, either locally or on another node through ssh.
# Remote execution assumes key-based ssh access to that node.
#
# Arguments:
#   $1 - everything that follows "VBoxManage" on the command line, as ONE
#        string; it is word-split on purpose when the command is run
#   $2 - optional host; when non-empty the command is run as "ssh $2 ..."
# Outputs:  whatever VBoxManage (or ssh) writes
# Returns:  the exit status of VBoxManage (or of ssh)
vboxmanage_execute() {
  ocf_log debug "vboxmanage_execute $1 $2"
  #set -x
  ocf_log debug "Running command: $1"

  # The ssh prefix lives in a local variable. Storing it in SHELL (as was
  # done before) overwrote the SHELL environment variable for the rest of
  # the agent and for every program started from it.
  local remote_prefix=""
  if [ -n "$2" ]; then
    ocf_log debug "will run on the remote $2"
    remote_prefix="ssh $2"
  fi

  # Both expansions are intentionally unquoted: the prefix must split into
  # "ssh" + host (or vanish when empty) and $1 into separate arguments.
  # shellcheck disable=SC2086
  $remote_prefix /usr/bin/VBoxManage ${1}
}
# Print the VirtualBox state of a VM on stdout.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, the query is run there through ssh
# Outputs:  the quoted value of the VMState= line of
#           "showvminfo --machinereadable" (empty if there is none)
# Note: STATE is a script-wide variable, not a local one.
vm_state() {
  ocf_log debug "vm_state $1 $2"
  #set -x
  ocf_log debug "Getting state for VM $1"
  # A single awk does both the line selection and the field extraction.
  STATE=$(vboxmanage_execute " showvminfo ${1} --machinereadable" ${2} \
    | awk -F '"' '/^VMState=/ { print $2 }')
  ocf_log debug "VM $1 is $STATE "
  echo $STATE
}
# Tell whether a VM is in the "running" state.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, the check is run there through ssh
# Returns:  0 when vm_state reports exactly "running", 1 for anything else
# Note: STATE is a script-wide variable, not a local one.
is_vm_on() {
  ocf_log debug "is_vm_on $1 $2"
  #set -x
  STATE=$(vm_state $1 $2)
  # The status of this comparison is the function's return value.
  [ "$STATE" = "running" ]
}
# Wait until a VM reports the "running" state, polling once per second.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, the check is run there through ssh
#   $3 - optional maximum number of one-second polls (default 30)
# Returns:  0 once the VM is running, 1 when the limit is exceeded
#
# The limit used to be read from $2, which is the host name (or empty): with
# no host the numeric test failed on every pass and the loop never ended.
wait_for_start() {
  ocf_log debug "wait_for_start $1 $2"
  local limit=${3:-30}
  local tries=0

  # Anything that is not a plain non-negative integer falls back to 30.
  case $limit in
    ''|*[!0-9]*) limit=30 ;;
  esac

  until is_vm_on "$1" "$2"; do
    ocf_log info "VM $1 is not yet started!"
    sleep 1
    tries=$((tries + 1))
    if [ "$tries" -gt "$limit" ]; then
      ocf_log err "VM $1 was not started after $tries itterations!"
      return 1
    fi
  done
  ocf_log info "VM $1 has been started"
  return 0
}
# Ask a VM to shut itself down by sending it the ACPI power button event.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, commands are run there through ssh
# Returns:
#   0 when the VM is already off or the request was delivered,
#   the VBoxManage status when a resume / acpipowerbutton / poweroff failed,
#   1 for any state this function does not know how to handle
#
# NOTE(review): VBoxManage reports a paused VM as "paused"; the old "pause"
# pattern is kept only for backward compatibility. "aborted" (the VM process
# died) is treated as off. The "stack*" pattern is kept from the original -
# confirm which state name it was meant to match.
shutdown_vm_acpi() {
  ocf_log debug "shutdown_vm_acpi $1 $2"
  local state
  state=$(vm_state ${1} ${2})
  case $state in
    poweroff|saved|teleported|aborted|stack*)
      ocf_log info "No shutdown neccessery, VM $1 is not really running"
      return 0
      ;;
    pause|paused)
      # The guest cannot react to ACPI while paused: resume it first, then
      # look at the state again. Stop here when the resume itself failed,
      # otherwise the re-check would find the VM paused again, forever.
      ocf_log info "Resuming $1 in order to shutdown it"
      vboxmanage_execute " controlvm ${1} resume" ${2} || return $?
      shutdown_vm_acpi ${1} ${2}
      return $?
      ;;
    running)
      ocf_log info "Stoping $1 with ACPI"
      vboxmanage_execute " controlvm $1 acpipowerbutton" ${2}
      return $?
      ;;
    teleportingin)
      # A VM waiting for an incoming teleport has no guest to notify;
      # power off the waiting VM process instead.
      ocf_log info "WTF is $1 teleporting ? don;t know how to stop it"
      vboxmanage_execute " controlvm $1 poweroff" ${2}
      return $?
      ;;
    *)
      # Unknown or transitional state: report failure so that the caller
      # can fall back to a forced power-off.
      ocf_log info "WTF is $1 doing ? Fail in order to power off"
      return 1
      ;;
  esac
}
# Power a VM off immediately, without asking the guest.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, the command is run there through ssh
# Returns:  the status of the "controlvm ... poweroff" call
force_poweroff_vm() {
  ocf_log debug "force_poweroff_vm $1 $2"
  ocf_log warning "Forcefully shutting down VM $1"
  # Last command: its exit status becomes the function's status.
  vboxmanage_execute " controlvm $1 poweroff" ${2}
}
# Stop a VM: request an ACPI shutdown and fall back to a forced power-off
# when the VM has not stopped within OCF_RESKEY_shutdowntime seconds.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, commands are run there through ssh
# Globals:  OCF_RESKEY_shutdowntime (read; 30 is used when it is empty)
# Returns:  0 when the VM is off, otherwise the status of the forced
#           power-off
#
# The loop keeps going until the VM is in an "off" state. It used to stop
# as soon as the state was anything other than "running", which reported a
# paused or teleporting-in VM as stopped while its process was still alive.
# An empty state (VBoxManage could not report one) is still treated as
# "nothing to stop", as before.
stop_vm() {
  ocf_log debug "stop_vm $1 $2"
  local waited=0
  local state

  while :; do
    state=$(vm_state ${1} ${2})
    case $state in
      ""|poweroff|saved|teleported|aborted|stack*) break ;;
    esac

    # Send the ACPI request on every pass: a guest that has not finished
    # booting may ignore the first ones.
    shutdown_vm_acpi ${1} ${2}
    ocf_log debug "Waiting for $1 to shutdown (waited $waited seconds)"
    sleep 1
    waited=$((waited + 1))
    if [ "$waited" -gt "${OCF_RESKEY_shutdowntime:-30}" ]; then
      # Shutdown timer expired.
      ocf_log warning "Shutdown timer for VM $1 expired! Shutting down forcefully!"
      force_poweroff_vm $1 $2
      return $?
    fi
  done

  # To be on the safe side, make sure the VM is not left configured as a
  # teleport target.
  vboxmanage_execute " modifyvm $1 --teleporter off " ${2}
  return 0
}
# Bring a VM into the running state.
#
# Arguments:
#   $1 - VM name
#   $2 - optional host; when given, commands are run there through ssh
# Returns:
#   0 when the VM was started, resumed, or was already running,
#   the VBoxManage status when startvm / resume failed,
#   1 for a state this function does not know how to handle
#
# Fixes compared with the previous version:
#   * startvm now honours $2 like every other command in this function;
#   * a paused VM is recognised ("paused" is what VBoxManage reports; the
#     old "pause" pattern is kept for backward compatibility);
#   * "aborted" is treated as off;
#   * a VM stuck in teleportingin can no longer cause endless recursion.
poweron_vm() {
  ocf_log debug "poweron_vm $1 $2"
  ocf_log info "Starting VM $1"
  local state rc

  # To be on the safe side, cancel any pending teleporter configuration so
  # that the VM really boots instead of waiting for an incoming teleport.
  vboxmanage_execute " modifyvm $1 --teleporter off " ${2}

  state=$(vm_state ${1} ${2})
  case $state in
    poweroff|saved|teleported|aborted|stack*)
      # All of these are considered "off".
      vboxmanage_execute " startvm $1 --type headless" ${2}
      rc=$?
      ocf_log info " startvm returned $rc"
      return $rc
      ;;
    pause|paused)
      # The VM process exists; it only has to be resumed.
      ocf_log info "Resuming $1 "
      vboxmanage_execute " controlvm ${1} resume" ${2}
      return $?
      ;;
    teleportingin)
      # The VM is waiting for an incoming teleport: stop it, then start it
      # normally.
      ocf_log info "Stop teleporting on $1, then start it"
      stop_vm ${1} ${2}
      state=$(vm_state ${1} ${2})
      if [ "$state" = "teleportingin" ]; then
        # The stop did not end the wait; power the waiting process off.
        vboxmanage_execute " controlvm $1 poweroff" ${2} || return 1
        state=$(vm_state ${1} ${2})
        if [ "$state" = "teleportingin" ]; then
          ocf_log err "VM $1 is still waiting for a teleport, giving up"
          return 1
        fi
      fi
      poweron_vm ${1} ${2}
      return $?
      ;;
    running)
      ocf_log info "Already running $1 do nothing"
      return 0
      ;;
  esac

  # Unknown or transitional state.
  ocf_log info "WTF is $1 doing ? Fail"
  return 1
}
# Live-migrate ("teleport") a VM from this node to another one.
#
# Arguments:
#   $1 - VM name
#   $2 - target host (reached through ssh)
# Globals:  OCF_RESKEY_TelePort (read) - port the target listens on
# Returns:
#   0 when the VM was teleported, or when it is neither running nor paused
#     locally (nothing to teleport),
#   the VBoxManage status when the teleport command failed,
#   1 when the target never became ready or is in an unexpected state
#
# Steps, after checking the state on both sides:
#   1. on the target: enable teleporter mode and start the VM, which then
#      waits for the incoming teleport;
#   2. locally: teleport the VM to the target.
#
# Changes compared with the previous, recursive version:
#   * "paused" (what VBoxManage reports) is accepted next to the old "pause",
#     and "aborted" counts as an off state;
#   * the target is prepared only once and then polled, instead of starting
#     another background startvm every two seconds;
#   * the number of rounds is bounded, so an unreachable or broken target
#     ends in a failure instead of endless recursion.
migrate() {
  ocf_log debug "migrate $1 $2"
  local vm=${1}
  local target=${2}
  local port=${OCF_RESKEY_TelePort}
  local local_state remote_state rc
  local prepared=0
  local round=0
  local -r max_rounds=15   # roughly 30 seconds of two-second waits

  ocf_log info "Migrating ${OCF_RESKEY_vmname} to ${OCF_RESKEY_CRM_meta_migrate_target}."

  local_state=$(vm_state $vm)
  case $local_state in
    running|pause|paused)
      # Only in these states is there something to teleport.
      ;;
    *)
      ocf_log debug "No teleporting $vm to $target"
      return 0
      ;;
  esac

  while [ "$round" -lt "$max_rounds" ]; do
    round=$((round + 1))
    remote_state=$(vm_state $vm $target)
    case $remote_state in
      teleportingin)
        # The target is waiting for the teleport: do it now.
        ocf_log debug "teleporting $vm to $target"
        vboxmanage_execute " controlvm $vm teleport --host $target --port $port"
        rc=$?
        if [ "$rc" -eq 0 ]; then
          # After a successful teleport the local VM is left in teleporter
          # mode; switch that off so a later local start boots normally.
          vboxmanage_execute " modifyvm $vm --teleporter off "
        fi
        return $rc
        ;;
      running|pause|paused)
        # The VM must not be active on the target: stop it there first.
        ocf_log debug "Stoping $vm on $target in order to teleport"
        stop_vm ${vm} ${target}
        ;;
      poweroff|saved|teleported|aborted|stack*)
        if [ "$prepared" -eq 0 ]; then
          prepared=1
          {
            # startvm does not return while the VM waits for the teleport,
            # so the preparation runs in the background.
            ocf_log debug "Preparing $vm on $target in order to teleport"
            vboxmanage_execute " modifyvm $vm --teleporter on --teleporterport $port " $target &&
              vboxmanage_execute " startvm $vm --type headless" $target
          } &
        fi
        sleep 2
        ;;
      starting)
        # NOTE(review): assumed to be the transient state between "off" and
        # "teleportingin" on the target - confirm against VBoxManage output.
        sleep 2
        ;;
      *)
        ocf_log warning "WTF just happened? ! Migration of $vm to $target will fail"
        return 1
        ;;
    esac
  done

  ocf_log warning "WTF just happened? ! Migration of $vm to $target will fail"
  return 1
}
# Entry point: run the action named by the first command line argument and
# exit with the matching OCF exit code.
#
# Changes compared with the previous version:
#   * validate-all no longer requires OCF_RESKEY_CRM_meta_migrate_target.
#     NOTE(review): that variable is expected to be present only while a
#     migration is in progress, so requiring it made every ordinary
#     validation fail - confirm against the cluster manager documentation.
#     The check now sits in front of the migration itself.
#   * shutdowntime is validated as a number, because the stop logic compares
#     it numerically.
#   * Parameters are passed quoted, so an empty VM name can no longer shift
#     the remaining arguments one position to the left.
#   * The environment dump is logged as a single quoted argument.
case $1 in
  start)
    poweron_vm "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
    wait_for_start "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
    exit $OCF_SUCCESS
    ;;
  stop)
    stop_vm "${OCF_RESKEY_vmname}" && exit $OCF_SUCCESS
    exit $OCF_ERR_GENERIC
    ;;
  status|monitor|migrate_from)
    # migrate_from only has to confirm that the VM arrived and is running.
    is_vm_on "${OCF_RESKEY_vmname}" && exit $OCF_SUCCESS
    exit $OCF_NOT_RUNNING
    ;;
  validate-all|verify_all)
    ocf_log debug "$1 called with environment: "
    ocf_log debug "$(env)"
    case ${OCF_RESKEY_vmname} in
      "") exit $OCF_ERR_CONFIGURED ;;
    esac
    case ${OCF_RESKEY_TelePort} in
      "") exit $OCF_ERR_CONFIGURED ;;
      *[!0-9]*) exit $OCF_ERR_ARGS ;;
    esac
    case ${OCF_RESKEY_shutdowntime} in
      "") exit $OCF_ERR_CONFIGURED ;;
      *[!0-9]*) exit $OCF_ERR_ARGS ;;
    esac
    exit $OCF_SUCCESS
    ;;
  restart)
    stop_vm "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
    poweron_vm "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
    wait_for_start "${OCF_RESKEY_vmname}" || exit $OCF_ERR_GENERIC
    exit $OCF_SUCCESS
    ;;
  meta-data)
    meta_data
    exit $OCF_SUCCESS
    ;;
  migrate|migrate_to)
    # Without a target the "remote" commands would silently run locally.
    if [ -z "${OCF_RESKEY_CRM_meta_migrate_target}" ]; then
      ocf_log err "No migration target given for ${OCF_RESKEY_vmname}"
      exit $OCF_ERR_ARGS
    fi
    migrate "${OCF_RESKEY_vmname}" "${OCF_RESKEY_CRM_meta_migrate_target}" && exit $OCF_SUCCESS
    exit $OCF_ERR_GENERIC
    ;;
  *)
    echo "usage: $0 {start|stop|status|monitor|restart|meta-data|validate-all|migrate|migrate_to|migrate_from}"
    exit $OCF_ERR_UNIMPLEMENTED
    ;;
esac
More information about the Users
mailing list