[Pacemaker] Issue with clusterlab mysql ocf script

Michael Szilagyi mszilagyi at gmail.com
Fri Aug 26 13:19:31 EDT 2011


I'm having a problem with master/slave promotion using the most recent
version of the mysql OCF script hosted in the ClusterLabs/resource-agents
GitHub repo.

The script fails over to a slave correctly if the master loses its connection
to the cluster.  However, when the old master rejoins the cluster, the script
does some undesirable things.  Basically, if the master loses its connection
(say I pull the network cable), one of the slaves is promoted to master and
the old master is simply orphaned (which is fine, I don't have STONITH set up
yet or anything).  If I plug that machine's cable back in, the node rejoins
the cluster and there are initially two masters (the old, orphaned one and
the newly promoted one).  Pacemaker correctly sees this and demotes the old
master to a slave.

After some time debugging the OCF script, I think what is happening is this:
the script sees the old master rejoin and fires a post-demote notification
for the returning master, which causes an unset_master command to be
executed.  That makes all the slaves remove their master connection info.
However, since the other server has already been promoted and is (to its
mind) already replicating to the remaining slaves in the cluster, no new
pre-promote notification is ever fired.  That means the slaves never get a
new CHANGE MASTER TO issued, and I wind up with a broken replication setup.
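
For reference, this is roughly the statement I'd expect the slaves to need
after a promotion (just a sketch: the host is a placeholder for the newly
promoted master, the user/password come from my p_mysql params below, and the
log file/position are example values like the ones recorded in my node
attributes):

    CHANGE MASTER TO
        MASTER_HOST='172.17.0.131',          -- placeholder: address of the newly promoted master
        MASTER_USER='sqlSlave',              -- replication_user from my p_mysql params
        MASTER_PASSWORD='slave',             -- replication_passwd from my p_mysql params
        MASTER_LOG_FILE='mysql-bin.000038',  -- example binlog file/position
        MASTER_LOG_POS=607;
    START SLAVE;

Since unset_master has already wiped the old master info, the slaves just sit
there with no master configured unless something like this gets re-issued.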

I'm not sure whether I'm missing something in how this is supposed to work or
whether this is a limitation of the script.  Since it doesn't seem all that
unlikely for a scenario like this to occur, I suspect there's either a bug or
something I've got set up wrong.  If anyone has ideas or suggestions on how
the script is supposed to handle this (or what I may be doing wrong), I'd
appreciate hearing them.

I'll include the output of my crm configure show in case it'll be useful:

node $id="a1a3266d-24e2-4d1b-bfd7-de3bac929661" seven \
attributes 172.17.0.130-log-file-p_mysql="mysql-bin.000005"
172.17.0.130-log-pos-p_mysql="865"
172.17.0.131-log-file-p_mysql="mysql-bin.000038"
172.17.0.131-log-pos-p_mysql="607" four-log-file-p_mysql="mysql-bin.000040"
four-log-pos-p_mysql="2150"
node $id="cc0227a2-a7bc-4a0d-ba1b-f6ecb7e7d845" four \
attributes 172.17.0.130-log-file-p_mysql="mysql-bin.000005"
172.17.0.130-log-pos-p_mysql="865" three-log-file-p_mysql="mysql-bin.000022"
three-log-pos-p_mysql="106"
node $id="d9d3c6cb-bf60-4468-926f-d9716e56fb0f" three \
attributes 172.17.0.131-log-file-p_mysql="mysql-bin.000038"
172.17.0.131-log-pos-p_mysql="607" three-log-pos-p_mysql="4"
primitive p_mysql ocf:heartbeat:mysql \
params binary="/usr/sbin/mysqld" config="/etc/mysql/my.cnf" \
params pid="/var/lib/mysql/mySQL.pid" socket="/var/run/mysqld/mysqld.sock" \
 params replication_user="sqlSlave" replication_passwd="slave" \
params additional_parameters="--skip-slave-start" \
 op start interval="0" timeout="120" \
op stop interval="0" timeout="120" \
 op promote interval="0" timeout="120" \
op demote interval="0" timeout="120" \
 op monitor interval="5" role="Master" timeout="30" \
op monitor interval="10" role="Slave" timeout="30"
ms ms_mysql p_mysql \
meta master-max="1" clone-max="3" target-role="Started" is-managed="true"
notify="true" \
 meta target-role="Started"
property $id="cib-bootstrap-options" \
dc-version="1.0.9-da7075976b5ff0bee71074385f8fd02f296ec8a3" \
 cluster-infrastructure="Heartbeat" \
stonith-enabled="false" \
 last-lrm-refresh="1314307995"

Thanks!