[Pacemaker] crmd exits and restarts after failback

Wed Dec 8 06:56:40 EST 2010

Hi,

I have set up a pacemaker cluster on Ubuntu 10.04 LTS Server.
Furthermore, I wrote a multi-state OCF RA for the Rsyslog service. This RA passes
all tests that are run by the ocf-tester tool.

Now the problem:
When I first start the msSyslog resource, it is promoted on node1 and is fully
functional. After that I set node1 to standby. The other node (node2) takes
the master role. This behaviour is just as expected. Then I set node1 to
online again to test if the failback works. That is where the error occurs: the crmd
exits and starts again. This happens in an endless loop, and the only way to get
back to a functional state is to reboot both nodes several times.
I attached a summary of the log file so that you can see what's happening
exactly.
The cluster is configured as follows:
node node1 \
    attributes standby="off"
node node2 \
    attributes standby="off"
primitive resApache ocf:heartbeat:apache \
    params configfile="/mnt/DRBD/drbd0/apache/cnf/apache2.conf" \
    op monitor interval="10" timeout="20" \
    op start interval="0" timeout="40" \
    op stop interval="0" timeout="60" \
    meta target-role="Started"
primitive resDHCP ocf:T-Systems:dhcp3 \
    params config="/mnt/DRBD/drbd1/dhcp/cnf/dhcpd.conf" leases="/mnt/DRBD/drbd1/dhcp/data/dhcpd.leases" \
    op monitor interval="10s" timeout="30s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s" \
    meta target-role="Started"
primitive resDRBD0 ocf:linbit:drbd \
    params drbd_resource="drbd0" \
    op monitor interval="60s" role="Master" timeout="120s" \
    op monitor interval="59s" \
    op start interval="0" timeout="240s" \
    op stop interval="0" timeout="100s"
primitive resDRBD1 ocf:linbit:drbd \
    params drbd_resource="drbd1" \
    op monitor interval="60s" role="Master" timeout="120s" \
    op monitor interval="59s" \
    op start interval="0" timeout="240s" \
    op stop interval="0" timeout="100s"
primitive resFSys0 ocf:heartbeat:Filesystem \
    params device="/dev/drbd0" fstype="ext4" directory="/mnt/DRBD/drbd0" \
    op monitor interval="20s" timeout="40s" \
    op start interval="0" timeout="60s" \
    op stop interval="0" timeout="60s" \
    meta target-role="Started"
primitive resFSys1 ocf:heartbeat:Filesystem \
    params device="/dev/drbd1" fstype="ext4" directory="/mnt/DRBD/drbd1" \
    op monitor interval="20s" timeout="40s" \
    op start interval="0" timeout="60s" \
    op stop interval="0" timeout="60s" \
    meta target-role="Started"
primitive resIP0 ocf:heartbeat:IPaddr2 \
    params ip="10.32.194.246" nic="eth0" cidr_netmask="24" iflabel="0" \
    op monitor interval="10s" \
    meta target-role="Started"
primitive resIP1 ocf:heartbeat:IPaddr2 \
    params ip="10.32.194.247" nic="eth0" cidr_netmask="24" iflabel="1" \
    op monitor interval="10s" \
    meta target-role="Started"
primitive resIPVM ocf:heartbeat:IPaddr2 \
    params ip="192.168.200.30" nic="eth2" cidr_netmask="24" \
    op monitor interval="10s"
primitive resMySQL ocf:heartbeat:mysql \
    params binary="/usr/bin/mysqld_safe" pid="/var/run/mysqld/mysqld.pid" socket="/var/run/mysqld/mysqld.sock" datadir="/mnt/DRBD/drbd0/mysql/data" config="/mnt/DRBD/drbd0/mysql/cnf/my.cnf" \
    op monitor interval="10s" timeout="30s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s" \
    meta target-role="Started"
primitive resNagios lsb:nagios3 \
    op monitor interval="15" timeout="20s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s"
primitive resSendmail lsb:sendmail \
    op monitor interval="20s" timeout="60s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s"
primitive resSquid ocf:heartbeat:Squid \
    params squid_exe="/usr/sbin/squid" squid_pidfile="/var/run/squid.pid" squid_conf="/mnt/DRBD/drbd1/squid/cnf/squid.conf" squid_port="3128" \
    op monitor interval="10s" timeout="30s" \
    op start interval="0" timeout="60s" \
    op stop interval="0" timeout="120s"
primitive resSyslog ocf:T-Systems:Rsyslog \
    params master_config="/mnt/DRBD/drbd0/Rsyslog/cnf/rsyslog_master.conf" slave_config="/etc/rsyslog.conf" \
    op monitor interval="10s" role="Master" timeout="30s" \
    op monitor interval="11s" timeout="33s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s"
primitive resVMPS ocf:T-Systems:OpenVMPS \
    params vlan_db="/mnt/DRBD/drbd1/OpenVMPS/data/vlan.db" \
    op monitor interval="10s" timeout="30s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s"
primitive resVPN ocf:T-Systems:OpenVPN \
    params config="/mnt/DRBD/drbd0/OpenVPN/cnf/openvpn.conf" \
    op monitor interval="10s" timeout="30s" \
    op start interval="0" timeout="120s" \
    op stop interval="0" timeout="120s"
group groupIPVPN resIPVM resVPN
group groupNagiosApache resApache resNagios
ms msDRBD0 resDRBD0 \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" globally-unique="false" target-role="Started"
ms msDRBD1 resDRBD1 \
    meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" notify="true" globally-unique="false" target-role="Started"
ms msSyslog resSyslog \
    meta target-role="Started"
location locDRBD0Node1 msDRBD0 \
    rule $id="locDRBD0Node1-rule" $role="Master" 1000: #uname eq node1
location locDRBD1Node2 msDRBD1 \
    rule $id="locDRBD1Node2-rule" $role="Master" 1000: #uname eq node2
location locIP0Node1 resIP0 \
    rule $id="locIP0Node1-rule" 1000: #uname eq node1
location locIP1Node2 resIP1 \
    rule $id="locIP1Node2-rule" 1000: #uname eq node2
colocation colDRBD0FSys0 inf: resFSys0 msDRBD0:Master
colocation colDRBD1FSys1 inf: resFSys1 msDRBD1:Master
colocation colFSys0Apache inf: groupNagiosApache resFSys0
colocation colFSys0MySQL inf: resMySQL resFSys0
colocation colFSys0Syslog inf: msSyslog:Master resFSys0
colocation colFSys0VPN inf: groupIPVPN resFSys0
colocation colFSys1DHCP inf: resDHCP resFSys1
colocation colFSys1Sendmail inf: resSendmail resFSys1
colocation colFSys1Squid inf: resSquid resFSys1
colocation colFSys1VMPS inf: resVMPS resFSys1
colocation colIP0Apache inf: groupNagiosApache resIP0
colocation colIP0MySQL inf: resMySQL resIP0
colocation colIP0Syslog inf: msSyslog:Master resIP0
colocation colIP0VPN inf: groupIPVPN resIP0
colocation colIP1Sendmail inf: resSendmail resIP1
colocation colIP1Squid inf: resSquid resIP1
colocation colIP1VMPS inf: resVMPS resIP1
colocation colVPNSyslog 1000: msSyslog:Master groupIPVPN
order orderDRBD0FSys0 inf: msDRBD0:promote resFSys0:start
order orderDRBD1FSys1 inf: msDRBD1:promote resFSys1:start
order orderFSys0Apache inf: resFSys0 groupNagiosApache
order orderFSys0MySQL inf: resFSys0 resMySQL
order orderFSys0Syslog inf: resFSys0 msSyslog:promote
order orderFSys0VPN inf: resFSys0 groupIPVPN
order orderFSys1DHCP inf: resFSys1 resDHCP
order orderFSys1Sendmail inf: resFSys1 resSendmail
order orderFSys1Squid inf: resFSys1 resSquid
order orderFSys1VMPS inf: resFSys1 resVMPS
order orderIP0Apache inf: resIP0 groupNagiosApache
order orderIP0MySQL inf: resIP0 resMySQL
order orderIP0Syslog inf: resIP0 msSyslog:promote
order orderIP0VPN inf: resIP0 groupIPVPN
order orderIP1Sendmail inf: resIP1 resSendmail
order orderIP1Squid inf: resIP1 resSquid
order orderIP1VMPS inf: resIP1 resVMPS
order orderVPNSyslog inf: groupIPVPN msSyslog:promote
property $id="cib-bootstrap-options" \
    dc-version="1.0.8-042548a451fce8400660f6031f4da6f0223dd5dd" \
    cluster-infrastructure="openais" \
    expected-quorum-votes="2" \
    stonith-enabled="false" \
    no-quorum-policy="ignore" \
    last-lrm-refresh="1291802109"

Maybe someone has a clue why the crmd is restarting all the time after the
failback.

Thank you for your help.

-- 

Regards,

Simon Jansen

---------------------------
Simon Jansen
64291 Darmstadt
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20101208/14287dd3/attachment-0001.html>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: crmd_restart.log
Type: application/octet-stream
Size: 22534 bytes
Desc: not available
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20101208/14287dd3/attachment-0003.obj>