[Pacemaker] crmd exits and restarts after failback

Dejan Muhamedagic dejanmm at fastmail.fm
Tue Dec 14 10:36:56 UTC 2010


Hi,

On Wed, Dec 08, 2010 at 12:56:40PM +0100, Simon Jansen wrote:
> Hi,
> 
> I have set up a Pacemaker cluster on Ubuntu 10.04 LTS Server.
> In addition, I wrote a multistate OCF RA for the Rsyslog service. This RA
> passes all tests run by the ocf-tester tool.
> 
> Now the problem:
> When I first start the msSyslog resource, it is promoted on node1 and is fully
> functional. After that I set node1 to standby and the other node (node2) takes
> over the master role. This behaviour is just as expected. Then I set node1
> online again to test whether failback works. At that point the error occurs:
> crmd exits and starts again. This repeats in an endless loop, and I can

Any coredumps? Take a look at /var/lib/heartbeat/cores/...
crmd shouldn't exit just like that, and the logs should say why it
left. If there are no coredumps, you need to enable them
(see ulimit).
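
For example, a minimal sketch (assuming the stack is started through
/etc/init.d/corosync and logs to /var/log/syslog, as is common on Ubuntu;
adjust the init script and log file to your setup):

    # allow core dumps for processes started from this shell, then
    # restart the stack so crmd inherits the new limit
    ulimit -c unlimited
    /etc/init.d/corosync restart

    # after the next crmd exit, check for cores
    ls -lR /var/lib/heartbeat/cores/

    # and see what crmd logged just before it left
    grep -i crmd /var/log/syslog | tail -n 50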

Thanks,

Dejan

> only get back to a functional state by rebooting both nodes several times.
> I have attached a summary of the log file so that you can see exactly what
> is happening.
> The cluster is configured as follows:
> node node1 \
>     attributes standby="off"
> node node2 \
>     attributes standby="off"
> primitive resApache ocf:heartbeat:apache \
>     params configfile="/mnt/DRBD/drbd0/apache/cnf/apache2.conf" \
>     op monitor interval="10" timeout="20" \
>     op start interval="0" timeout="40" \
>     op stop interval="0" timeout="60" \
>     meta target-role="Started"
> primitive resDHCP ocf:T-Systems:dhcp3 \
>     params config="/mnt/DRBD/drbd1/dhcp/cnf/dhcpd.conf" \
>         leases="/mnt/DRBD/drbd1/dhcp/data/dhcpd.leases" \
>     op monitor interval="10s" timeout="30s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s" \
>     meta target-role="Started"
> primitive resDRBD0 ocf:linbit:drbd \
>     params drbd_resource="drbd0" \
>     op monitor interval="60s" role="Master" timeout="120s" \
>     op monitor interval="59s" \
>     op start interval="0" timeout="240s" \
>     op stop interval="0" timeout="100s"
> primitive resDRBD1 ocf:linbit:drbd \
>     params drbd_resource="drbd1" \
>     op monitor interval="60s" role="Master" timeout="120s" \
>     op monitor interval="59s"
>     op start interval="0" timeout="240s" \
>     op stop interval="0" timeout="100s"
> primitive resFSys0 ocf:heartbeat:Filesystem \
>     params device="/dev/drbd0" fstype="ext4" directory="/mnt/DRBD/drbd0" \
>     op monitor interval="20s" timeout="40s" \
>     op start interval="0" timeout="60s" \
>     op stop interval="0" timeout="60s" \
>     meta target-role="Started"
> primitive resFSys1 ocf:heartbeat:Filesystem \
>     params device="/dev/drbd1" fstype="ext4" directory="/mnt/DRBD/drbd1" \
>     op monitor interval="20s" timeout="40s" \
>     op start interval="0" timeout="60s" \
>     op stop interval="0" timeout="60s" \
>     meta target-role="Started"
> primitive resIP0 ocf:heartbeat:IPaddr2 \
>     params ip="10.32.194.246" nic="eth0" cidr_netmask="24" iflabel="0" \
>     op monitor interval="10s" \
>     meta target-role="Started"
> primitive resIP1 ocf:heartbeat:IPaddr2 \
>     params ip="10.32.194.247" nic="eth0" cidr_netmask="24" iflabel="1" \
>     op monitor interval="10s" \
>     meta target-role="Started"
> primitive resIPVM ocf:heartbeat:IPaddr2 \
>     params ip="192.168.200.30" nic="eth2" cidr_netmask="24" \
>     op monitor interval="10s"
> primitive resMySQL ocf:heartbeat:mysql \
>     params binary="/usr/bin/mysqld_safe" pid="/var/run/mysqld/mysqld.pid" \
>         socket="/var/run/mysqld/mysqld.sock" datadir="/mnt/DRBD/drbd0/mysql/data" \
>         config="/mnt/DRBD/drbd0/mysql/cnf/my.cnf" \
>     op monitor interval="10s" timeout="30s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s" \
>     meta target-role="Started"
> primitive resNagios lsb:nagios3 \
>     op monitor interval="15" timeout="20s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s"
> primitive resSendmail lsb:sendmail \
>     op monitor interval="20s" timeout="60s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s"
> primitive resSquid ocf:heartbeat:Squid \
>     params squid_exe="/usr/sbin/squid" squid_pidfile="/var/run/squid.pid" \
>         squid_conf="/mnt/DRBD/drbd1/squid/cnf/squid.conf" squid_port="3128" \
>     op monitor interval="10s" timeout="30s" \
>     op start interval="0" timeout="60s" \
>     op stop interval="0" timeout="120s"
> primitive resSyslog ocf:T-Systems:Rsyslog \
>     params master_config="/mnt/DRBD/drbd0/Rsyslog/cnf/rsyslog_master.conf" \
>         slave_config="/etc/rsyslog.conf" \
>     op monitor interval="10s" role="Master" timeout="30s" \
>     op monitor interval="11s" timeout="33s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s"
> primitive resVMPS ocf:T-Systems:OpenVMPS \
>     params vlan_db="/mnt/DRBD/drbd1/OpenVMPS/data/vlan.db" \
>     op monitor interval="10s" timeout="30s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s"
> primitive resVPN ocf:T-Systems:OpenVPN \
>     params config="/mnt/DRBD/drbd0/OpenVPN/cnf/openvpn.conf" \
>     op monitor interval="10s" timeout="30s" \
>     op start interval="0" timeout="120s" \
>     op stop interval="0" timeout="120s"
> group groupIPVPN resIPVM resVPN
> group groupNagiosApache resApache resNagios
> ms msDRBD0 resDRBD0 \
>     meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" \
>         notify="true" globally-unique="false" target-role="Started"
> ms msDRBD1 resDRBD1 \
>     meta master-max="1" master-node-max="1" clone-max="2" clone-node-max="1" \
>         notify="true" globally-unique="false" target-role="Started"
> ms msSyslog resSyslog \
>     meta target-role="Started"
> location locDRBD0Node1 msDRBD0 \
>     rule $id="locDRBD0Node1-rule" $role="Master" 1000: #uname eq node1
> location locDRBD1Node2 msDRBD1 \
>     rule $id="locDRBD1Node2-rule" $role="Master" 1000: #uname eq node2
> location locIP0Node1 resIP0 \
>     rule $id="locIP0Node1-rule" 1000: #uname eq node1
> location locIP1Node2 resIP1 \
>     rule $id="locIP1Node2-rule" 1000: #uname eq node2
> colocation colDRBD0FSys0 inf: resFSys0 msDRBD0:Master
> colocation colDRBD1FSys1 inf: resFSys1 msDRBD1:Master
> colocation colFSys0Apache inf: groupNagiosApache resFSys0
> colocation colFSys0MySQL inf: resMySQL resFSys0
> colocation colFSys0Syslog inf: msSyslog:Master resFSys0
> colocation colFSys0VPN inf: groupIPVPN resFSys0
> colocation colFSys1DHCP inf: resDHCP resFSys1
> colocation colFSys1Sendmail inf: resSendmail resFSys1
> colocation colFSys1Squid inf: resSquid resFSys1
> colocation colFSys1VMPS inf: resVMPS resFSys1
> colocation colIP0Apache inf: groupNagiosApache resIP0
> colocation colIP0MySQL inf: resMySQL resIP0
> colocation colIP0Syslog inf: msSyslog:Master resIP0
> colocation colIP0VPN inf: groupIPVPN resIP0
> colocation colIP1Sendmail inf: resSendmail resIP1
> colocation colIP1Squid inf: resSquid resIP1
> colocation colIP1VMPS inf: resVMPS resIP1
> colocation colVPNSyslog 1000: msSyslog:Master groupIPVPN
> order orderDRBD0FSys0 inf: msDRBD0:promote resFSys0:start
> order orderDRBD1FSys1 inf: msDRBD1:promote resFSys1:start
> order orderFSys0Apache inf: resFSys0 groupNagiosApache
> order orderFSys0MySQL inf: resFSys0 resMySQL
> order orderFSys0Syslog inf: resFSys0 msSyslog:promote
> order orderFSys0VPN inf: resFSys0 groupIPVPN
> order orderFSys1DHCP inf: resFSys1 resDHCP
> order orderFSys1Sendmail inf: resFSys1 resSendmail
> order orderFSys1Squid inf: resFSys1 resSquid
> order orderFSys1VMPS inf: resFSys1 resVMPS
> order orderIP0Apache inf: resIP0 groupNagiosApache
> order orderIP0MySQL inf: resIP0 resMySQL
> order orderIP0Syslog inf: resIP0 msSyslog:promote
> order orderIP0VPN inf: resIP0 groupIPVPN
> order orderIP1Sendmail inf: resIP1 resSendmail
> order orderIP1Squid inf: resIP1 resSquid
> order orderIP1VMPS inf: resIP1 resVMPS
> order orderVPNSyslog inf: groupIPVPN msSyslog:promote
> property $id="cib-bootstrap-options" \
>     dc-version="1.0.8-042548a451fce8400660f6031f4da6f0223dd5dd" \
>     cluster-infrastructure="openais" \
>     expected-quorum-votes="2" \
>     stonith-enabled="false" \
>     no-quorum-policy="ignore" \
>     last-lrm-refresh="1291802109"
> 
> 
> Maybe someone has an idea why crmd keeps restarting after the
> failback.
> 
> Thank you for your help.
> 
> -- 
> 
> 
> Regards,
> 
> Simon Jansen
> 
> 
> ---------------------------
> Simon Jansen
> 64291 Darmstadt

