[Pacemaker] cluster-delay property

Karl Rößmann K.Roessmann at fkf.mpg.de
Thu Oct 24 08:39:39 EDT 2013


Sorry, let me try to explain.

Hi

In your book you describe a parameter 'deadtime', which defines the
timeout after which a node is declared dead. I want to raise this value
to 120s to avoid such a scenario.

But in the SuSE documentation I cannot find 'deadtime'; instead I see
a property called 'cluster-delay'.
My question is: are these two parameters equivalent?
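
For what it is worth, my current understanding (please correct me if I
am wrong) is that 'cluster-delay' only covers the network round-trip
allowance for cluster actions, while node failure detection in the
corosync stack is controlled by the totem token timeout. If that is
right, I would rather raise something like this in
/etc/corosync/corosync.conf (or the openais equivalent); the value
below is only a placeholder:

totem {
        # token timeout in milliseconds; a node is considered failed
        # roughly after this long without token traffic
        token: 10000
}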

More details about the scenario:
The I/O load was created by me: I copied a large Xen image to a
logical volume of the cLVM (using 'dd'). I had done this several times
before without problems. Maybe something changed after upgrading to SLES SP3.
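
For completeness, the copy was done roughly like this (the paths are
only examples, not the real ones); next time I would probably lower the
I/O priority as well, e.g.:

        # copy with idle I/O priority and direct I/O to reduce the
        # pressure on the page cache and on the cluster
        ionice -c 3 dd if=/var/lib/xen/images/vm.img of=/dev/cluvg1/vm_lv bs=4M oflag=direct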

One node (it was the DC) died, and the Xen resources moved to the
surviving node. Fine.

No information in the log file.

On the surviving node I see:
Oct 23 09:30:41 ha2infra corosync[9085]:  [TOTEM ] A processor failed, forming new configuration.


Here is my configuration; note the new 'cluster-delay'. I have not yet
retried the 'dd', because I am afraid that one of the nodes will die again.
-----------------------------------------------------------------------------------------------------------------
node ha1infra \
         attributes standby="off"
node ha2infra \
         attributes standby="off"
primitive cluvg1 ocf:heartbeat:LVM \
         params volgrpname="cluvg1" \
         op start interval="0" timeout="240s" \
         op stop interval="0" timeout="100s"
primitive clvm ocf:lvm2:clvmd \
         params daemon_timeout="30" \
         op start interval="0" timeout="240s" \
         op stop interval="0" timeout="100s" \
         meta target-role="Started"
primitive dlm ocf:pacemaker:controld \
         op monitor interval="120s" \
         op start interval="0" timeout="240s" \
         op stop interval="0" timeout="100s"
primitive fkfantivir ocf:heartbeat:Xen \
         meta target-role="Started" is-managed="true" allow-migrate="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfantivir" shutdown_timeout="60"
primitive fkfbdc ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" \
         op monitor interval="10" timeout="30" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfbdc" shutdown_timeout="120"
primitive fkfcups ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfcups"
primitive fkfdhcp ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfdhcp"
primitive fkfdir0 ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         operations $id="fkfdir0-operations" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfdir0" shutdown_timeout="60"
primitive fkfkms ocf:heartbeat:Xen \
         meta target-role="Stopped" is-managed="true" allow-migrate="true" \
         operations $id="fkfkms-operations" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfkms" shutdown_timeout="120"
primitive fkflm ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkflm" shutdown_timeout="60"
primitive fkflmw ocf:heartbeat:Xen \
         meta target-role="Started" is-managed="true" allow-migrate="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkflmw" shutdown_timeout="120"
primitive fkfnfs ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/fkfnfs" shutdown_timeout="120"
primitive horde_test ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         operations $id="horde_test-operations" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/horde_test" shutdown_timeout="60"
primitive orion ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/orion"
primitive printmon ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" is-managed="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="1800" \
         op migrate_to interval="0" timeout="1800" \
         params xmfile="/etc/xen/vm/printmon"
primitive squid ocf:heartbeat:Xen \
         meta target-role="Started" allow-migrate="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/squid" shutdown_timeout="120"
primitive stonith_1 stonith:external/ipmi \
         meta target-role="Started" \
         op monitor interval="3600" timeout="20" \
         params ipaddr="192.168.128.149" userid="USERID"  
passwd="PASSW0RD" ipmitool="/usr/bin/ipmitool" hostname="ha2infra"
primitive stonith_2 stonith:external/ipmi \
         meta target-role="Started" \
         op monitor interval="3600" timeout="20" \
         params ipaddr="192.168.128.148" userid="USERID"  
passwd="PASSW0RD" ipmitool="/usr/bin/ipmitool" hostname="ha1infra"
primitive vmdv03 ocf:heartbeat:Xen \
         meta target-role="Stopped" allow-migrate="true" is-managed="true" \
         op monitor interval="10" timeout="30" \
         op migrate_from interval="0" timeout="600" \
         op migrate_to interval="0" timeout="600" \
         params xmfile="/etc/xen/vm/vmdv03" shutdown_timeout="120"
clone cluvg1_clone cluvg1 \
         meta interleave="true" target-role="Started" is-managed="true"
clone clvm_clone clvm \
         meta globally-unique="false" interleave="true" target-role="started"
clone dlm_clone dlm \
         meta globally-unique="false" interleave="true" \
         target-role="started" is-managed="true"
location cli-prefer-fkfnfs fkfnfs \
         rule $id="cli-prefer-rule-fkfnfs" inf: #uname eq ha1infra
location cli-prefer-horde_test horde_test \
         rule $id="cli-prefer-rule-horde_test" inf: #uname eq ha2infra
location cli-prefer-printmon printmon \
         rule $id="cli-prefer-rule-printmon" inf: #uname eq ha2infra
location stonith_1_noton_ha2infra stonith_1 -inf: ha2infra
location stonith_2_noton_ha1infra stonith_2 -inf: ha1infra
colocation cluvg1_with_clvm inf: cluvg1_clone clvm_clone
colocation clvm_with_dlm inf: clvm_clone dlm_clone
colocation fkfantivir_with_cluvg1 inf: fkfantivir cluvg1_clone
colocation fkfbdc_with_cluvg1 inf: fkfbdc cluvg1_clone
colocation fkfcups_with_cluvg1 inf: fkfcups cluvg1_clone
colocation fkfdhcp_with_cluvg1 inf: fkfdhcp cluvg1_clone
colocation fkfdir0_with_cluvg1 inf: fkfdir0 cluvg1_clone
colocation fkfkms_with_cluvg1 inf: fkfkms cluvg1_clone
colocation fkflm_with_cluvg1 inf: fkflm cluvg1_clone
colocation fkflmw_with_cluvg1 inf: fkflmw cluvg1_clone
colocation fkfnfs_with_cluvg1 inf: fkfnfs cluvg1_clone
colocation horde_test_with_cluvg1 inf: horde_test cluvg1_clone
colocation orion_with_cluvg1 inf: orion cluvg1_clone
colocation printmon_with_cluvg1 inf: printmon cluvg1_clone
colocation squid_with_cluvg1 inf: squid cluvg1_clone
colocation vmdv03_with_cluvg1 inf: vmdv03 cluvg1_clone
order cluvg1_before_fkfantivir inf: cluvg1_clone fkfantivir
order cluvg1_before_fkfbdc inf: cluvg1_clone fkfbdc
order cluvg1_before_fkfcups inf: cluvg1_clone fkfcups
order cluvg1_before_fkfdhcp inf: cluvg1_clone fkfdhcp
order cluvg1_before_fkfdir0 inf: cluvg1_clone fkfdir0
order cluvg1_before_fkfkms inf: cluvg1_clone fkfkms
order cluvg1_before_fkflm inf: cluvg1_clone fkflm
order cluvg1_before_fkflmw inf: cluvg1_clone fkflmw
order cluvg1_before_fkfnfs inf: cluvg1_clone fkfnfs
order cluvg1_before_horde_test inf: cluvg1_clone horde_test
order cluvg1_before_orion inf: cluvg1_clone orion
order cluvg1_before_printmon inf: cluvg1_clone printmon
order cluvg1_before_squid inf: cluvg1_clone squid
order cluvg1_before_vmdv03 inf: cluvg1_clone vmdv03
order clvm_before_cluvg1 inf: clvm_clone cluvg1_clone
order dlm_before_clvm inf: dlm_clone clvm_clone
property $id="cib-bootstrap-options" \
         dc-version="1.1.9-2db99f1" \
         cluster-infrastructure="classic openais (with plugin)" \
         expected-quorum-votes="2" \
         no-quorum-policy="ignore" \
         last-lrm-refresh="1375099464" \
         default-action-timeout="60s" \
         cluster-delay="120s"          <---------------- this was 60s  
before ---------------------
rsc_defaults $id="rsc_defaults-options" \
         resource-stickiness="10" \
         migration-threshold="3"
--------------------------------------------------------------------------------------------
on ha1infra
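
In case it matters: I changed the property with the crm shell, roughly
like this (syntax from memory, it may differ slightly on SLES 11 SP3):

        crm configure property cluster-delay="120s"
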
Quoting Michael Schwartzkopff <ms at sys4.de>:

> On Thursday, 24 October 2013, 11:07:10, Karl Rößmann wrote:
>> Hi,
>>
>> we have a two node HA cluster using SuSE SlES 11 HA Extension SP3
>>
>> For some reason there was heavy I/O load on both nodes yesterday.
>> and one of the nodes went down. (Which was a serious problem)
>
> Then you should think about your system design. A cluster should be designed
> to provide the service even if one server fails.
>
>> Maybe we have to change a timeout value ?
>
> That depends on why your node went down. Please check the logs for the reason
> the node went down. Once you have found the reason, you can optimize your
> cluster configuration.
>
> --
> With kind regards,
>
> Michael Schwartzkopff
>
> --
> [*] sys4 AG
>
> http://sys4.de, +49 (89) 30 90 46 64, +49 (162) 165 0044
> Franziskanerstraße 15, 81669 München
>
> Registered office: München, Amtsgericht München: HRB 199263
> Management board: Patrick Ben Koetter, Axel von der Ohe, Marc Schiffbauer
> Chairman of the supervisory board: Florian Kirstein


-- 
Karl Rößmann				Tel. +49-711-689-1657
Max-Planck-Institut FKF       		Fax. +49-711-689-1632
Postfach 800 665
70506 Stuttgart				email K.Roessmann at fkf.mpg.de





