[Pacemaker] A/P Corosync, PGSQL and Split Brains questions

Stephan-Frank Henry Frank.Henry at gmx.net
Wed Feb 9 08:48:52 EST 2011


Hello again,

after fixing up my VirtualIP problem, I have been running some split-brain tests, and while everything 'returns to normal', the behaviour is not quite what I had desired.

My scenario:
Active/Passive two-node cluster (serverA & serverB) with Corosync, DRBD & PGSQL.
The resources are configured as Master/Slave, and so far that part works fine.

Since bullet points speak louder than words: ;)
Test:
 1) Pull the plug on the master (serverA)
 2) Then Reattach
Expected results:
 1) serverB becomes Master
 2) serverB remains Master, serverA syncs with serverB
Actual results:
 1) serverB becomes Master
 2) serverA becomes Master, data written on serverB is lost.
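
My own first suspicion is the missing fencing: as the CIB below shows, I run with stonith_enabled=false and no_quorum_policy=ignore, so (if I understand the docs) nothing stops the rejoining serverA from grabbing the Master role back with its stale data. If that is the problem, I assume the fix would look roughly like this, though I do not know our fencing hardware details yet:

  # crm shell, Pacemaker 1.0: turn fencing back on cluster-wide
  crm configure property stonith-enabled=true
  # ...plus one STONITH primitive per node; the agent and its parameters
  # depend on the hardware, so this is only a placeholder:
  crm configure primitive st-serverA stonith:external/ipmi \
      params hostname=serverA ipaddr=<serverA-BMC> userid=<user> passwd=<pass>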

In all honesty, I am not an expert in HA, DRBD or Corosync. I know the basics, but it is not my area of expertise.
Most of my configs have been influenced... ok, blatantly copied from the net and tweaked until they worked.
But now I am at a loss.

Am I expecting something that is not possible with Corosync (which I doubt), or is my config wrong (probably)?
Either way, I am unable to find any smoking gun.

I have visited all the sites that might hold the answer, but none of them really points one out.
The only difference I could spot was that some examples did not have the split-brain handling in their drbd.conf.
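
(By 'split-brain handling' I mean the after-sb-* lines you will find in my drbd.conf below. The examples without them presumably rely on manual recovery instead, which, if I read the DRBD user guide correctly, goes roughly like this for my resource drbd0:)

  # on the node whose changes should be thrown away (the split-brain "victim"):
  drbdadm secondary drbd0
  drbdadm -- --discard-my-data connect drbd0
  # on the node whose data should survive (can be skipped if it is
  # already waiting in WFConnection):
  drbdadm connect drbd0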

Can someone point me in the right direction?

Thanks!

Frank

Here are the obligatory config file contents:

############### /etc/drbd.conf 

global {
  usage-count no;
}
common {
  syncer {
    rate 100M;
  }
  protocol C;
}
resource drbd0 {

  startup {
    wfc-timeout 20;
    degr-wfc-timeout 10;
  }
  disk {
    on-io-error detach;
  }
  net {
    cram-hmac-alg sha1;
    after-sb-0pri discard-zero-changes;
    after-sb-1pri discard-secondary;
    after-sb-2pri disconnect; 

  }
  on serverA {
    device /dev/drbd0;
    disk /dev/sda5;
    meta-disk internal;
    address 150.158.183.22:7788;
  }
  on serverB {
    device /dev/drbd0;
    disk /dev/sda5;
    meta-disk internal;
    address 150.158.183.23:7788;
  }
}
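
One difference I did notice in the Pacemaker+DRBD guides, now that I look again: they add a fencing policy and a fence-peer handler, which I do not have. If I read them correctly, the addition would look roughly like the snippet below (untested here; the scripts should ship with DRBD 8.3). Could the lack of this be my problem?

resource drbd0 {
  # ...existing sections unchanged...
  disk {
    on-io-error detach;
    fencing resource-only;
  }
  handlers {
    # helper scripts that ship with DRBD 8.3 -- untested in my setup:
    fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
    after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
  }
}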

############### /etc/ha.d/ha.cf 

udpport 694
ucast eth0 150.158.183.23

autojoin none
debug 1
logfile /var/log/ha-log
use_logd false
logfacility daemon
keepalive 2 # 2 second(s)
deadtime 10
# warntime 10
initdead 80

# list all shared ip addresses we want to ping
ping 150.158.183.30

# list all node names
node serverB serverA
crm yes
respawn root /usr/lib/heartbeat/pingd -m 100 -d 5s

############### /etc/corosync/corosync.conf

totem {
	version: 2
	token: 1000
	hold: 180
	token_retransmits_before_loss_const: 20
	join: 60
	consensus: 4800
	vsftype: none
	max_messages: 20
	clear_node_high_bit: yes
	secauth: off
	threads: 0
	rrp_mode: none
	interface {
		ringnumber: 0
		bindnetaddr: 150.158.183.0
		mcastaddr: 226.94.1.22
		mcastport: 5427
	}
}
amf {
	mode: disabled
}
service {
	ver: 0
	name: pacemaker
}
aisexec {
	user: root
	group: root
}
logging {
	fileline: off
	to_stderr: yes
	to_logfile: yes
	to_syslog: yes
	logfile: /var/log/corosync/corosync.log
	syslog_facility: daemon
	debug: off
	timestamp: on
	logger_subsys {
		subsys: AMF
		debug: off
		tags: enter|leave|trace1|trace2|trace3|trace4|trace6
	}
}

############### /var/lib/heartbeat/crm/cib.xml

<cib have_quorum="true" generated="true" ignore_dtd="false" epoch="14" num_updates="0" admin_epoch="0" validate-with="transitional-0.6" cib-last-written="Wed Feb  9 14:03:30 2011" crm_feature_set="3.0.1" have-quorum="0" dc-uuid="serverA">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <attributes>
          <nvpair id="option_1" name="symmetric_cluster" value="true"/>
          <nvpair id="option_2" name="no_quorum_policy" value="ignore"/>
          <nvpair id="option_3" name="stonith_enabled" value="false"/>
          <nvpair id="option_9" name="default-resource-stickiness" value="1000"/>
          <nvpair id="cib-bootstrap-options-dc-version" name="dc-version" value="1.0.9-74392a28b7f31d7ddc86689598bd23114f58978b"/>
          <nvpair id="cib-bootstrap-options-cluster-infrastructure" name="cluster-infrastructure" value="openais"/>
          <nvpair id="cib-bootstrap-options-expected-quorum-votes" name="expected-quorum-votes" value="2"/>
        </attributes>
      </cluster_property_set>
    </crm_config>
    <nodes>
      <node id="serverA" uname="serverA" type="normal"/>
      <node id="serverB" uname="serverB" type="normal"/>
    </nodes>
    <resources>
      <master_slave id="ms_drbd0">
        <meta_attributes id="ma-ms_drbd0">
          <attributes>
            <nvpair id="ma-ms-drbd0-1" name="clone_max" value="2"/>
            <nvpair id="ma-ms-drbd0-2" name="clone_node_max" value="1"/>
            <nvpair id="ma-ms-drbd0-3" name="master_max" value="1"/>
            <nvpair id="ma-ms-drbd0-4" name="master_node_max" value="1"/>
            <nvpair id="ma-ms-drbd0-5" name="notify" value="yes"/>
            <nvpair id="ma-ms-drbd0-6" name="globally_unique" value="false"/>
            <nvpair id="ma-ms-drbd0-7" name="target_role" value="started"/>
          </attributes>
        </meta_attributes>
        <primitive class="ocf" type="drbd" provider="heartbeat" id="drbddisk_rep">
          <instance_attributes id="drbddisk_rep_ias">
            <attributes>
              <nvpair id="drbd_primary_ia_failover_1" name="drbd_resource" value="drbd0"/>
              <nvpair id="drbd_primary_ia_failover_2" name="target_role" value="started"/>
              <nvpair id="drbd_primary_ia_failover_3" name="ignore_deprecation" value="true"/>
            </attributes>
          </instance_attributes>
          <operations>
            <op id="ms_drbd_mysql-monitor-master" name="monitor" interval="29s" timeout="10s" role="Master"/>
            <op id="ms_drbd_mysql-monitor-slave" name="monitor" interval="30s" timeout="10s" role="Slave"/>
          </operations>
        </primitive>
      </master_slave>
      <group id="rg_drbd" ordered="true">
        <meta_attributes id="ma-apache">
          <attributes>
            <nvpair id="ia-at-fs0" name="target_role" value="started"/>
          </attributes>
        </meta_attributes>
        <primitive id="ip_resource" class="ocf" type="IPaddr2" provider="heartbeat">
          <instance_attributes id="virtual-ip-attribs">
            <attributes>
              <nvpair id="virtual-ip-addr" name="ip" value="150.158.183.30"/>
              <nvpair id="virtual-ip-addr-nic" name="nic" value="eth0"/>
              <nvpair id="virtual-ip-addr-netmask" name="cidr_netmask" value="22"/>
              <nvpair id="virtual-ip-addr-iflabel" name="iflabel" value="0"/>
            </attributes>
          </instance_attributes>
          <operations>
            <op id="virtual-ip-monitor-10s" interval="10s" name="monitor"/>
          </operations>
        </primitive>
        <primitive class="ocf" provider="heartbeat" type="Filesystem" id="fs0">
          <instance_attributes id="ia-fs0">
            <attributes>
              <nvpair id="ia-fs0-1" name="fstype" value="ext3"/>
              <nvpair id="ia-fs0-2" name="directory" value="/mnt/rep"/>
              <nvpair id="ia-fs0-3" name="device" value="/dev/drbd0"/>
              <nvpair id="ia-fs0-4" name="options" value="noatime,nodiratime,barrier=0"/>
            </attributes>
          </instance_attributes>
        </primitive>
        <primitive id="pgsql" class="ocf" type="pgsql" provider="heartbeat">
          <instance_attributes id="pgsql-instance_attributes">
            <attributes>
              <nvpair id="pgsql-instance_attributes-pgdata" name="pgdata" value="/mnt/rep/pgsql/data"/>
              <nvpair id="pgsql-instance_attributes-pgctl" name="pgctl" value="/usr/lib/postgresql/8.3/bin/pg_ctl"/>
              <nvpair id="pgsql-instance_attributes-pgport" name="pgport" value="5432"/>
            </attributes>
          </instance_attributes>
          <operations>
            <op id="psql-monitor-30s" timeout="30s" interval="30s" name="monitor"/>
          </operations>
        </primitive>
      </group>
    </resources>
    <constraints>
      <rsc_location id="drbd0-placement-1" rsc="ms_drbd0">
        <rule id="drbd0-rule-1" score="-INFINITY">
          <expression id="exp-01" value="serverA" attribute="#uname" operation="ne"/>
          <expression id="exp-02" value="serverB" attribute="#uname" operation="ne"/>
        </rule>
        <rule id="drbd0-master-on-1" role="master" score="100">
          <expression id="exp-1" attribute="#uname" operation="eq" value="serverA"/>
        </rule>
      </rsc_location>
      <rsc_order id="mount_after_drbd" from="rg_drbd" action="start" to="ms_drbd0" to_action="promote"/>
      <rsc_colocation id="mount_on_drbd" to="ms_drbd0" to_role="master" from="rg_drbd" score="INFINITY"/>
    </constraints>
  </configuration>
</cib>
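
And one more thought on the constraints above: the drbd0-master-on-1 rule gives serverA a score of 100 for the Master role. Could that be what pulls the Master role back to serverA the moment it rejoins? If so, I would try dropping that rule and letting stickiness decide, i.e. reduce the location constraint to just the placement rule (same ids as in my CIB, only the preference rule removed):

<rsc_location id="drbd0-placement-1" rsc="ms_drbd0">
  <rule id="drbd0-rule-1" score="-INFINITY">
    <expression id="exp-01" value="serverA" attribute="#uname" operation="ne"/>
    <expression id="exp-02" value="serverB" attribute="#uname" operation="ne"/>
  </rule>
  <!-- drbd0-master-on-1 rule removed -->
</rsc_location>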

