[Pacemaker] Corosync won't recover when a node fails

Thu Oct 3 21:03:44 EDT 2013

Sure.  Here's the full config:

<cib epoch="28" num_updates="34" admin_epoch="0"
validate-with="pacemaker-1.2" cib-last-written="Thu Oct  3 16:26:39 2013"
crm_feature_set="3.0.6" update-origin="test-vm-2" update-client="cibadmin"
have-quorum="1" dc-uuid="test-vm-1">
  <configuration>
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-dc-version" name="dc-version"
value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
        <nvpair id="cib-bootstrap-options-cluster-infrastructure"
name="cluster-infrastructure" value="openais"/>
        <nvpair id="cib-bootstrap-options-expected-quorum-votes"
name="expected-quorum-votes" value="2"/>
        <nvpair id="cib-bootstrap-options-stonith-enabled"
name="stonith-enabled" value="false"/>
        <nvpair id="cib-bootstrap-options-no-quorum-policy"
name="no-quorum-policy" value="ignore"/>
      </cluster_property_set>
    </crm_config>
    <nodes>
      <node id="test-vm-1" type="normal" uname="test-vm-1"/>
      <node id="test-vm-2" type="normal" uname="test-vm-2"/>
    </nodes>
    <resources>
      <group id="nfs_resources">
        <meta_attributes id="nfs_resources-meta_attributes">
          <nvpair id="nfs_resources-meta_attributes-target-role"
name="target-role" value="Started"/>
        </meta_attributes>
        <primitive class="ocf" id="nfs_fs" provider="heartbeat"
type="Filesystem">
          <instance_attributes id="nfs_fs-instance_attributes">
            <nvpair id="nfs_fs-instance_attributes-device" name="device"
value="/dev/drbd1"/>
            <nvpair id="nfs_fs-instance_attributes-directory"
name="directory" value="/export/data/"/>
            <nvpair id="nfs_fs-instance_attributes-fstype" name="fstype"
value="ext3"/>
            <nvpair id="nfs_fs-instance_attributes-options" name="options"
value="noatime,nodiratime"/>
          </instance_attributes>
          <operations>
            <op id="nfs_fs-start-0" interval="0" name="start" timeout="60"/>
            <op id="nfs_fs-stop-0" interval="0" name="stop" timeout="120"/>
          </operations>
        </primitive>
        <primitive class="ocf" id="nfs_ip" provider="heartbeat"
type="IPaddr2">
          <instance_attributes id="nfs_ip-instance_attributes">
            <nvpair id="nfs_ip-instance_attributes-ip" name="ip"
value="192.168.25.205"/>
            <nvpair id="nfs_ip-instance_attributes-cidr_netmask"
name="cidr_netmask" value="32"/>
          </instance_attributes>
          <operations>
            <op id="nfs_ip-monitor-10s" interval="10s" name="monitor"/>
          </operations>
          <meta_attributes id="nfs_ip-meta_attributes">
            <nvpair id="nfs_ip-meta_attributes-is-managed"
name="is-managed" value="true"/>
          </meta_attributes>
        </primitive>
        <primitive class="lsb" id="nfs" type="nfs-kernel-server">
          <operations>
            <op id="nfs-monitor-5s" interval="5s" name="monitor"/>
            <op id="nfs-start-0" interval="0" name="start" timeout="120"/>
            <op id="nfs-stop-0" interval="0" name="stop" timeout="120"/>
          </operations>
        </primitive>
      </group>
      <master id="ms-drbd_r0">
        <meta_attributes id="ms-drbd_r0-meta_attributes">
          <nvpair id="ms-drbd_r0-meta_attributes-clone-max"
name="clone-max" value="2"/>
          <nvpair id="ms-drbd_r0-meta_attributes-notify" name="notify"
value="true"/>
          <nvpair id="ms-drbd_r0-meta_attributes-globally-unique"
name="globally-unique" value="false"/>
          <nvpair id="ms-drbd_r0-meta_attributes-target-role"
name="target-role" value="Master"/>
        </meta_attributes>
        <primitive class="ocf" id="drbd_r0" provider="heartbeat"
type="drbd">
          <instance_attributes id="drbd_r0-instance_attributes">
            <nvpair id="drbd_r0-instance_attributes-drbd_resource"
name="drbd_resource" value="r0"/>
          </instance_attributes>
          <operations>
            <op id="drbd_r0-monitor-59s" interval="59s" name="monitor"
role="Master" timeout="30s"/>
            <op id="drbd_r0-monitor-60s" interval="60s" name="monitor"
role="Slave" timeout="30s"/>
          </operations>
        </primitive>
      </master>
    </resources>
    <constraints>
      <rsc_colocation id="drbd-nfs-ha" rsc="ms-drbd_r0" rsc-role="Master"
score="INFINITY" with-rsc="nfs_resources"/>
      <rsc_order id="drbd-before-nfs" first="ms-drbd_r0"
first-action="promote" score="INFINITY" then="nfs_resources"
then-action="start"/>
    </constraints>
    <rsc_defaults>
      <meta_attributes id="rsc-options">
        <nvpair id="rsc-options-resource-stickiness"
name="resource-stickiness" value="100"/>
      </meta_attributes>
    </rsc_defaults>
  </configuration>
  <status>
    <node_state id="test-vm-1" uname="test-vm-1" ha="active" in_ccm="true"
crmd="online" join="member" expected="member"
crm-debug-origin="do_state_transition" shutdown="0">
      <transient_attributes id="test-vm-1">
        <instance_attributes id="status-test-vm-1">
          <nvpair id="status-test-vm-1-fail-count-drbd_r0.1"
name="fail-count-drbd_r0:1" value="1"/>
          <nvpair id="status-test-vm-1-last-failure-drbd_r0.1"
name="last-failure-drbd_r0:1" value="1380831442"/>
          <nvpair id="status-test-vm-1-master-drbd_r0.0"
name="master-drbd_r0:0" value="100"/>
          <nvpair id="status-test-vm-1-probe_complete"
name="probe_complete" value="true"/>
        </instance_attributes>
      </transient_attributes>
      <lrm id="test-vm-1">
        <lrm_resources>
          <lrm_resource id="drbd_r0:0" type="drbd" class="ocf"
provider="heartbeat">
            <lrm_rsc_op id="drbd_r0:0_last_failure_0"
operation_key="drbd_r0:0_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:8;7:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="32" rc-code="8" op-status="0" interval="0"
op-digest="c0e018b73fdf522b6cdd355e125af15e"/>
            <lrm_rsc_op id="drbd_r0:0_monitor_59000"
operation_key="drbd_r0:0_monitor_59000" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:8;20:5:8:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="35" rc-code="8" op-status="0" interval="59000"
op-digest="6f5adcd7f1211cdfc17850827b8582c5"/>
          </lrm_resource>
          <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb">
            <lrm_rsc_op id="nfs_last_0" operation_key="nfs_start_0"
operation="start" crm-debug-origin="build_active_RAs"
crm_feature_set="3.0.6"
transition-key="14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;14:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="39" rc-code="0" op-status="0" interval="0"
op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
            <lrm_rsc_op id="nfs_last_failure_0"
operation_key="nfs_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;6:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="31" rc-code="0" op-status="0" interval="0"
op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
            <lrm_rsc_op id="nfs_monitor_5000"
operation_key="nfs_monitor_5000" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;2:8:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="40" rc-code="0" op-status="0" interval="5000"
op-digest="4811cef7f7f94e3a35a70be7916cb2fd"/>
          </lrm_resource>
          <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf"
provider="heartbeat">
            <lrm_rsc_op id="nfs_ip_last_failure_0"
operation_key="nfs_ip_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;5:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="30" rc-code="0" op-status="0" interval="0"
op-digest="570cd25774b1ead32cb1840813adbe21"/>
            <lrm_rsc_op id="nfs_ip_monitor_10000"
operation_key="nfs_ip_monitor_10000" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;8:5:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="33" rc-code="0" op-status="0" interval="10000"
op-digest="bc929bfa78c3086ebd199cf0110b87bf"/>
          </lrm_resource>
          <lrm_resource id="nfs_fs" type="Filesystem" class="ocf"
provider="heartbeat">
            <lrm_rsc_op id="nfs_fs_last_failure_0"
operation_key="nfs_fs_monitor_0" operation="monitor"
crm-debug-origin="build_active_RAs" crm_feature_set="3.0.6"
transition-key="4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;4:4:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="29" rc-code="0" op-status="0" interval="0"
op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/>
          </lrm_resource>
        </lrm_resources>
      </lrm>
    </node_state>
    <node_state id="test-vm-2" uname="test-vm-2" ha="active" in_ccm="true"
crmd="online" join="member" crm-debug-origin="do_update_resource"
expected="member" shutdown="0">
      <lrm id="test-vm-2">
        <lrm_resources>
          <lrm_resource id="nfs" type="nfs-kernel-server" class="lsb">
            <lrm_rsc_op id="nfs_last_0" operation_key="nfs_monitor_0"
operation="monitor" crm-debug-origin="do_update_resource"
crm_feature_set="3.0.6"
transition-key="10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:7;10:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="4" rc-code="7" op-status="0" interval="0" last-run="1380832563"
last-rc-change="1380832563" exec-time="210" queue-time="0"
op-digest="f2317cad3d54cec5d7d7aa7d0bf35cf8"/>
          </lrm_resource>
          <lrm_resource id="nfs_ip" type="IPaddr2" class="ocf"
provider="heartbeat">
            <lrm_rsc_op id="nfs_ip_last_0" operation_key="nfs_ip_monitor_0"
operation="monitor" crm-debug-origin="do_update_resource"
crm_feature_set="3.0.6"
transition-key="9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:7;9:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="3" rc-code="7" op-status="0" interval="0" last-run="1380832563"
last-rc-change="1380832563" exec-time="490" queue-time="0"
op-digest="570cd25774b1ead32cb1840813adbe21"/>
          </lrm_resource>
          <lrm_resource id="nfs_fs" type="Filesystem" class="ocf"
provider="heartbeat">
            <lrm_rsc_op id="nfs_fs_last_0" operation_key="nfs_fs_monitor_0"
operation="monitor" crm-debug-origin="do_update_resource"
crm_feature_set="3.0.6"
transition-key="8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:7;8:14:7:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="2" rc-code="7" op-status="0" interval="0" last-run="1380832563"
last-rc-change="1380832563" exec-time="690" queue-time="0"
op-digest="c0a40c0015f71e8b20b5359e12f25eb5"/>
          </lrm_resource>
          <lrm_resource id="drbd_r0:1" type="drbd" class="ocf"
provider="heartbeat">
            <lrm_rsc_op id="drbd_r0:1_last_0"
operation_key="drbd_r0:1_start_0" operation="start"
crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
transition-key="26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;26:14:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="6" rc-code="0" op-status="0" interval="0" last-run="1380832564"
last-rc-change="1380832564" exec-time="840" queue-time="0"
op-digest="c0e018b73fdf522b6cdd355e125af15e"/>
            <lrm_rsc_op id="drbd_r0:1_monitor_60000"
operation_key="drbd_r0:1_monitor_60000" operation="monitor"
crm-debug-origin="do_update_resource" crm_feature_set="3.0.6"
transition-key="25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
transition-magic="0:0;25:15:0:1b4a3ae4-b013-45d1-a865-9b3b3deecf5f"
call-id="8" rc-code="0" op-status="0" interval="60000"
last-rc-change="1380832565" exec-time="310" queue-time="10"
op-digest="6f5adcd7f1211cdfc17850827b8582c5"/>
          </lrm_resource>
        </lrm_resources>
      </lrm>
      <transient_attributes id="test-vm-2">
        <instance_attributes id="status-test-vm-2">
          <nvpair id="status-test-vm-2-probe_complete"
name="probe_complete" value="true"/>
          <nvpair id="status-test-vm-2-master-drbd_r0.1"
name="master-drbd_r0:1" value="75"/>
        </instance_attributes>
      </transient_attributes>
    </node_state>
  </status>
</cib>

On Thu, Oct 3, 2013 at 5:06 PM, Andreas Kurz <andreas at hastexo.com> wrote:

> On 2013-10-03 22:12, David Parker wrote:
> > Thanks, Andrew.  The goal was to use either Pacemaker and Corosync 1.x
> > from the Debian packages, or use both compiled from source.  So, with
> > the compiled version, I was hoping to avoid CMAN.  However, it seems the
> > packaged version of Pacemaker doesn't support CMAN anyway, so it's moot.
> >
> > I rebuilt my VMs from scratch, re-installed Pacemaker and Corosync from
> > the Debian packages, but I'm still having an odd problem.  Here is the
> > config portion of my CIB:
> >
> >     <crm_config>
> >       <cluster_property_set id="cib-bootstrap-options">
> >         <nvpair id="cib-bootstrap-options-dc-version" name="dc-version"
> > value="1.1.7-ee0730e13d124c3d58f00016c3376a1de5323cff"/>
> >         <nvpair id="cib-bootstrap-options-cluster-infrastructure"
> > name="cluster-infrastructure" value="openais"/>
> >         <nvpair id="cib-bootstrap-options-expected-quorum-votes"
> > name="expected-quorum-votes" value="2"/>
> >         <nvpair id="cib-bootstrap-options-stonith-enabled"
> > name="stonith-enabled" value="false"/>
> >         <nvpair id="cib-bootstrap-options-no-quorum-policy"
> > name="no-quorum-policy" value="ignore"/>
> >       </cluster_property_set>
> >     </crm_config>
> >
> > I set no-quorum-policy=ignore based on the documentation example for a
> > 2-node cluster.  But when Pacemaker starts up on the first node, the
> > DRBD resource is in slave mode and none of the other resources are
> > started (they depend on DRBD being master), and I see these lines in the
> > log:
> >
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: unpack_config: On
> > loss of CCM Quorum: Ignore
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs_fs   (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs_ip   (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > nfs      (test-vm-1 - blocked)
> > Oct 03 15:29:18 test-vm-1 pengine: [3742]: notice: LogActions: Start
> > drbd_r0:0        (test-vm-1)
> >
> > I'm assuming the NFS resources show "blocked" because the resource they
> > depend on is not in the correct state.
> >
> > Even when the second node (test-vm-2) comes online, the state of these
> > resources does not change.  I can shutdown and re-start Pacemaker over
> > and over again on test-vm-2, but nothing changes.  However... and this
> > is where it gets weird... if I shut down Pacemaker on test-vm-1, then
> > all of the resources immediately fail over to test-vm-2 and start
> > correctly.  And I see these lines in the log:
> >
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: unpack_config: On
> > loss of CCM Quorum: Ignore
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: stage6: Scheduling
> > Node test-vm-1 for shutdown
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs_fs   (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs_ip   (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Start
> > nfs      (test-vm-2)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Stop
> >  drbd_r0:0        (test-vm-1)
> > Oct 03 15:44:26 test-vm-1 pengine: [5305]: notice: LogActions: Promote
> > drbd_r0:1        (Slave -> Master test-vm-2)
> >
> > After that, I can generally move the resources back and forth, and even
> > fail them over by hard-failing a node, without any problems.  The real
> > problem is that this isn't consistent, though.  Every once in a while,
> > I'll hard-fail a node and the other one will go into this "stuck" state
> > where Pacemaker knows it lost a node, but DRBD will stay in slave mode
> > and the other resources will never start.  It seems to happen quite
> > randomly.  Then, even if I restart Pacemaker on both nodes, or reboot
> > them altogether, I run into the startup issue mentioned previously.
> >
> > Any ideas?
>
> Yes, share your complete resource configuration ;-)
>
> Regards,
> Andreas
>
> >
> >     Thanks,
> >     Dave
> >
> >
> >
> > On Wed, Oct 2, 2013 at 1:01 AM, Andrew Beekhof <andrew at beekhof.net
> > <mailto:andrew at beekhof.net>> wrote:
> >
> >
> >     On 02/10/2013, at 5:24 AM, David Parker <dparker at utica.edu
> >     <mailto:dparker at utica.edu>> wrote:
> >
> >     > Thanks, I did a little Googling and found the git repository for
> pcs.
> >
> >     pcs won't help you rebuild pacemaker with cman support (or corosync
> >     2.x support) turned on though.
> >
> >
> >     >  Is there any way to make a two-node cluster work with the stock
> >     Debian packages, though?  It seems odd that this would be impossible.
> >
> >     it really depends how the debian maintainers built pacemaker.
> >     by the sounds of it, it only supports the pacemaker plugin mode for
> >     corosync 1.x
> >
> >     >
> >     >
> >     > On Tue, Oct 1, 2013 at 3:16 PM, Larry Brigman
> >     <larry.brigman at gmail.com <mailto:larry.brigman at gmail.com>> wrote:
> >     > pcs is another package you will need to install.
> >     >
> >     > On Oct 1, 2013 9:04 AM, "David Parker" <dparker at utica.edu
> >     <mailto:dparker at utica.edu>> wrote:
> >     > Hello,
> >     >
> >     > Sorry for the delay in my reply.  I've been doing a lot of
> >     experimentation, but so far I've had no luck.
> >     >
> >     > Thanks for the suggestion, but it seems I'm not able to use CMAN.
> >      I'm running Debian Wheezy with Corosync and Pacemaker installed via
> >     apt-get.  When I installed CMAN and set up a cluster.conf file,
> >     Pacemaker refused to start and said that CMAN was not supported.
> >      When CMAN is not installed, Pacemaker starts up fine, but I see
> >     these lines in the log:
> >     >
> >     > Sep 30 23:36:29 test-vm-1 crmd: [6941]: ERROR:
> >     init_quorum_connection: The Corosync quorum API is not supported in
> >     this build
> >     > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: ERROR:
> >     pcmk_child_exit: Child process crmd exited (pid=6941, rc=100)
> >     > Sep 30 23:36:29 test-vm-1 pacemakerd: [6932]: WARN:
> >     pcmk_child_exit: Pacemaker child process crmd no longer wishes to be
> >     respawned. Shutting ourselves down.
> >     >
> >     > So, then I checked to see which plugins are supported:
> >     >
> >     > # pacemakerd -F
> >     > Pacemaker 1.1.7 (Build: ee0730e13d124c3d58f00016c3376a1de5323cff)
> >     >  Supporting:  generated-manpages agent-manpages ncurses  heartbeat
> >     corosync-plugin snmp libesmtp
> >     >
> >     > Am I correct in believing that this Pacemaker package has been
> >     compiled without support for any quorum API?  If so, does anyone
> >     know if there is a Debian package which has the correct support?
> >     >
> >     > I also tried compiling LibQB, Corosync and Pacemaker from source
> >     via git, following the instructions documented here:
> >     >
> >     > http://clusterlabs.org/wiki/SourceInstall
> >     >
> >     > I was hopeful that this would work, because as I understand it,
> >     Corosync 2.x no longer uses CMAN.  Everything compiled and started
> >     fine, but the compiled version of Pacemaker did not include either
> >     the 'crm' or 'pcs' commands.  Do I need to install something else in
> >     order to get one of these?
> >     >
> >     > Any and all help is greatly appreciated!
> >     >
> >     >     Thanks,
> >     >     Dave
> >     >
> >     >
> >     > On Wed, Sep 25, 2013 at 6:08 AM, David Lang <david at lang.hm
> >     <mailto:david at lang.hm>> wrote:
> >     > the cluster is trying to reach a quorum (the majority of the nodes
> >     talking to each other) and that is never going to happen with only
> >     one node. so you have to disable this.
> >     >
> >     > try putting
> >     > <cman two_node="1" expected_votes="1" transport="udpu"/>
> >     > in your cluster.conf
> >     >
> >     > David Lang
> >     >
> >     >  On Tue, 24 Sep 2013, David Parker wrote:
> >     >
> >     > Date: Tue, 24 Sep 2013 11:48:59 -0400
> >     > From: David Parker <dparker at utica.edu <mailto:dparker at utica.edu>>
> >     > Reply-To: The Pacemaker cluster resource manager
> >     >     <pacemaker at oss.clusterlabs.org
> >     <mailto:pacemaker at oss.clusterlabs.org>>
> >     > To: The Pacemaker cluster resource manager
> >     <pacemaker at oss.clusterlabs.org <mailto:pacemaker at oss.clusterlabs.org
> >>
> >     > Subject: Re: [Pacemaker] Corosync won't recover when a node fails
> >     >
> >     >
> >     > I forgot to mention, OS is Debian Wheezy 64-bit, Corosync and
> >     Pacemaker
> >     > installed from packages via apt-get, and there are no local
> >     firewall rules
> >     > in place:
> >     >
> >     > # iptables -L
> >     > Chain INPUT (policy ACCEPT)
> >     > target     prot opt source               destination
> >     >
> >     > Chain FORWARD (policy ACCEPT)
> >     > target     prot opt source               destination
> >     >
> >     > Chain OUTPUT (policy ACCEPT)
> >     > target     prot opt source               destination
> >     >
> >     >
> >     > On Tue, Sep 24, 2013 at 11:41 AM, David Parker <dparker at utica.edu
> >     <mailto:dparker at utica.edu>> wrote:
> >     >
> >     > Hello,
> >     >
> >     > I have a 2-node cluster using Corosync and Pacemaker, where the
> >     nodes are
> >     > actually two VirtualBox VMs on the same physical machine.  I have
> some
> >     > resources set up in Pacemaker, and everything works fine if I move
> >     them in
> >     > a controlled way with the "crm_resource -r <resource> --move
> >     --node <node>"
> >     > command.
> >     >
> >     > However, when I hard-fail one of the nodes via the "poweroff"
> >     command in
> >     > VirtualBox, which "pulls the plug" on the VM, the resources do
> >     not move,
> >     > and I see the following output in the log on the remaining node:
> >     >
> >     > Sep 24 11:20:30 corosync [TOTEM ] The token was lost in the
> >     OPERATIONAL
> >     > state.
> >     > Sep 24 11:20:30 corosync [TOTEM ] A processor failed, forming new
> >     > configuration.
> >     > Sep 24 11:20:30 corosync [TOTEM ] entering GATHER state from 2.
> >     > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: debug: rsc:drbd_r0:0
> >     monitor[31]
> >     > (pid 8495)
> >     > drbd[8495]:     2013/09/24_11:20:31 WARNING: This resource agent is
> >     > deprecated and may be removed in a future release. See the man
> >     page for
> >     > details. To suppress this warning, set the "ignore_deprecation"
> >     resource
> >     > parameter to true.
> >     > drbd[8495]:     2013/09/24_11:20:31 WARNING: This resource agent is
> >     > deprecated and may be removed in a future release. See the man
> >     page for
> >     > details. To suppress this warning, set the "ignore_deprecation"
> >     resource
> >     > parameter to true.
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0: Calling drbdadm -c
> >     > /etc/drbd.conf role r0
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0: Exit code 0
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0: Command output:
> >     > Secondary/Primary
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0: Calling drbdadm -c
> >     > /etc/drbd.conf cstate r0
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0: Exit code 0
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0: Command output:
> >     Connected
> >     > drbd[8495]:     2013/09/24_11:20:31 DEBUG: r0 status:
> >     Secondary/Primary
> >     > Secondary Primary Connected
> >     > Sep 24 11:20:31 test-vm-2 lrmd: [2503]: info: operation
> monitor[31] on
> >     > drbd_r0:0 for client 2506: pid 8495 exited with return code 0
> >     > Sep 24 11:20:32 corosync [TOTEM ] entering GATHER state from 0.
> >     > Sep 24 11:20:34 corosync [TOTEM ] The consensus timeout expired.
> >     > Sep 24 11:20:34 corosync [TOTEM ] entering GATHER state from 3.
> >     > Sep 24 11:20:36 corosync [TOTEM ] The consensus timeout expired.
> >     > Sep 24 11:20:36 corosync [TOTEM ] entering GATHER state from 3.
> >     > Sep 24 11:20:38 corosync [TOTEM ] The consensus timeout expired.
> >     > Sep 24 11:20:38 corosync [TOTEM ] entering GATHER state from 3.
> >     > Sep 24 11:20:40 corosync [TOTEM ] The consensus timeout expired.
> >     > Sep 24 11:20:40 corosync [TOTEM ] entering GATHER state from 3.
> >     > Sep 24 11:20:40 corosync [TOTEM ] Totem is unable to form a cluster
> >     > because of an operating system or network fault. The most common
> >     cause of
> >     > this message is that the local firewall is configured improperly.
> >     > Sep 24 11:20:43 corosync [TOTEM ] The consensus timeout expired.
> >     > Sep 24 11:20:43 corosync [TOTEM ] entering GATHER state from 3.
> >     > Sep 24 11:20:43 corosync [TOTEM ] Totem is unable to form a cluster
> >     > because of an operating system or network fault. The most common
> >     cause of
> >     > this message is that the local firewall is configured improperly.
> >     > Sep 24 11:20:45 corosync [TOTEM ] The consensus timeout expired.
> >     > Sep 24 11:20:45 corosync [TOTEM ] entering GATHER state from 3.
> >     > Sep 24 11:20:45 corosync [TOTEM ] Totem is unable to form a cluster
> >     > because of an operating system or network fault. The most common
> >     cause of
> >     > this message is that the local firewall is configured improperly.
> >     > Sep 24 11:20:47 corosync [TOTEM ] The consensus timeout expired.
> >     >
> >     > Those last 3 messages just repeat over and over, the cluster never
> >     > recovers, and the resources never move.  "crm_mon" reports that the
> >     > resources are still running on the dead node, and shows no
> >     indication that
> >     > anything has gone wrong.
> >     >
> >     > Does anyone know what the issue could be?  My expectation was that
> the
> >     > remaining node would become the sole member of the cluster, take
> >     over the
> >     > resources, and everything would keep running.
> >     >
> >     > For reference, my corosync.conf file is below:
> >     >
> >     > compatibility: whitetank
> >     >
> >     > totem {
> >     >         version: 2
> >     >         secauth: off
> >     >         interface {
> >     >                 member {
> >     >                         memberaddr: 192.168.25.201
> >     >                 }
> >     >                 member {
> >     >                         memberaddr: 192.168.25.202
> >     >                  }
> >     >                 ringnumber: 0
> >     >                 bindnetaddr: 192.168.25.0
> >     >                 mcastport: 5405
> >     >         }
> >     >         transport: udpu
> >     > }
> >     >
> >     > logging {
> >     >         fileline: off
> >     >         to_logfile: yes
> >     >         to_syslog: yes
> >     >         debug: on
> >     >         logfile: /var/log/cluster/corosync.log
> >     >         timestamp: on
> >     >         logger_subsys {
> >     >                 subsys: AMF
> >     >                 debug: on
> >     >         }
> >     > }
> >     >
> >     >
> >     > Thanks!
> >     > Dave
> >     >
> >     > --
> >     > Dave Parker
> >     > Systems Administrator
> >     > Utica College
> >     > Integrated Information Technology Services
> >     > (315) 792-3229
> >     > Registered Linux User #408177
> >     >
> >     >
> >     >
> >     >
> >     >
> >     > _______________________________________________
> >     >
> >     > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> >     <mailto:Pacemaker at oss.clusterlabs.org>
> >     >
> >     > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >     >
> >     >
> >     >
> >     > Project Home: http://www.clusterlabs.org
> >     >
> >     > Getting started:
> >     http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> >     >
> >     > Bugs: http://bugs.clusterlabs.org
> >     >
> >     >
> >     > _______________________________________________
> >     > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> >     <mailto:Pacemaker at oss.clusterlabs.org>
> >     > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >     >
> >     > Project Home: http://www.clusterlabs.org
> >     > Getting started:
> >     http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> >     > Bugs: http://bugs.clusterlabs.org
> >     >
> >     >
> >     >
> >     >
> >     > --
> >     > Dave Parker
> >     > Systems Administrator
> >     > Utica College
> >     > Integrated Information Technology Services
> >     > (315) 792-3229 <tel:%28315%29%20792-3229>
> >     > Registered Linux User #408177
> >     >
> >     > _______________________________________________
> >     > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> >     <mailto:Pacemaker at oss.clusterlabs.org>
> >     > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >     >
> >     > Project Home: http://www.clusterlabs.org
> >     > Getting started:
> >     http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> >     > Bugs: http://bugs.clusterlabs.org
> >     >
> >     >
> >     > _______________________________________________
> >     > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> >     <mailto:Pacemaker at oss.clusterlabs.org>
> >     > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >     >
> >     > Project Home: http://www.clusterlabs.org
> >     > Getting started:
> >     http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> >     > Bugs: http://bugs.clusterlabs.org
> >     >
> >     >
> >     >
> >     >
> >     > --
> >     > Dave Parker
> >     > Systems Administrator
> >     > Utica College
> >     > Integrated Information Technology Services
> >     > (315) 792-3229 <tel:%28315%29%20792-3229>
> >     > Registered Linux User #408177
> >     > _______________________________________________
> >     > Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> >     <mailto:Pacemaker at oss.clusterlabs.org>
> >     > http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >     >
> >     > Project Home: http://www.clusterlabs.org
> >     > Getting started:
> >     http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> >     > Bugs: http://bugs.clusterlabs.org
> >
> >
> >     _______________________________________________
> >     Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> >     <mailto:Pacemaker at oss.clusterlabs.org>
> >     http://oss.clusterlabs.org/mailman/listinfo/pacemaker
> >
> >     Project Home: http://www.clusterlabs.org
> >     Getting started:
> http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> >     Bugs: http://bugs.clusterlabs.org
> >
> >
> >
> >
> > --
> > Dave Parker
> > Systems Administrator
> > Utica College
> > Integrated Information Technology Services
> > (315) 792-3229
> > Registered Linux User #408177
> >
> >
> > This body part will be downloaded on demand.
> >
>
>
> --
> Need help with Pacemaker?
> http://www.hastexo.com/now
>
>
>
> _______________________________________________
> Pacemaker mailing list: Pacemaker at oss.clusterlabs.org
> http://oss.clusterlabs.org/mailman/listinfo/pacemaker
>
> Project Home: http://www.clusterlabs.org
> Getting started: http://www.clusterlabs.org/doc/Cluster_from_Scratch.pdf
> Bugs: http://bugs.clusterlabs.org
>
>

-- 
Dave Parker
Systems Administrator
Utica College
Integrated Information Technology Services
(315) 792-3229
Registered Linux User #408177
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.clusterlabs.org/pipermail/pacemaker/attachments/20131003/f2b21f2d/attachment-0003.html>