[Pacemaker] Fail-count and failure timeout

Holger.Teutsch at fresenius-netcare.com Holger.Teutsch at fresenius-netcare.com
Fri Oct 1 09:40:26 EDT 2010


Hi,
I observed the following in pacemaker Versions 1.1.3 and tip up to patch 
10258.

In a small test environment to study fail-count behavior I have one
resource of type "anything" doing "sleep 600", with a monitoring
interval of 10 secs.

The failure-timeout is 300.

I would expect to never see a failcount higher than 1.

I observed some sporadic clears, but mostly the count increases by 1
every 10 minutes.

Am I mistaken, or is this a bug?

Regards
Holger

--- complete CIB for reference ---

<cib admin_epoch="0"
     epoch="32"
     num_updates="0"
     validate-with="pacemaker-1.2"
     crm_feature_set="3.0.4"
     have-quorum="0"
     cib-last-written="Fri Oct  1 14:17:31 2010"
     dc-uuid="hotlx">
  <configuration>
    <!-- Cluster-level name/value options. -->
    <crm_config>
      <cluster_property_set id="cib-bootstrap-options">
        <nvpair id="cib-bootstrap-options-dc-version"
                name="dc-version"
                value="1.1.3-09640bd6069e677d5eed65203a6056d9bf562e67"/>
        <nvpair id="cib-bootstrap-options-cluster-infrastructure"
                name="cluster-infrastructure"
                value="openais"/>
        <nvpair id="cib-bootstrap-options-expected-quorum-votes"
                name="expected-quorum-votes"
                value="2"/>
        <nvpair id="cib-bootstrap-options-no-quorum-policy"
                name="no-quorum-policy"
                value="ignore"/>
        <nvpair id="cib-bootstrap-options-stonith-enabled"
                name="stonith-enabled"
                value="false"/>
        <nvpair id="cib-bootstrap-options-start-failure-is-fatal"
                name="start-failure-is-fatal"
                value="false"/>
        <nvpair id="cib-bootstrap-options-last-lrm-refresh"
                name="last-lrm-refresh"
                value="1285926879"/>
      </cluster_property_set>
    </crm_config>
    <!-- Only one node is listed; its id matches the dc-uuid on the root element. -->
    <nodes>
      <node id="hotlx" uname="hotlx" type="normal"/>
    </nodes>
    <resources>
      <!-- The single test resource: ocf:heartbeat:anything with binfile "sleep 600". -->
      <primitive id="test" class="ocf" provider="heartbeat" type="anything">
        <meta_attributes id="test-meta_attributes">
          <nvpair id="test-meta_attributes-target-role"
                  name="target-role"
                  value="started"/>
          <!-- The failure-timeout under discussion; the value carries no explicit unit. -->
          <nvpair id="test-meta_attributes-failure-timeout"
                  name="failure-timeout"
                  value="300"/>
        </meta_attributes>
        <operations id="test-operations">
          <!-- Recurring monitor (interval 10) and start; both use on-fail="restart". -->
          <op id="test-op-monitor-10"
              name="monitor"
              interval="10"
              timeout="20s"
              on-fail="restart"/>
          <op id="test-op-start-0"
              name="start"
              interval="0"
              timeout="20s"
              on-fail="restart"/>
        </operations>
        <instance_attributes id="test-instance_attributes">
          <nvpair id="test-instance_attributes-binfile"
                  name="binfile"
                  value="sleep 600"/>
        </instance_attributes>
      </primitive>
    </resources>
    <!-- No constraints are defined. -->
    <constraints/>
  </configuration>
</cib>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.clusterlabs.org/pipermail/pacemaker/attachments/20101001/7b15fd92/attachment-0001.html>


More information about the Pacemaker mailing list