[ClusterLabs] Pacemaker not reacting as I would expect when two resources fail at the same time
Harvey Shepherd
Harvey.Shepherd at Aviatnet.com
Thu May 30 19:39:55 EDT 2019
Hi All,
I'm running Pacemaker 2.0.1 on a cluster containing two nodes; one master and one slave. I have a main master/slave resource (m_main_system), a group of resources that run in active-active mode (active_active - i.e. run on both nodes), and a group that runs in active-disabled mode (snmp_active_disabled - resources only run on the current promoted master). The snmp_active_disabled group is configured to be co-located with the master of m_main_system, so only a failure of the master m_main_system resource can trigger a failover. The constraints specify that m_main_system must be started before snmp_active_disabled.
The problem I'm having is this: when a resource in the snmp_active_disabled group fails and gets into a constant cycle in which Pacemaker tries to restart it, and I then kill m_main_system on the master, Pacemaker still constantly tries to restart the failed snmp_active_disabled resource and ignores the more important m_main_system failure, which should be triggering a failover. If I stabilise the snmp_active_disabled resource, Pacemaker finally acts on the m_main_system failure. I hope I've described this well enough, but I've included a cut-down form of my CIB config below in case it helps!
Is this a bug or an error in my config? Perhaps the order in which the groups are defined in the CIB matters despite the constraints? Any help would be gratefully received.
Thanks,
Harvey
<configuration>
<!-- Cluster-wide option set. NOTE(review): stonith-enabled=false and
     no-quorum-policy=ignore turn off fencing and quorum handling, and
     start-failure-is-fatal=false makes start failures non-fatal. Together
     with migration-threshold=0 in rsc_defaults (end of this snippet) this
     presumably allows endless in-place restart attempts on the same node;
     confirm against the Pacemaker "cluster options" documentation. -->
<crm_config>
<cluster_property_set id="cib-bootstrap-options">
<nvpair name="stonith-enabled" value="false" id="cib-bootstrap-options-stonith-enabled"/>
<nvpair name="no-quorum-policy" value="ignore" id="cib-bootstrap-options-no-quorum-policy"/>
<nvpair name="have-watchdog" value="false" id="cib-bootstrap-options-have-watchdog"/>
<nvpair name="cluster-name" value="lbcluster" id="cib-bootstrap-options-cluster-name"/>
<nvpair name="start-failure-is-fatal" value="false" id="cib-bootstrap-options-start-failure-is-fatal"/>
<nvpair name="cluster-recheck-interval" value="0s" id="cib-bootstrap-options-cluster-recheck-interval"/>
</cluster_property_set>
</crm_config>
<!-- Two-node cluster: "primary" and "secondary". -->
<nodes>
<node id="1" uname="primary"/>
<node id="2" uname="secondary"/>
</nodes>
<resources>
<!-- "Active/disabled" SNMP group: per the colocation constraint in the
     constraints section of this snippet, it runs only on the node where
     m_main_system holds the Master role. LSB resources monitored every 10s. -->
<group id="snmp_active_disabled">
<primitive id="snmpd" class="lsb" type="snmpd">
<operations>
<op name="monitor" interval="10s" id="snmpd-monitor-10s"/>
<op name="start" interval="0" timeout="30s" id="snmpd-start-30s"/>
<op name="stop" interval="0" timeout="30s" id="snmpd-stop-30s"/>
</operations>
</primitive>
<primitive id="snmp-auxiliaries" class="lsb" type="snmp-auxiliaries">
<operations>
<op name="monitor" interval="10s" id="snmp-auxiliaries-monitor-10s"/>
<op name="start" interval="0" timeout="30s" id="snmp-auxiliaries-start-30s"/>
<op name="stop" interval="0" timeout="30s" id="snmp-auxiliaries-stop-30s"/>
</operations>
</primitive>
</group>
<!-- "Active/active" group cloned across nodes; globally-unique=false,
     i.e. anonymous clone instances. -->
<clone id="clone_active_active">
<meta_attributes id="clone_active_active_meta_attributes">
<nvpair id="group-unique" name="globally-unique" value="false"/>
</meta_attributes>
<group id="active_active">
<primitive id="logd" class="lsb" type="logd">
<operations>
<op name="monitor" interval="10s" id="logd-monitor-10s"/>
<op name="start" interval="0" timeout="30s" id="logd-start-30s"/>
<op name="stop" interval="0" timeout="30s" id="logd-stop-30s"/>
</operations>
</primitive>
<primitive id="serviced" class="lsb" type="serviced">
<operations>
<op name="monitor" interval="10s" id="serviced-monitor-10s"/>
<op name="start" interval="0" timeout="30s" id="serviced-start-30s"/>
<op name="stop" interval="0" timeout="30s" id="serviced-stop-30s"/>
</operations>
</primitive>
</group>
</clone>
<!-- Promotable main resource: two clone instances (clone-max=2), at most
     one promoted (promoted-max=1, promoted-node-max=1), with notifications
     enabled. Master and Slave roles use distinct monitor intervals (10s/11s),
     as Pacemaker requires different intervals per role. -->
<master id="m_main_system">
<meta_attributes id="m_main_system-meta_attributes">
<nvpair name="notify" value="true" id="m_main_system-meta_attributes-notify"/>
<nvpair name="clone-max" value="2" id="m_main_system-meta_attributes-clone-max"/>
<nvpair name="promoted-max" value="1" id="m_main_system-meta_attributes-promoted-max"/>
<nvpair name="promoted-node-max" value="1" id="m_main_system-meta_attributes-promoted-node-max"/>
</meta_attributes>
<primitive id="main_system" class="ocf" provider="acme" type="main-system-ocf">
<operations>
<op name="start" interval="0" timeout="120s" id="main_system-start-0"/>
<op name="stop" interval="0" timeout="120s" id="main_system-stop-0"/>
<op name="promote" interval="0" timeout="120s" id="main_system-promote-0"/>
<op name="demote" interval="0" timeout="120s" id="main_system-demote-0"/>
<op name="monitor" interval="10s" timeout="10s" role="Master" id="main_system-monitor-10s"/>
<op name="monitor" interval="11s" timeout="10s" role="Slave" id="main_system-monitor-11s"/>
<op name="notify" interval="0" timeout="60s" id="main_system-notify-0"/>
</operations>
</primitive>
</master>
</resources>
<!-- snmp_active_disabled is pinned (score=INFINITY) to wherever
     m_main_system is Master, and both groups must start after
     m_main_system (Mandatory ordering). -->
<constraints>
<rsc_colocation id="master_only_snmp_rscs_with_main_system" score="INFINITY" rsc="snmp_active_disabled" with-rsc="m_main_system" with-rsc-role="Master"/>
<rsc_order id="snmp_active_disabled_after_main_system" kind="Mandatory" first="m_main_system" then="snmp_active_disabled"/>
<rsc_order id="active_active_after_main_system" kind="Mandatory" first="m_main_system" then="clone_active_active"/>
</constraints>
<!-- Defaults for all resources. NOTE(review): migration-threshold=0
     appears to disable failure-count-based migration, so repeated failures
     never force a resource (or the group it belongs to) off its node; this
     looks related to the endless restart cycle described in the message
     above. Verify against the Pacemaker resource meta-attribute docs. -->
<rsc_defaults>
<meta_attributes id="rsc-options">
<nvpair name="resource-stickiness" value="1" id="rsc-options-resource-stickiness"/>
<nvpair name="migration-threshold" value="0" id="rsc-options-migration-threshold"/>
<nvpair name="requires" value="nothing" id="rsc-options-requires"/>
</meta_attributes>
</rsc_defaults>
</configuration>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.clusterlabs.org/pipermail/users/attachments/20190530/937858ef/attachment-0001.html>
More information about the Users
mailing list