[Pacemaker] Configuration recommendations for (very?) large cluster

Lars Ellenberg lars.ellenberg at linbit.com
Wed Aug 13 10:05:03 EDT 2014


On Wed, Aug 13, 2014 at 10:33:55AM +1000, Andrew Beekhof wrote:
> On 13 Aug 2014, at 2:02 am, Cédric Dufour - Idiap Research Institute <cedric.dufour at idiap.ch> wrote:
> > On 12/08/14 07:52, Andrew Beekhof wrote:
> >> On 11 Aug 2014, at 10:10 pm, Cédric Dufour - Idiap Research Institute <cedric.dufour at idiap.ch> wrote:

...

> > While I still had the ~450 resources, I also "accidentally" brought
> > all 22 nodes back to life together (well, actually started the DC
> > alone and then started the remaining 21 nodes together). As could be
> > expected, the DC got quite busy (dispatching/executing the ~450*22
> > monitoring operations on all nodes). It took 40 minutes for the
> > cluster to stabilize. But it did stabilize, with no timeouts and no
> > monitor operation failures! A few "high CIB load detected / throttle
> > down mode" messages popped up but all went well.

Cool.

> That's about 0.12s per operation, not too bad.
> More importantly, I'm glad to hear that real-world clusters are seeing
> the same kind of improvements as those in the lab.
>
> It would be interesting to know how the 40 minutes compares to bringing one node online at a time.
> 
> > 
> > Q: Is there a way to favor more powerful nodes for the DC (i.e., push the DC "election" process in a preferred direction)?
> 
> Only by starting it first and ensuring it doesn't die (we prioritize the node with the largest crmd process uptime).

Uhm, there was a patch once for pacemaker-1.0.
The latest version I found right now is below.
Written by Klaus Wenninger, iirc.

The idea was to communicate a "HA_dc_prio" value via the environment,
with the following meanings:

 unset: use the default of 1
 <= -1: Node does not become DC (does not vote)
 ==  0: Node may only become DC if no node with prio >= 1 is available.
        While it is DC, it will also trigger an election whenever a node joins.
 >=  1: "classic pacemaker behavior" (1 is the default),
        but changed so that the prio is checked first
        and the higher prio wins; nodes with equal prio
        fall through to the usual checks (version, uptime, ...).

It may still apply (with white space changes) to current pacemaker 1.0.

It will need some more adjustments for pacemaker 1.1,
but a quick browse through the code suggests it won't be too much work.

	Lars


--- crmd/election.c.orig	2011-11-28 16:24:54.345431668 +0100
+++ crmd/election.c	2011-11-28 16:39:18.008420543 +0100
@@ -33,6 +33,7 @@
 GHashTable *voted = NULL;
 uint highest_born_on = -1;
 static int current_election_id = 1;
+static int our_dc_prio = INT_MIN; /* INT_MIN/<0/==0/>0 not_set/not_voting/retrigger_election/default_behaviour_plus_prio */
 
 static int
 crm_uptime(struct timeval *output)
@@ -107,6 +108,20 @@
             break;
     }
 
+    if (our_dc_prio == INT_MIN) {
+        char * dc_prio_str = getenv("HA_dc_prio");
+
+        if (dc_prio_str == NULL) {
+            our_dc_prio = 1;
+        } else {
+            our_dc_prio = atoi(dc_prio_str);
+        }
+    }
+
+    if (our_dc_prio < 0) {
+        not_voting = TRUE;
+    }
+
     if (not_voting == FALSE) {
         if (is_set(fsa_input_register, R_STARTING)) {
             not_voting = TRUE;
@@ -123,6 +138,7 @@
     current_election_id++;
     crm_xml_add(vote, F_CRM_ELECTION_OWNER, fsa_our_uuid);
     crm_xml_add_int(vote, F_CRM_ELECTION_ID, current_election_id);
+    crm_xml_add_int(vote, F_CRM_DC_PRIO, our_dc_prio);
 
     crm_uptime(&age);
     crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
@@ -241,8 +258,9 @@
 {
     struct timeval your_age;
     int age;
     int election_id = -1;
+    int your_dc_prio = 1;
     int log_level = LOG_INFO;
     gboolean use_born_on = FALSE;
     gboolean done = FALSE;
     gboolean we_loose = FALSE;
@@ -273,6 +291,18 @@
     your_version = crm_element_value(vote->msg, F_CRM_VERSION);
     election_owner = crm_element_value(vote->msg, F_CRM_ELECTION_OWNER);
     crm_element_value_int(vote->msg, F_CRM_ELECTION_ID, &election_id);
+    crm_element_value_int(vote->msg, F_CRM_DC_PRIO, &your_dc_prio);
+
+    if (our_dc_prio == INT_MIN) {
+        char * dc_prio_str = getenv("HA_dc_prio");
+
+        if (dc_prio_str == NULL) {
+            our_dc_prio = 1;
+        } else {
+            our_dc_prio = atoi(dc_prio_str);
+        }
+    }
+
     crm_element_value_int(vote->msg, F_CRM_ELECTION_AGE_S, (int *)&(your_age.tv_sec));
     crm_element_value_int(vote->msg, F_CRM_ELECTION_AGE_US, (int *)&(your_age.tv_usec));
 
@@ -334,6 +364,13 @@
         reason = "Recorded";
         done = TRUE;
 
+    } else if(our_dc_prio < your_dc_prio) {
+        reason = "DC Prio";
+        we_loose = TRUE;
+
+    } else if(our_dc_prio > your_dc_prio) {
+        reason = "DC Prio";
+
     } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
         reason = "Version";
         we_loose = TRUE;
@@ -400,6 +437,7 @@
 
         crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
         crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);
+	crm_xml_add_int(novote, F_CRM_DC_PRIO, 0); /* rather don't advertise a negative value */
 
         send_cluster_message(vote_from, crm_msg_crmd, novote, TRUE);
         free_xml(novote);
--- include/crm/msg_xml.h.orig	2011-11-28 16:41:47.309414327 +0100
+++ include/crm/msg_xml.h	2011-11-28 16:42:23.921417584 +0100
@@ -33,6 +33,7 @@
 #  define F_CRM_USER			"crm_user"
 #  define F_CRM_JOIN_ID			"join_id"
 #  define F_CRM_ELECTION_ID		"election-id"
+#  define F_CRM_DC_PRIO                 "dc-prio"
 #  define F_CRM_ELECTION_AGE_S		"election-age-sec"
 #  define F_CRM_ELECTION_AGE_US		"election-age-nano-sec"
 #  define F_CRM_ELECTION_OWNER		"election-owner"
--- lib/ais/plugin.c.orig	2011-11-28 16:42:57.002411543 +0100
+++ lib/ais/plugin.c	2011-11-28 16:44:22.160413844 +0100
@@ -409,6 +409,9 @@
     get_config_opt(pcmk_api, local_handle, "use_logd", &value, "no");
     pcmk_env.use_logd = value;
 
+    get_config_opt(pcmk_api, local_handle, "dc_prio", &value, "1");
+    pcmk_env.dc_prio = value;
+
     get_config_opt(pcmk_api, local_handle, "use_mgmtd", &value, "no");
     if (ais_get_boolean(value) == FALSE) {
         int lpc = 0;
@@ -599,6 +602,7 @@
     pcmk_env.logfile = NULL;
     pcmk_env.use_logd = "false";
     pcmk_env.syslog = "daemon";
+    pcmk_env.dc_prio = "1";
 
     if (cs_uid != root_uid) {
         ais_err("Corosync must be configured to start as 'root',"
--- lib/ais/utils.c.orig	2011-11-28 16:45:01.940415754 +0100
+++ lib/ais/utils.c	2011-11-28 16:45:33.018412117 +0100
@@ -237,6 +237,7 @@
 	setenv("HA_logfacility",	pcmk_env.syslog,   1);
 	setenv("HA_LOGFACILITY",	pcmk_env.syslog,   1);
 	setenv("HA_use_logd",		pcmk_env.use_logd, 1);
+        setenv("HA_dc_prio", pcmk_env.dc_prio, 1);
 	setenv("HA_quorum_type",	pcmk_env.quorum,   1);
 /* *INDENT-ON* */
 
--- lib/ais/utils.h.orig	2011-11-28 16:45:45.143412597 +0100
+++ lib/ais/utils.h	2011-11-28 16:46:37.026410208 +0100
@@ -238,6 +238,7 @@
     const char *syslog;
     const char *logfile;
     const char *use_logd;
+    const char *dc_prio;
     const char *quorum;
 };
 
--- crmd/messages.c.orig	2012-05-25 16:23:22.913106180 +0200
+++ crmd/messages.c	2012-05-25 16:28:30.330263392 +0200
@@ -36,6 +36,8 @@
 #include <crmd_messages.h>
 #include <crmd_lrm.h>
 
+static int our_dc_prio = INT_MIN;
+
 GListPtr fsa_message_queue = NULL;
 extern void crm_shutdown(int nsig);
 
@@ -693,7 +695,19 @@
     /*========== DC-Only Actions ==========*/
     if (AM_I_DC) {
         if (strcmp(op, CRM_OP_JOIN_ANNOUNCE) == 0) {
-            return I_NODE_JOIN;
+	    if (our_dc_prio == INT_MIN) {
+        	char * dc_prio_str = getenv("HA_dc_prio");
+
+        	if (dc_prio_str == NULL) {
+		    our_dc_prio = 1;
+        	} else {
+            	    our_dc_prio = atoi(dc_prio_str);
+        	}
+    	    }		
+	    if (our_dc_prio == 0)
+		return I_ELECTION;	
+            else 
+		return I_NODE_JOIN;
 
         } else if (strcmp(op, CRM_OP_JOIN_REQUEST) == 0) {
             return I_JOIN_REQUEST;


-- 
: Lars Ellenberg
: LINBIT | Your Way to High Availability
: DRBD/HA support and consulting http://www.linbit.com

DRBD® and LINBIT® are registered trademarks of LINBIT, Austria.




More information about the Pacemaker mailing list