diff --git a/crmd/lrm.c b/crmd/lrm.c
index f4f2aaf630..7445e557ba 100644
--- a/crmd/lrm.c
+++ b/crmd/lrm.c
@@ -1,1894 +1,1895 @@
/*
 * Copyright (C) 2004 Andrew Beekhof
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This software is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include
#include
#include
#include
#include
#include
#include /* for access */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

struct recurring_op_s {
    char *rsc_id;
    char *op_key;
    int   call_id;
    int   interval;
    gboolean remove;
    gboolean cancelled;
};

char *make_stop_id(const char *rsc, int call_id);
void cib_rsc_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data);

gboolean build_operation_update(
    xmlNode *rsc_list, lrm_rsc_t *rsc, lrm_op_t *op, const char *src, int lpc, int level);
gboolean build_active_RAs(xmlNode *rsc_list);
gboolean is_rsc_active(const char *rsc_id);

int do_update_resource(lrm_op_t *op);
gboolean process_lrm_event(lrm_op_t *op);

void do_lrm_rsc_op(lrm_rsc_t *rsc, const char *operation, xmlNode *msg, xmlNode *request);

lrm_op_t *construct_op(xmlNode *rsc_op, const char *rsc_id, const char *operation);

void send_direct_ack(const char *to_host, const char *to_sys,
                     lrm_rsc_t *rsc, lrm_op_t *op, const char *rsc_id);

void free_recurring_op(gpointer value);

GHashTable *resources = NULL;
GHashTable *pending_ops = NULL;
GCHSource *lrm_source = NULL;

int num_lrm_register_fails = 0;
int max_lrm_register_fails = 30;

void
lrm_connection_destroy(gpointer user_data)
{
    if(is_set(fsa_input_register, R_LRM_CONNECTED)) {
        crm_crit("LRM Connection failed");
        register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
        clear_bit_inplace(fsa_input_register, R_LRM_CONNECTED);

    } else {
        crm_info("LRM Connection disconnected");
    }

    lrm_source = NULL;
}
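/* Illustrative sketch (hypothetical values): entries in the pending_ops
 * table above are keyed by the "<rsc_id>:<call_id>" string that
 * make_stop_id() builds, while op_key carries the "<rsc>_<task>_<interval>"
 * name the PE uses.  A recurring monitor might be tracked as:
 *
 *   struct recurring_op_s example = {
 *       .rsc_id   = "myrsc",                 // hypothetical resource
 *       .op_key   = "myrsc_monitor_10000",
 *       .call_id  = 42,
 *       .interval = 10000,
 *   };
 *   // hash key: make_stop_id("myrsc", 42) => "myrsc:42"
 */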
/*	 A_LRM_CONNECT	*/
void
do_lrm_control(long long action,
               enum crmd_fsa_cause cause,
               enum crmd_fsa_state cur_state,
               enum crmd_fsa_input current_input,
               fsa_data_t *msg_data)
{
    if(fsa_lrm_conn == NULL) {
        register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
        return;
    }

    if(action & A_LRM_DISCONNECT) {
        if(verify_stopped(cur_state, LOG_INFO) == FALSE) {
            crmd_fsa_stall(NULL);
            return;
        }

        if(is_set(fsa_input_register, R_LRM_CONNECTED)) {
            clear_bit_inplace(fsa_input_register, R_LRM_CONNECTED);
            fsa_lrm_conn->lrm_ops->signoff(fsa_lrm_conn);
            crm_info("Disconnected from the LRM");
        }

        /* TODO: Clean up the hashtable */
    }

    if(action & A_LRM_CONNECT) {
        int ret = HA_OK;

        pending_ops = g_hash_table_new_full(
            g_str_hash, g_str_equal, g_hash_destroy_str, free_recurring_op);

        resources = g_hash_table_new_full(
            g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str);

        if(ret == HA_OK) {
            crm_debug("Connecting to the LRM");
            ret = fsa_lrm_conn->lrm_ops->signon(fsa_lrm_conn, CRM_SYSTEM_CRMD);
        }

        if(ret != HA_OK) {
            if(++num_lrm_register_fails < max_lrm_register_fails) {
                crm_warn("Failed to sign on to the LRM %d"
                         " (%d max) times",
                         num_lrm_register_fails, max_lrm_register_fails);

                crm_timer_start(wait_timer);
                crmd_fsa_stall(NULL);
                return;
            }
        }

        if(ret == HA_OK) {
            crm_debug_4("LRM: set_lrm_callback...");
            ret = fsa_lrm_conn->lrm_ops->set_lrm_callback(fsa_lrm_conn, lrm_op_callback);
            if(ret != HA_OK) {
                crm_err("Failed to set LRM callbacks");
            }
        }

        if(ret != HA_OK) {
            crm_err("Failed to sign on to the LRM %d"
                    " (max) times", num_lrm_register_fails);
            register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
            return;
        }

        /* TODO: create a destroy handler that causes
         * some recovery to happen
         */
        lrm_source = G_main_add_IPC_Channel(
            G_PRIORITY_LOW,
            fsa_lrm_conn->lrm_ops->ipcchan(fsa_lrm_conn),
            FALSE, lrm_dispatch, fsa_lrm_conn, lrm_connection_destroy);

        set_bit_inplace(fsa_input_register, R_LRM_CONNECTED);
        crm_debug("LRM connection established");
    }

    if(action & ~(A_LRM_CONNECT|A_LRM_DISCONNECT)) {
        crm_err("Unexpected action %s in %s",
                fsa_action2string(action), __FUNCTION__);
    }
}

static void
ghash_print_pending(gpointer key, gpointer value, gpointer user_data)
{
    const char *stop_id = key;
    int *log_level = user_data;
    struct recurring_op_s *pending = value;

    do_crm_log(*log_level, "Pending action: %s (%s)", stop_id, pending->op_key);
}

static void
ghash_print_pending_for_rsc(gpointer key, gpointer value, gpointer user_data)
{
    const char *stop_id = key;
    char *rsc = user_data;
    struct recurring_op_s *pending = value;

    if(safe_str_eq(rsc, pending->rsc_id)) {
        do_crm_log(LOG_NOTICE, "%sction %s (%s) incomplete at shutdown",
                   pending->interval==0?"A":"Recurring a",
                   stop_id, pending->op_key);
    }
}

static void
ghash_count_pending(gpointer key, gpointer value, gpointer user_data)
{
    int *counter = user_data;
    struct recurring_op_s *pending = value;

    if(pending->interval > 0) {
        /* Ignore recurring actions in the shutdown calculations */
        return;
    }

    (*counter)++;
}

gboolean
verify_stopped(enum crmd_fsa_state cur_state, int log_level)
{
    int counter = 0;
    gboolean rc = TRUE;
    GListPtr lrm_list = NULL;

    crm_debug("Checking for active resources before exit");

    if(cur_state == S_TERMINATE) {
        log_level = LOG_ERR;
    }

-   g_hash_table_foreach(pending_ops, ghash_count_pending, &counter);
-
+   if(pending_ops) {
+       g_hash_table_foreach(pending_ops, ghash_count_pending, &counter);
+   }
+
    if(counter > 0) {
        rc = FALSE;
        do_crm_log(log_level, "%d pending LRM operations at shutdown%s",
-                  g_hash_table_size(pending_ops),
-                  cur_state == S_TERMINATE?"":"... waiting");
+                  counter, cur_state == S_TERMINATE?"":"... waiting");

        if(cur_state == S_TERMINATE
           || !is_set(fsa_input_register, R_SENT_RSC_STOP)) {
            g_hash_table_foreach(pending_ops, ghash_print_pending, &log_level);
        }
        goto bail;
    }

    if(is_set(fsa_input_register, R_LRM_CONNECTED)) {
        lrm_list = fsa_lrm_conn->lrm_ops->get_all_rscs(fsa_lrm_conn);
    }

    slist_iter(
        rsc_id, char, lrm_list, lpc,

        if(is_rsc_active(rsc_id) == FALSE) {
            continue;
        }

        crm_err("Resource %s was active at shutdown."
                " You may ignore this error if it is unmanaged.", rsc_id);
        g_hash_table_foreach(pending_ops, ghash_print_pending_for_rsc, rsc_id);
        );

  bail:
    set_bit_inplace(fsa_input_register, R_SENT_RSC_STOP);

    if(cur_state == S_TERMINATE) {
        rc = TRUE;
    }

    return rc;
}
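/* Why the hunk above logs "counter" rather than
 * g_hash_table_size(pending_ops): ghash_count_pending() skips recurring
 * actions, so with (hypothetically) one pending stop and two recurring
 * monitors in the table,
 *
 *   counter                        == 1    // non-recurring ops only
 *   g_hash_table_size(pending_ops) == 3    // monitors included
 *
 * logging the table size would over-report what is actually blocking
 * shutdown.
 */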
" You may ignore this error if it is unmanaged.", rsc_id); g_hash_table_foreach( pending_ops, ghash_print_pending_for_rsc, rsc_id); ); bail: set_bit_inplace(fsa_input_register, R_SENT_RSC_STOP); if(cur_state == S_TERMINATE) { rc = TRUE; } return rc; } static char * get_rsc_metadata(const char *type, const char *class, const char *provider) { char *metadata = NULL; CRM_CHECK(type != NULL, return NULL); CRM_CHECK(class != NULL, return NULL); if(provider == NULL) { provider = "heartbeat"; } crm_debug_2("Retreiving metadata for %s::%s:%s", type, class, provider); metadata = fsa_lrm_conn->lrm_ops->get_rsc_type_metadata( fsa_lrm_conn, class, type, provider); if(metadata) { /* copy the metadata because the LRM likes using * g_alloc instead of cl_malloc */ char *m_copy = crm_strdup(metadata); g_free(metadata); metadata = m_copy; } else { crm_warn("No metadata found for %s::%s:%s", type, class, provider); } return metadata; } typedef struct reload_data_s { char *key; char *metadata; gboolean can_reload; GListPtr restart_list; } reload_data_t; static void g_hash_destroy_reload(gpointer data) { reload_data_t *reload = data; crm_free(reload->key); crm_free(reload->metadata); slist_destroy(char, child, reload->restart_list, crm_free(child)); } GHashTable *reload_hash = NULL; static GListPtr get_rsc_restart_list(lrm_rsc_t *rsc, lrm_op_t *op) { int len = 0; char *key = NULL; char *copy = NULL; const char *value = NULL; const char *provider = NULL; xmlNode *params = NULL; xmlNode *actions = NULL; xmlNode *metadata = NULL; reload_data_t *reload = NULL; if(reload_hash == NULL) { reload_hash = g_hash_table_new_full( g_str_hash, g_str_equal, NULL, g_hash_destroy_reload); } provider = rsc->provider; if(provider == NULL) { provider = "heartbeat"; } len = strlen(rsc->type) + strlen(rsc->class) + strlen(provider) + 4; crm_malloc(key, len); snprintf(key, len, "%s::%s:%s", rsc->type, rsc->class, provider); reload = g_hash_table_lookup(reload_hash, key); if(reload == NULL) { crm_malloc0(reload, sizeof(reload_data_t)); g_hash_table_insert(reload_hash, key, reload); reload->key = key; key = NULL; reload->metadata = get_rsc_metadata(rsc->type, rsc->class, provider); metadata = string2xml(reload->metadata); if(metadata == NULL) { crm_err("Metadata for %s::%s:%s is not valid XML", rsc->provider, rsc->class, rsc->type); goto cleanup; } actions = find_xml_node(metadata, "actions", TRUE); xml_child_iter_filter( actions, action, "action", value = crm_element_value(action, "name"); if(safe_str_eq("reload", value)) { reload->can_reload = TRUE; break; } ); if(reload->can_reload == FALSE) { goto cleanup; } params = find_xml_node(metadata, "parameters", TRUE); xml_child_iter_filter( params, param, "parameter", value = crm_element_value(param, "unique"); if(crm_is_true(value)) { value = crm_element_value(param, "name"); if(value == NULL) { crm_err("%s: NULL param", key); continue; } crm_debug("Attr %s is not reloadable", value); copy = crm_strdup(value); CRM_CHECK(copy != NULL, continue); reload->restart_list = g_list_append(reload->restart_list, copy); } ); } cleanup: crm_free(key); free_xml(metadata); return reload?reload->restart_list:NULL; } static void append_digest(lrm_rsc_t *rsc, lrm_op_t *op, xmlNode *update, const char *version, const char *magic, int level) { /* this will enable us to later determine that the * resource's parameters have changed and we should force * a restart */ char *digest = NULL; xmlNode *args_xml = NULL; if(op->params == NULL) { return; } args_xml = create_xml_node(NULL, XML_TAG_PARAMS); 
static void
append_digest(lrm_rsc_t *rsc, lrm_op_t *op, xmlNode *update,
              const char *version, const char *magic, int level)
{
    /* this will enable us to later determine that the
     * resource's parameters have changed and we should force
     * a restart
     */
    char *digest = NULL;
    xmlNode *args_xml = NULL;

    if(op->params == NULL) {
        return;
    }

    args_xml = create_xml_node(NULL, XML_TAG_PARAMS);
    g_hash_table_foreach(op->params, hash2field, args_xml);
    filter_action_parameters(args_xml, version);
    digest = calculate_xml_digest(args_xml, TRUE, FALSE);

#if 0
    if(level < crm_log_level
       && op->interval == 0
       && crm_str_eq(op->op_type, CRMD_ACTION_START, TRUE)) {
        char *digest_source = dump_xml_unformatted(args_xml);
        do_crm_log(level, "Calculated digest %s for %s (%s). Source: %s\n",
                   digest, ID(update), magic, digest_source);
        crm_free(digest_source);
    }
#endif
    crm_xml_add(update, XML_LRM_ATTR_OP_DIGEST, digest);

    free_xml(args_xml);
    crm_free(digest);
}

static void
append_restart_list(lrm_rsc_t *rsc, lrm_op_t *op, xmlNode *update, const char *version)
{
    int len = 0;
    char *list = NULL;
    char *digest = NULL;
    const char *value = NULL;
    gboolean non_empty = FALSE;
    xmlNode *restart = NULL;
    GListPtr restart_list = NULL;

    if(op->interval > 0) {
        /* monitors are not reloadable */
        return;

    } else if(op->params == NULL) {
        crm_debug("%s has no parameters", ID(update));
        return;

    } else if(rsc == NULL) {
        return;

    } else if(crm_str_eq(CRMD_ACTION_START, op->op_type, TRUE) == FALSE) {
        /* only starts are potentially reloadable */
        return;

    } else if(compare_version("1.0.8", version) > 0) {
        /* Caller version does not support reloads */
        return;
    }

    restart_list = get_rsc_restart_list(rsc, op);
    if(restart_list == NULL) {
        /* Resource does not support reloads */
        return;
    }

    restart = create_xml_node(NULL, XML_TAG_PARAMS);
    slist_iter(param, const char, restart_list, lpc,

               int start = len;
               CRM_CHECK(param != NULL, continue);
               value = g_hash_table_lookup(op->params, param);
               if(value != NULL) {
                   non_empty = TRUE;
                   crm_xml_add(restart, param, value);
               }

               len += strlen(param) + 2;
               crm_realloc(list, len+1);
               sprintf(list+start, " %s ", param);
        );

    digest = calculate_xml_digest(restart, TRUE, FALSE);
    crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list);
    crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest);

#if 0
    crm_debug("%s: %s, %s", rsc->id, digest, list);
    if(non_empty) {
        crm_log_xml_debug(restart, "restart digest source");
    }
#endif

    free_xml(restart);
    crm_free(digest);
    crm_free(list);
}
gboolean
build_operation_update(
    xmlNode *xml_rsc, lrm_rsc_t *rsc, lrm_op_t *op, const char *src, int lpc, int level)
{
    char *magic = NULL;
    const char *task = NULL;
    xmlNode *xml_op = NULL;
    char *op_id = NULL;
    char *local_user_data = NULL;
    const char *caller_version = NULL;

    CRM_CHECK(op != NULL, return FALSE);
    crm_debug_2("%s: Updating resource %s after %s %s op",
                src, op->rsc_id, op_status2text(op->op_status), op->op_type);

    if(op->op_status == LRM_OP_CANCELLED) {
        crm_debug_3("Ignoring cancelled op");
        return TRUE;
    }

    if(AM_I_DC) {
        caller_version = CRM_FEATURE_SET;

    } else if(fsa_our_dc_version != NULL) {
        caller_version = fsa_our_dc_version;

    } else {
        /* there is a small risk in formerly mixed clusters that
         * it will be sub-optimal.
         * however with our upgrade policy, the update we send
         * should still be completely supported anyway
         */
        caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION);
        crm_warn("Falling back to operation originator version: %s", caller_version);
    }
    crm_debug_3("DC version: %s", caller_version);

    task = op->op_type;
    /* remap the task name under various scenarios
     * this makes life easier for the PE when it is trying to determine the current state
     */
    if(crm_str_eq(task, "reload", TRUE)) {
        if(op->op_status == LRM_OP_DONE) {
            task = CRMD_ACTION_START;
        } else {
            task = CRMD_ACTION_STATUS;
        }

    } else if(crm_str_eq(task, CRMD_ACTION_MIGRATE, TRUE)) {
        /* if the migrate_from fails it will have enough info to do the right thing */
        if(op->op_status == LRM_OP_DONE) {
            task = CRMD_ACTION_STOP;
        } else {
            task = CRMD_ACTION_STATUS;
        }

    } else if(op->op_status == LRM_OP_DONE
              && crm_str_eq(task, CRMD_ACTION_MIGRATED, TRUE)) {
        task = CRMD_ACTION_START;

    } else if(crm_str_eq(task, CRMD_ACTION_NOTIFY, TRUE)) {
        const char *n_type = g_hash_table_lookup(
            op->params, crm_meta_name("notify_type"));
        const char *n_task = g_hash_table_lookup(
            op->params, crm_meta_name("notify_operation"));
        CRM_DEV_ASSERT(n_type != NULL);
        CRM_DEV_ASSERT(n_task != NULL);
        op_id = generate_notify_key(op->rsc_id, n_type, n_task);

        /* these are not yet allowed to fail */
        op->op_status = LRM_OP_DONE;
        op->rc = 0;
    }

    if (op_id == NULL) {
        op_id = generate_op_key(op->rsc_id, task, op->interval);
    }

    xml_op = find_entity(xml_rsc, XML_LRM_TAG_RSC_OP, op_id);
    if(xml_op != NULL) {
        crm_log_xml(LOG_DEBUG, "Replacing existing entry", xml_op);

    } else {
        xml_op = create_xml_node(xml_rsc, XML_LRM_TAG_RSC_OP);
    }

    if(op->user_data == NULL) {
        crm_debug("Generating fake transition key for:"
                  " %s_%s_%d %d from %s",
                  op->rsc_id, op->op_type, op->interval,
                  op->call_id, op->app_name);
        local_user_data = generate_transition_key(-1, op->call_id, 0, FAKE_TE_ID);
        op->user_data = local_user_data;
    }

    magic = generate_transition_magic(op->user_data, op->op_status, op->rc);

    crm_xml_add(xml_op, XML_ATTR_ID, op_id);
    crm_xml_add(xml_op, XML_LRM_ATTR_TASK, task);
    crm_xml_add(xml_op, XML_ATTR_ORIGIN, src);
    crm_xml_add(xml_op, XML_ATTR_CRM_VERSION, caller_version);
    crm_xml_add(xml_op, XML_ATTR_TRANSITION_KEY, op->user_data);
    crm_xml_add(xml_op, XML_ATTR_TRANSITION_MAGIC, magic);

    crm_xml_add_int(xml_op, XML_LRM_ATTR_CALLID, op->call_id);
    crm_xml_add_int(xml_op, XML_LRM_ATTR_RC, op->rc);
    crm_xml_add_int(xml_op, XML_LRM_ATTR_OPSTATUS, op->op_status);
    crm_xml_add_int(xml_op, XML_LRM_ATTR_INTERVAL, op->interval);

    if(compare_version("2.1", caller_version) <= 0) {
        if(op->t_run || op->t_rcchange || op->exec_time || op->queue_time) {
            crm_debug_2("Timing data (%s_%s_%d): last=%lu change=%lu exec=%lu queue=%lu",
                        op->rsc_id, op->op_type, op->interval,
                        op->t_run, op->t_rcchange, op->exec_time, op->queue_time);

            crm_xml_add_int(xml_op, "last-run",       op->t_run);
            crm_xml_add_int(xml_op, "last-rc-change", op->t_rcchange);
            crm_xml_add_int(xml_op, "exec-time",      op->exec_time);
            crm_xml_add_int(xml_op, "queue-time",     op->queue_time);
        }
    }

    append_digest(rsc, op, xml_op, caller_version, magic, level);
    append_restart_list(rsc, op, xml_op, caller_version);

    if(op->op_status != LRM_OP_DONE
       && crm_str_eq(op->op_type, CRMD_ACTION_MIGRATED, TRUE)) {
        const char *host = g_hash_table_lookup(
            op->params, crm_meta_name("migrate_source_uuid"));
        crm_xml_add(xml_op, CRMD_ACTION_MIGRATED, host);
    }

    if(local_user_data) {
        crm_free(local_user_data);
        op->user_data = NULL;
    }

    crm_free(magic);
    crm_free(op_id);
    return TRUE;
}
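/* Summary of the task remapping above, with illustrative op ids and
 * assuming the usual CRMD_ACTION_* expansions (start/stop/monitor,
 * migrate_to/migrate_from):
 *
 *   reload       + LRM_OP_DONE -> recorded as "rsc_start_0"
 *   reload       + failure     -> recorded as "rsc_monitor_0"
 *   migrate_to   + LRM_OP_DONE -> recorded as "rsc_stop_0"
 *   migrate_from + LRM_OP_DONE -> recorded as "rsc_start_0"
 *   notify                     -> keyed via generate_notify_key(), rc forced to 0
 */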
gboolean
is_rsc_active(const char *rsc_id)
{
    GList *op_list = NULL;
    gboolean active = FALSE;
    lrm_rsc_t *the_rsc = NULL;
    state_flag_t cur_state = 0;
    int max_call_id = -1;

    if(fsa_lrm_conn == NULL) {
        return FALSE;
    }

    the_rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rsc_id);

    crm_debug_3("Processing lrm_rsc_t entry %s", rsc_id);

    if(the_rsc == NULL) {
        crm_err("NULL resource returned from the LRM");
        return FALSE;
    }

    op_list = the_rsc->ops->get_cur_state(the_rsc, &cur_state);

    crm_debug_3("\tcurrent state:%s", cur_state==LRM_RSC_IDLE?"Idle":"Busy");

    slist_iter(
        op, lrm_op_t, op_list, llpc,

        crm_debug_2("Processing op %s_%d (%d) for %s (status=%d, rc=%d)",
                    op->op_type, op->interval, op->call_id, the_rsc->id,
                    op->op_status, op->rc);

        CRM_ASSERT(max_call_id <= op->call_id);
        if(op->rc == EXECRA_OK
           && safe_str_eq(op->op_type, CRMD_ACTION_STOP)) {
            active = FALSE;

        } else if(op->rc == EXECRA_OK
                  && safe_str_eq(op->op_type, CRMD_ACTION_MIGRATE)) {
            /* a stricter check is too complex...
             * leave that to the PE
             */
            active = FALSE;

        } else if(op->rc == EXECRA_NOT_RUNNING) {
            active = FALSE;

        } else {
            active = TRUE;
        }

        max_call_id = op->call_id;
        lrm_free_op(op);
        );

    g_list_free(op_list);
    lrm_free_rsc(the_rsc);

    return active;
}
gboolean
build_active_RAs(xmlNode *rsc_list)
{
    GList *op_list = NULL;
    GList *lrm_list = NULL;
    gboolean found_op = FALSE;
    state_flag_t cur_state = 0;

    if(fsa_lrm_conn == NULL) {
        return FALSE;
    }

    lrm_list = fsa_lrm_conn->lrm_ops->get_all_rscs(fsa_lrm_conn);

    slist_iter(
        rid, char, lrm_list, lpc,

        int max_call_id = -1;
        xmlNode *xml_rsc = NULL;
        lrm_rsc_t *the_rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid);

        if(the_rsc == NULL) {
            crm_err("NULL resource returned from the LRM: %s", rid);
            continue;
        }

        xml_rsc = create_xml_node(rsc_list, XML_LRM_TAG_RESOURCE);
        crm_xml_add(xml_rsc, XML_ATTR_ID, the_rsc->id);
        crm_xml_add(xml_rsc, XML_ATTR_TYPE, the_rsc->type);
        crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, the_rsc->class);
        crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, the_rsc->provider);

        op_list = the_rsc->ops->get_cur_state(the_rsc, &cur_state);

        slist_iter(
            op, lrm_op_t, op_list, llpc,

            if(max_call_id < op->call_id) {
                build_operation_update(
                    xml_rsc, the_rsc, op, __FUNCTION__, llpc, LOG_DEBUG_2);

            } else if(max_call_id > op->call_id) {
                crm_err("Bad call_id in list=%d. Previous call_id=%d",
                        op->call_id, max_call_id);

            } else {
                crm_warn("lrm->get_cur_state() returned"
                         " duplicate entries for call_id=%d", op->call_id);
            }

            max_call_id = op->call_id;
            found_op = TRUE;
            lrm_free_op(op);
            );

        if(found_op == FALSE && g_list_length(op_list) != 0) {
            crm_err("Could not properly determine last op"
                    " for %s from %d entries",
                    the_rsc->id, g_list_length(op_list));
        }

        g_list_free(op_list);
        lrm_free_rsc(the_rsc);
        );

    g_list_free(lrm_list);

    return TRUE;
}

xmlNode*
do_lrm_query(gboolean is_replace)
{
    gboolean shut_down = FALSE;
    xmlNode *xml_result = NULL;
    xmlNode *xml_state = NULL;
    xmlNode *xml_data = NULL;
    xmlNode *rsc_list = NULL;
    const char *exp_state = CRMD_STATE_ACTIVE;

    if(is_set(fsa_input_register, R_SHUTDOWN)) {
        exp_state = CRMD_STATE_INACTIVE;
        shut_down = TRUE;
    }

    xml_state = create_node_state(
        fsa_our_uname, ACTIVESTATUS, XML_BOOLEAN_TRUE,
        ONLINESTATUS, CRMD_JOINSTATE_MEMBER, exp_state,
        !shut_down, __FUNCTION__);

    xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM);
    crm_xml_add(xml_data, XML_ATTR_ID, fsa_our_uuid);
    rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES);

    /* Build a list of active (not always running) resources */
    build_active_RAs(rsc_list);

    xml_result = create_cib_fragment(xml_state, XML_CIB_TAG_STATUS);
    crm_log_xml_debug_3(xml_state, "Current state of the LRM");
    free_xml(xml_state);

    return xml_result;
}

/*
 * Remove the rsc from the CIB
 *
 * Avoids refreshing the entire LRM section of this host
 */
#define rsc_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']"

static void
delete_rsc_entry(const char *rsc_id)
{
    int max = 0;
    char *rsc_xpath = NULL;

    CRM_CHECK(rsc_id != NULL, return);

    max = strlen(rsc_template) + strlen(rsc_id) + strlen(fsa_our_uname) + 1;
    crm_malloc0(rsc_xpath, max);
    snprintf(rsc_xpath, max, rsc_template, fsa_our_uname, rsc_id);
    CRM_CHECK(rsc_id != NULL, return);

    crm_debug("sync: Sending delete op for %s", rsc_id);
    fsa_cib_conn->cmds->delete(
        fsa_cib_conn, rsc_xpath, NULL, cib_quorum_override|cib_xpath);

    crm_free(rsc_xpath);
}
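/* Assuming the usual macro expansions (XML_CIB_TAG_STATE = "node_state",
 * XML_LRM_TAG_RESOURCE = "lrm_resource"), the xpath built above for a
 * hypothetical node "node1" and resource "myrsc" expands to:
 *
 *   //node_state[@uname='node1']//lrm_resource[@id='myrsc']
 *
 * i.e. only that resource's history is removed, not the host's whole
 * LRM section.
 */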
/*
 * Remove the op from the CIB
 *
 * Avoids refreshing the entire LRM section of this host
 */
#define op_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']/"XML_LRM_TAG_RSC_OP"[@id='%s']"
#define op_call_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']/"XML_LRM_TAG_RSC_OP"[@id='%s' and @"XML_LRM_ATTR_CALLID"='%d']"

static void
delete_op_entry(lrm_op_t *op, const char *rsc_id, const char *key, int call_id)
{
    xmlNode *xml_top = NULL;

    if(op != NULL) {
        xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP);
        crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id);
        crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data);

        crm_debug("async: Sending delete op for %s_%s_%d (call=%d)",
                  op->rsc_id, op->op_type, op->interval, op->call_id);

        fsa_cib_conn->cmds->delete(
            fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top, cib_quorum_override);

    } else if (rsc_id != NULL && key != NULL) {
        int max = 0;
        char *op_xpath = NULL;

        if(call_id > 0) {
            max = strlen(op_call_template) + strlen(rsc_id)
                + strlen(fsa_our_uname) + strlen(key) + 10;
            crm_malloc0(op_xpath, max);
            snprintf(op_xpath, max, op_call_template,
                     fsa_our_uname, rsc_id, key, call_id);

        } else {
            max = strlen(op_template) + strlen(rsc_id)
                + strlen(fsa_our_uname) + strlen(key) + 1;
            crm_malloc0(op_xpath, max);
            snprintf(op_xpath, max, op_template, fsa_our_uname, rsc_id, key);
        }

        crm_debug("sync: Sending delete op for %s (call=%d)", rsc_id, call_id);
        fsa_cib_conn->cmds->delete(
            fsa_cib_conn, op_xpath, NULL, cib_quorum_override|cib_xpath);

        crm_free(op_xpath);

    } else {
        crm_err("Not enough information to delete op entry: rsc=%p key=%p",
                rsc_id, key);
        return;
    }

    crm_log_xml_debug_2(xml_top, "op:cancel");
    free_xml(xml_top);
}

static gboolean
cancel_op(lrm_rsc_t *rsc, const char *key, int op, gboolean remove)
{
    int rc = HA_OK;
    struct recurring_op_s *pending = NULL;

    CRM_CHECK(op != 0, return FALSE);
    CRM_CHECK(rsc != NULL, return FALSE);
    if(key == NULL) {
        key = make_stop_id(rsc->id, op);
    }
    pending = g_hash_table_lookup(pending_ops, key);

    if(pending) {
        if(remove && pending->remove == FALSE) {
            pending->remove = TRUE;
            crm_debug("Scheduling %s for removal", key);
        }

        if(pending->cancelled) {
            crm_debug("Operation %s already cancelled", key);
            return TRUE;
        }

        pending->cancelled = TRUE;

    } else {
        crm_info("No pending op found for %s", key);
    }

    crm_debug("Cancelling op %d for %s (%s)", op, rsc->id, key);

    rc = rsc->ops->cancel_op(rsc, op);
    if(rc != HA_OK) {
        crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc->id, key);
        /* The caller needs to make sure the entry is
         * removed from the pending_ops list
         *
         * Usually by returning TRUE inside the worker function
         * supplied to g_hash_table_foreach_remove()
         *
         * Not removing the entry from pending_ops will block
         * the node from shutting down
         */
        return FALSE;
    }

    return TRUE;
}

struct cancel_data {
    gboolean done;
    gboolean remove;
    const char *key;
    lrm_rsc_t *rsc;
};

static gboolean
cancel_action_by_key(gpointer key, gpointer value, gpointer user_data)
{
    struct cancel_data *data = user_data;
    struct recurring_op_s *op = (struct recurring_op_s*)value;

    if(safe_str_eq(op->op_key, data->key)) {
        data->done = TRUE;
        if (cancel_op(data->rsc, key, op->call_id, data->remove) == FALSE) {
            return TRUE;
        }
    }
    return FALSE;
}

static gboolean
cancel_op_key(lrm_rsc_t *rsc, const char *key, gboolean remove)
{
    struct cancel_data data;

    CRM_CHECK(rsc != NULL, return FALSE);
    CRM_CHECK(key != NULL, return FALSE);

    data.key = key;
    data.rsc = rsc;
    data.done = FALSE;
    data.remove = remove;

    g_hash_table_foreach_remove(pending_ops, cancel_action_by_key, &data);
    return data.done;
}
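/* GLib note on the pattern above: g_hash_table_foreach_remove() drops an
 * entry whenever the callback returns TRUE, so cancel_action_by_key()
 * returns TRUE precisely when cancel_op() had nothing to cancel and the
 * stale entry would otherwise pin pending_ops (and block shutdown).
 * Minimal standalone sketch of the idiom:
 *
 *   static gboolean drop_if_stale(gpointer k, gpointer v, gpointer ud) {
 *       return is_stale(v);   // hypothetical predicate: TRUE == remove
 *   }
 *   g_hash_table_foreach_remove(table, drop_if_stale, NULL);
 */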
crm_err("Could not add resource %s to LRM", rid); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } } return rsc; } static gboolean lrm_remove_deleted_op( gpointer key, gpointer value, gpointer user_data) { const char *rsc = user_data; struct recurring_op_s *pending = value; if(safe_str_eq(rsc, pending->rsc_id)) { crm_info("Removing op %s:%d for deleted resource %s", pending->op_key, pending->call_id, rsc); return TRUE; } return FALSE; } /* A_LRM_INVOKE */ void do_lrm_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { gboolean done = FALSE; gboolean create_rsc = TRUE; const char *crm_op = NULL; const char *from_sys = NULL; const char *from_host = NULL; const char *operation = NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); crm_op = crm_element_value(input->msg, F_CRM_TASK); from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM); if(safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); } crm_debug_2("LRM command from: %s", from_sys); if(safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) { operation = CRMD_ACTION_DELETE; } else if(safe_str_eq(operation, CRM_OP_LRM_REFRESH)) { crm_op = CRM_OP_LRM_REFRESH; } else if(safe_str_eq(crm_op, CRM_OP_LRM_FAIL)) { #if HAVE_LRM_ASYNC_FAIL lrm_rsc_t *rsc = NULL; xmlNode *xml_rsc = find_xml_node( input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); rsc = get_lrm_resource(xml_rsc, input->xml, create_rsc); if(rsc) { int rc = HA_OK; crm_info("Failing resource %s...", rsc->id); rc = fsa_lrm_conn->lrm_ops->fail_rsc(fsa_lrm_conn, rsc->id, 1, "do_lrm_invoke: Async failure"); if(rc != HA_OK) { crm_err("Could not initiate an asynchronous failure for %s (%d)", rsc->id, rc); } lrm_free_rsc(rsc); } else { crm_info("Cannot find/create resource in order to fail it..."); crm_log_xml_warn(input->msg, "bad input"); } return; #else crm_info("Failing resource..."); operation = "fail"; #endif } else if(input->xml != NULL) { operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK); } if(safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) { enum cib_errors rc = cib_ok; xmlNode *fragment = do_lrm_query(TRUE); crm_info("Forcing a local LRM refresh"); fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc); free_xml(fragment); } else if(safe_str_eq(crm_op, CRM_OP_LRM_QUERY)) { xmlNode *data = do_lrm_query(FALSE); xmlNode *reply = create_reply(input->msg, data); if(relay_message(reply, TRUE) == FALSE) { crm_err("Unable to route reply"); crm_log_xml(LOG_ERR, "reply", reply); } free_xml(reply); free_xml(data); } else if(safe_str_eq(operation, CRM_OP_PROBED) || safe_str_eq(crm_op, CRM_OP_REPROBE)) { int cib_options = cib_inhibit_notify; const char *probed = XML_BOOLEAN_TRUE; if(safe_str_eq(crm_op, CRM_OP_REPROBE)) { cib_options = cib_none; probed = XML_BOOLEAN_FALSE; } update_attr(fsa_cib_conn, cib_inhibit_notify, XML_CIB_TAG_STATUS, fsa_our_uuid, NULL, NULL, CRM_OP_PROBED, probed, FALSE); } else if(operation != NULL) { lrm_rsc_t *rsc = NULL; xmlNode *params = NULL; xmlNode *xml_rsc = find_xml_node( input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); /* only the first 16 chars are used by the LRM */ params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE); if(safe_str_eq(operation, CRMD_ACTION_DELETE)) { create_rsc = FALSE; } rsc = get_lrm_resource(xml_rsc, input->xml, create_rsc); if(rsc == NULL && create_rsc) { crm_err("Invalid resource definition"); crm_log_xml_warn(input->msg, 
"bad input"); } else if(rsc == NULL) { lrm_op_t* op = NULL; crm_err("Not creating resource for a %s event: %s", operation, ID(input->xml)); crm_log_xml_warn(input->msg, "bad input"); op = construct_op(input->xml, ID(xml_rsc), operation); op->op_status = LRM_OP_DONE; op->rc = EXECRA_OK; CRM_ASSERT(op != NULL); send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); free_lrm_op(op); } else if(safe_str_eq(operation, CRMD_ACTION_CANCEL)) { lrm_op_t* op = NULL; char *op_key = NULL; int call = 0; const char *call_id = NULL; const char *op_task = NULL; const char *op_interval = NULL; CRM_CHECK(params != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); op_interval = crm_element_value(params, crm_meta_name("interval")); op_task = crm_element_value(params, crm_meta_name(XML_LRM_ATTR_TASK)); call_id = crm_element_value(params, crm_meta_name(XML_LRM_ATTR_CALLID)); #if CRM_DEPRECATED_SINCE_2_0_5 if(op_interval == NULL) { op_interval = crm_element_value(params, "interval"); } if(op_task == NULL) { op_task = crm_element_value(params, XML_LRM_ATTR_TASK); if(op_task == NULL) { op_task = crm_element_value(params, "task"); } } #endif CRM_CHECK(op_task != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); CRM_CHECK(op_interval != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); op = construct_op(input->xml, rsc->id, op_task); CRM_ASSERT(op != NULL); op_key = generate_op_key( rsc->id,op_task,crm_parse_int(op_interval,"0")); crm_debug("PE requested op %s (call=%s) be cancelled", op_key, call_id?call_id:"NA"); call = crm_parse_int(call_id, "0"); if(call == 0) { /* the normal case when the PE cancels a recurring op */ done = cancel_op_key(rsc, op_key, TRUE); } else { /* the normal case when the PE cancels an orphan op */ done = cancel_op(rsc, NULL, call, TRUE); } if(done == FALSE) { crm_debug("Nothing known about operation %d for %s", call, op_key); delete_op_entry(NULL, rsc->id, op_key, call); /* needed?? surely not otherwise the cancel_op_(_key) wouldn't * have failed in the first place */ g_hash_table_remove(pending_ops, op_key); } op->rc = EXECRA_OK; op->op_status = LRM_OP_DONE; send_direct_ack(from_host, from_sys, rsc, op, rsc->id); crm_free(op_key); free_lrm_op(op); } else if(safe_str_eq(operation, CRMD_ACTION_DELETE)) { int rc = HA_OK; lrm_op_t* op = NULL; CRM_ASSERT(rsc != NULL); op = construct_op(input->xml, rsc->id, operation); CRM_ASSERT(op != NULL); op->op_status = LRM_OP_DONE; op->rc = EXECRA_OK; crm_info("Removing resource %s from the LRM", rsc->id); rc = fsa_lrm_conn->lrm_ops->delete_rsc(fsa_lrm_conn, rsc->id); if(rc != HA_OK) { crm_err("Failed to remove resource %s", rsc->id); op->op_status = LRM_OP_ERROR; op->rc = EXECRA_UNKNOWN_ERROR; } delete_rsc_entry(rsc->id); send_direct_ack(from_host, from_sys, rsc, op, rsc->id); free_lrm_op(op); g_hash_table_foreach_remove(pending_ops, lrm_remove_deleted_op, rsc->id); if(safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { /* this isn't expected - trigger a new transition */ time_t now = time(NULL); char *now_s = crm_itoa(now); crm_debug("Triggering a refresh after %s deleted %s from the LRM", from_sys, rsc->id); update_attr(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, "last-lrm-refresh", now_s, FALSE); crm_free(now_s); } } else if(rsc != NULL) { do_lrm_rsc_op(rsc, operation, input->xml, input->msg); } lrm_free_rsc(rsc); } else { crm_err("Operation was neither a lrm_query, nor a rsc op. 
%s", crm_str(crm_op)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } lrm_op_t * construct_op(xmlNode *rsc_op, const char *rsc_id, const char *operation) { lrm_op_t *op = NULL; const char *op_delay = NULL; const char *op_timeout = NULL; const char *op_interval = NULL; const char *transition = NULL; CRM_DEV_ASSERT(rsc_id != NULL); crm_malloc0(op, sizeof(lrm_op_t)); op->op_type = crm_strdup(operation); op->op_status = LRM_OP_PENDING; op->rc = -1; op->rsc_id = crm_strdup(rsc_id); op->interval = 0; op->timeout = 0; op->start_delay = 0; op->copyparams = 0; op->app_name = crm_strdup(CRM_SYSTEM_CRMD); if(rsc_op == NULL) { CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); op->user_data = NULL; op->user_data_len = 0; /* the stop_all_resources() case * by definition there is no DC (or they'd be shutting * us down). * So we should put our version here. */ op->params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_insert(op->params, crm_strdup(XML_ATTR_CRM_VERSION), crm_strdup(CRM_FEATURE_SET)); crm_debug_2("Constructed %s op for %s", operation, rsc_id); return op; } op->params = xml2list(rsc_op); if(op->params == NULL) { CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); } op_delay = g_hash_table_lookup(op->params, crm_meta_name("start_delay")); op_timeout = g_hash_table_lookup(op->params, crm_meta_name("timeout")); op_interval = g_hash_table_lookup(op->params, crm_meta_name("interval")); #if CRM_DEPRECATED_SINCE_2_0_5 if(op_delay == NULL) { op_delay = g_hash_table_lookup(op->params, "start_delay"); } if(op_timeout == NULL) { op_timeout = g_hash_table_lookup(op->params, "timeout"); } if(op_interval == NULL) { op_interval = g_hash_table_lookup(op->params, "interval"); } #endif op->interval = crm_parse_int(op_interval, "0"); op->timeout = crm_parse_int(op_timeout, "0"); op->start_delay = crm_parse_int(op_delay, "0"); /* sanity */ if(op->interval < 0) { op->interval = 0; } if(op->timeout < 0) { op->timeout = 0; } if(op->start_delay < 0) { op->start_delay = 0; } transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY); CRM_CHECK(transition != NULL, return op); op->user_data = crm_strdup(transition); op->user_data_len = 1+strlen(op->user_data); if(op->interval != 0) { if(safe_str_eq(operation, CRMD_ACTION_START) || safe_str_eq(operation, CRMD_ACTION_STOP)) { crm_err("Start and Stop actions cannot have an interval"); op->interval = 0; } } /* reset the resource's parameters? 
void
send_direct_ack(const char *to_host, const char *to_sys,
                lrm_rsc_t *rsc, lrm_op_t *op, const char *rsc_id)
{
    xmlNode *reply = NULL;
    xmlNode *update, *iter;
    xmlNode *fragment;

    CRM_CHECK(op != NULL, return);
    if(op->rsc_id == NULL) {
        CRM_DEV_ASSERT(rsc_id != NULL);
        op->rsc_id = crm_strdup(rsc_id);
    }
    if(to_sys == NULL) {
        to_sys = CRM_SYSTEM_TENGINE;
    }
    update = create_node_state(
        fsa_our_uname, NULL, NULL, NULL, NULL, NULL, FALSE, __FUNCTION__);

    iter = create_xml_node(update, XML_CIB_TAG_LRM);
    crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid);
    iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
    iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
    crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);

    build_operation_update(iter, rsc, op, __FUNCTION__, 0, LOG_DEBUG);
    fragment = create_cib_fragment(update, XML_CIB_TAG_STATUS);

    reply = create_request(CRM_OP_INVOKE_LRM, fragment, to_host, to_sys,
                           CRM_SYSTEM_LRMD, NULL);

    crm_log_xml_debug_2(update, "ACK Update");

    crm_info("ACK'ing resource op %s_%s_%d from %s: %s",
             op->rsc_id, op->op_type, op->interval, op->user_data,
             crm_element_value(reply, XML_ATTR_REFERENCE));

    if(relay_message(reply, TRUE) == FALSE) {
        crm_log_xml(LOG_ERR, "Unable to route reply", reply);
    }

    free_xml(fragment);
    free_xml(update);
    free_xml(reply);
}
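/* Shape of the ack fragment assembled above, with hypothetical ids and
 * assuming the usual tag expansions (XML_CIB_TAG_LRM = "lrm",
 * XML_LRM_TAG_RESOURCES = "lrm_resources", XML_LRM_TAG_RESOURCE =
 * "lrm_resource"):
 *
 *   <node_state uname="node1" ...>
 *     <lrm id="node1-uuid">
 *       <lrm_resources>
 *         <lrm_resource id="myrsc">
 *           <lrm_rsc_op .../>   <!-- from build_operation_update() -->
 *         </lrm_resource>
 *       </lrm_resources>
 *     </lrm>
 *   </node_state>
 */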
%d", operation, rsc->id, call_id); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } else if(op->interval > 0 && op->start_delay > 5 * 60 * 1000) { char *uuid = NULL; int dummy = 0, target_rc = 0; crm_info("Faking confirmation of %s: execution postponed for over 5 minutes", op_id); decode_transition_key(op->user_data, &uuid, &dummy, &dummy, &target_rc); crm_free(uuid); op->rc = target_rc; op->op_status = LRM_OP_DONE; send_direct_ack(NULL, NULL, rsc, op, rsc->id); } else { /* record all operations so we can wait * for them to complete during shutdown */ char *call_id_s = make_stop_id(rsc->id, call_id); struct recurring_op_s *pending = NULL; crm_malloc0(pending, sizeof(struct recurring_op_s)); crm_debug_2("Recording pending op: %d - %s %s", call_id, op_id, call_id_s); pending->call_id = call_id; pending->interval = op->interval; pending->op_key = crm_strdup(op_id); pending->rsc_id = crm_strdup(rsc->id); g_hash_table_replace(pending_ops, call_id_s, pending); } crm_free(op_id); free_lrm_op(op); return; } void free_recurring_op(gpointer value) { struct recurring_op_s *op = (struct recurring_op_s*)value; crm_free(op->rsc_id); crm_free(op->op_key); crm_free(op); } void free_lrm_op(lrm_op_t *op) { g_hash_table_destroy(op->params); crm_free(op->user_data); crm_free(op->output); crm_free(op->rsc_id); crm_free(op->op_type); crm_free(op->app_name); crm_free(op); } static void dup_attr(gpointer key, gpointer value, gpointer user_data) { g_hash_table_replace(user_data, crm_strdup(key), crm_strdup(value)); } lrm_op_t * copy_lrm_op(const lrm_op_t *op) { lrm_op_t *op_copy = NULL; CRM_CHECK(op != NULL, return NULL); CRM_CHECK(op->rsc_id != NULL, return NULL); crm_malloc0(op_copy, sizeof(lrm_op_t)); op_copy->op_type = crm_strdup(op->op_type); /* input fields */ op_copy->params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if(op->params != NULL) { g_hash_table_foreach(op->params, dup_attr, op_copy->params); } op_copy->timeout = op->timeout; op_copy->interval = op->interval; op_copy->target_rc = op->target_rc; /* in the CRM, this is always a string */ if(op->user_data != NULL) { op_copy->user_data = crm_strdup(op->user_data); } /* output fields */ op_copy->op_status = op->op_status; op_copy->rc = op->rc; op_copy->call_id = op->call_id; op_copy->output = NULL; op_copy->rsc_id = crm_strdup(op->rsc_id); if(op->app_name != NULL) { op_copy->app_name = crm_strdup(op->app_name); } if(op->output != NULL) { op_copy->output = crm_strdup(op->output); } return op_copy; } lrm_rsc_t * copy_lrm_rsc(const lrm_rsc_t *rsc) { lrm_rsc_t *rsc_copy = NULL; if(rsc == NULL) { return NULL; } crm_malloc0(rsc_copy, sizeof(lrm_rsc_t)); rsc_copy->id = crm_strdup(rsc->id); rsc_copy->type = crm_strdup(rsc->type); rsc_copy->class = NULL; rsc_copy->provider = NULL; if(rsc->class != NULL) { rsc_copy->class = crm_strdup(rsc->class); } if(rsc->provider != NULL) { rsc_copy->provider = crm_strdup(rsc->provider); } /* GHashTable* params; */ rsc_copy->params = NULL; rsc_copy->ops = NULL; return rsc_copy; } void cib_rsc_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { switch(rc) { case cib_ok: case cib_diff_failed: case cib_diff_resync: crm_debug_2("Resource update %d complete: rc=%d", call_id, rc); break; default: crm_err("Resource update %d failed: (rc=%d) %s", call_id, rc, cib_error2string(rc)); } } int do_update_resource(lrm_op_t* op) { /* */ int rc = cib_ok; lrm_rsc_t *rsc = NULL; xmlNode *update, *iter; CRM_CHECK(op != NULL, return 0); update = create_node_state( 
lrm_rsc_t *
copy_lrm_rsc(const lrm_rsc_t *rsc)
{
    lrm_rsc_t *rsc_copy = NULL;

    if(rsc == NULL) {
        return NULL;
    }

    crm_malloc0(rsc_copy, sizeof(lrm_rsc_t));

    rsc_copy->id = crm_strdup(rsc->id);
    rsc_copy->type = crm_strdup(rsc->type);
    rsc_copy->class = NULL;
    rsc_copy->provider = NULL;

    if(rsc->class != NULL) {
        rsc_copy->class = crm_strdup(rsc->class);
    }
    if(rsc->provider != NULL) {
        rsc_copy->provider = crm_strdup(rsc->provider);
    }
    /* GHashTable* params; */
    rsc_copy->params = NULL;
    rsc_copy->ops = NULL;

    return rsc_copy;
}

void
cib_rsc_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data)
{
    switch(rc) {
        case cib_ok:
        case cib_diff_failed:
        case cib_diff_resync:
            crm_debug_2("Resource update %d complete: rc=%d", call_id, rc);
            break;
        default:
            crm_err("Resource update %d failed: (rc=%d) %s",
                    call_id, rc, cib_error2string(rc));
    }
}

int
do_update_resource(lrm_op_t *op)
{
/* */
    int rc = cib_ok;
    lrm_rsc_t *rsc = NULL;
    xmlNode *update, *iter;

    CRM_CHECK(op != NULL, return 0);

    update = create_node_state(
        fsa_our_uname, NULL, NULL, NULL, NULL, NULL, FALSE, __FUNCTION__);

    iter = create_xml_node(update, XML_CIB_TAG_LRM);
    crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid);
    iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
    iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
    crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);

    rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, op->rsc_id);

    CRM_CHECK(rsc->type != NULL,
              crm_err("Resource %s has no value for type", op->rsc_id));
    CRM_CHECK(rsc->class != NULL,
              crm_err("Resource %s has no value for class", op->rsc_id));

    crm_xml_add(iter, XML_ATTR_TYPE, rsc->type);
    crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->class);
    crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider);

    build_operation_update(iter, rsc, op, __FUNCTION__, 0, LOG_DEBUG);
    lrm_free_rsc(rsc);

    /* make it an asynchronous call and be done with it
     *
     * Best case:
     *   the resource state will be discovered during
     *   the next signup or election.
     *
     * Bad case:
     *   we are shutting down and there is no DC at the time,
     *   but then why were we shutting down then anyway?
     *   (probably because of an internal error)
     *
     * Worst case:
     *   we get shot for having resources "running" when they really weren't
     *
     * the alternative however means blocking here for too long, which
     * isn't acceptable
     */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, cib_quorum_override, rc);

    /* the return code is a call number, not an error code */
    crm_debug_2("Sent resource state update message: %d", rc);
    fsa_cib_conn->cmds->register_callback(
        fsa_cib_conn, rc, 60, FALSE, NULL, "cib_rsc_callback", cib_rsc_callback);

    free_xml(update);
    return rc;
}

void
do_lrm_event(long long action,
             enum crmd_fsa_cause cause,
             enum crmd_fsa_state cur_state,
             enum crmd_fsa_input cur_input,
             fsa_data_t *msg_data)
{
    CRM_CHECK(FALSE, return);
}
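/* Sketch of the asynchronous update lifecycle in do_update_resource():
 * fsa_cib_update() hands back a CIB call number (reused here as "rc"),
 * and the outcome arrives later in cib_rsc_callback().  With a
 * hypothetical call number of 17:
 *
 *   rc = 17;                            // a call id, not an error code
 *   ...->register_callback(fsa_cib_conn, 17, 60, FALSE, NULL,
 *                          "cib_rsc_callback", cib_rsc_callback);
 *   // up to 60s later: cib_rsc_callback(msg, 17, cib_ok, ...) fires
 */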
gboolean
process_lrm_event(lrm_op_t *op)
{
    char *op_id = NULL;
    char *op_key = NULL;
    int update_id = 0;
    int log_level = LOG_ERR;
    gboolean removed = FALSE;
    struct recurring_op_s *pending = NULL;

    CRM_CHECK(op != NULL, return FALSE);
    CRM_CHECK(op->rsc_id != NULL, return FALSE);

    op_key = generate_op_key(op->rsc_id, op->op_type, op->interval);

    switch(op->op_status) {
        case LRM_OP_ERROR:
        case LRM_OP_PENDING:
        case LRM_OP_NOTSUPPORTED:
            break;
        case LRM_OP_CANCELLED:
            log_level = LOG_INFO;
            break;
        case LRM_OP_DONE:
            log_level = LOG_INFO;
            break;
        case LRM_OP_TIMEOUT:
            log_level = LOG_DEBUG_3;
            crm_err("LRM operation %s (%d) %s (timeout=%dms)",
                    op_key, op->call_id, op_status2text(op->op_status), op->timeout);
            break;
        default:
            crm_err("Mapping unknown status (%d) to ERROR", op->op_status);
            op->op_status = LRM_OP_ERROR;
    }

    if(op->op_status == LRM_OP_ERROR
       && (op->rc == EXECRA_RUNNING_MASTER || op->rc == EXECRA_NOT_RUNNING)) {
        /* Leave it up to the TE/PE to decide if this is an error */
        op->op_status = LRM_OP_DONE;
        log_level = LOG_INFO;
    }

    op_id = make_stop_id(op->rsc_id, op->call_id);
    pending = g_hash_table_lookup(pending_ops, op_id);

    if(op->op_status != LRM_OP_CANCELLED) {
        update_id = do_update_resource(op);
        if(op->interval != 0) {
            goto out;
        }

    } else if(op->interval == 0) {
        /* no known valid reason for this to happen */
        crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id);

    } else if(pending == NULL) {
        crm_err("Op %s (call=%d): No 'pending' entry", op_key, op->call_id);

    } else if(op->user_data == NULL) {
        crm_err("Op %s (call=%d): No user data", op_key, op->call_id);

    } else if(pending->remove) {
        delete_op_entry(op, op->rsc_id, op_key, op->call_id);

    } else {
        crm_debug("Op %s (call=%d): no delete event required",
                  op_key, op->call_id);
    }

    if(g_hash_table_remove(pending_ops, op_id)) {
        removed = TRUE;
        crm_debug_2("Op %s (call=%d, stop-id=%s): Confirmed",
                    op_key, op->call_id, op_id);
    }

  out:
    do_crm_log(log_level,
               "LRM operation %s (call=%d, rc=%d, cib-update=%d, confirmed=%s) %s %s",
               op_key, op->call_id, op->rc, update_id, removed?"true":"false",
               op_status2text(op->op_status), execra_code2string(op->rc));

    if(op->rc != 0 && op->output != NULL) {
        crm_info("Result: %s", op->output);

    } else if(op->output != NULL) {
        crm_debug("Result: %s", op->output);
    }

    crm_free(op_key);
    crm_free(op_id);
    return TRUE;
}

char *
make_stop_id(const char *rsc, int call_id)
{
    char *op_id = NULL;
    crm_malloc0(op_id, strlen(rsc) + 34);
    if(op_id != NULL) {
        snprintf(op_id, strlen(rsc) + 34, "%s:%d", rsc, call_id);
    }
    return op_id;
}
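/* Example: make_stop_id("myrsc", 42) allocates and returns "myrsc:42";
 * the strlen(rsc) + 34 sizing leaves generous room for the ':' plus any
 * int's decimal digits.
 */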
diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in
index 9af2dc645e..e86955b637 100755
--- a/cts/CM_LinuxHAv2.py.in
+++ b/cts/CM_LinuxHAv2.py.in
@@ -1,586 +1,588 @@
#!@PYTHON@

'''CTS: Cluster Testing System: LinuxHA v2 dependent modules...
'''

__copyright__='''
Author: Huang Zhen
Copyright (C) 2004 International Business Machines

Additional Audits, Revised Start action, Default Configuration:
     Copyright (C) 2004 Andrew Beekhof
'''

#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

import os,sys,CTS,CTSaudits,CTStests, warnings
from CTS import *
from CM_hb import HeartbeatCM
from CTSaudits import ClusterAudit
from CTStests import *
from CIB import *
try:
    from xml.dom.minidom import *
except ImportError:
    sys.__stdout__.write("Python module xml.dom.minidom not found\n")
    sys.__stdout__.write("Please install python-xml or similar before continuing\n")
    sys.__stdout__.flush()
    sys.exit(1)

#######################################################################
#
#  LinuxHA v2 dependent modules
#
#######################################################################

class LinuxHAv2(HeartbeatCM):
    '''
    The linux-ha version 2 cluster manager class.
    It implements the things we need to talk to and manipulate
    linux-ha version 2 clusters
    '''
    def __init__(self, Environment, randseed=None):
        HeartbeatCM.__init__(self, Environment, randseed=randseed)

        self.fastfail = 0
        self.clear_cache = 0
        self.cib_installed = 0
        self.config = None
        self.cluster_monitor = 0
        self.use_short_names = 1

        self.update({
            "Name"           : "linux-ha-v2",
            "DeadTime"       : 300,
            "StartTime"      : 300,        # Max time to start up
            "StableTime"     : 30,
            "StartCmd"       : "@INITDIR@/heartbeat@INIT_EXT@ start > /dev/null 2>&1",
            "StopCmd"        : "@INITDIR@/heartbeat@INIT_EXT@ stop  > /dev/null 2>&1",
            "ElectionCmd"    : "@sbindir@/crmadmin -E %s",
            "StatusCmd"      : "@sbindir@/crmadmin -t 60000 -S %s 2>/dev/null",
            "EpocheCmd"      : "@sbindir@/ccm_tool -H -e",
            "QuorumCmd"      : "@sbindir@/ccm_tool -H -q",
            "ParitionCmd"    : "@sbindir@/ccm_tool -H -p",
            "CibQuery"       : "@sbindir@/cibadmin -Ql",
            "ExecuteRscOp"   : "@libdir@/heartbeat/lrmadmin -n %s -E %s %s 0 %d EVERYTIME 2>&1",
            "CIBfile"        : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml",
            "TmpDir"         : "/tmp",
            "BreakCommCmd2"  : "@HA_NOARCHDATAHBDIR@/TestHeartbeatComm break-communication %s>/dev/null 2>&1",
            "IsIPAddrRscRunning"  : "",
            "StandbyCmd"     : "@sbindir@/crm_standby -U %s -v %s 2>/dev/null",
            "UUIDQueryCmd"   : "@sbindir@/crmadmin -N",
            "StandbyQueryCmd"    : "@sbindir@/crm_standby -GQ -U %s 2>/dev/null",

            # Patterns to look for in the log files for various occasions...
            "Pat:DC_IDLE"      : "crmd.*State transition.*-> S_IDLE",

            # This won't work if we have multiple partitions
            "Pat:Local_started" : "%s crmd:.*The local CRM is operational",
            "Pat:Slave_started" : "%s crmd:.*State transition.*-> S_NOT_DC",
            "Pat:Master_started": "%s crmd:.* State transition.*-> S_IDLE",
            "Pat:We_stopped"   : "heartbeat.*%s.*Heartbeat shutdown complete",
            "Pat:Logd_stopped" : "%s logd:.*Exiting write process",
            "Pat:They_stopped" : "%s crmd:.*LOST:.* %s ",
            "Pat:All_stopped"  : "heartbeat.*%s.*Heartbeat shutdown complete",
            "Pat:They_dead"    : "node %s.*: is dead",
            "Pat:TransitionComplete" : "Transition status: Complete: complete",

            "Pat:ChildKilled"  : "%s heartbeat.*%s.*killed by signal 9",
            "Pat:ChildRespawn" : "%s heartbeat.*Respawning client.*%s",
            "Pat:ChildExit"    : "ERROR: Client .* exited with return code",

            # Bad news Regexes.  Should never occur.
            "BadRegexes"   : (
                r"ERROR:",
                r"CRIT:",
                r"Shutting down\.",
                r"Forcing shutdown\.",
                r"Timer I_TERMINATE just popped",
                r"input=I_ERROR",
                r"input=I_FAIL",
                r"input=I_INTEGRATED cause=C_TIMER_POPPED",
                r"input=I_FINALIZED cause=C_TIMER_POPPED",
                r"input=I_ERROR",
                r", exiting\.",
                r"WARN.*Ignoring HA message.*vote.*not in our membership list",
                r"pengine.*Attempting recovery of resource",
                r"is taking more than 2x its timeout",
                r"Confirm not received from",
                r"Welcome reply not received from",
                r"Attempting to schedule .* after a stop",
                r"Resource .* was active at shutdown",
                r"duplicate entries for call_id",
                r"Search terminated:",
                r"No need to invoke the TE",
                r"global_timer_callback:",
                r"Faking parameter digest creation",
                r"Parameters to .* action changed:",
                r"Parameters to .* changed",
            ),
        })
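        # Usage note (illustrative): the "Pat:*" entries above are
        # printf-style templates that callers fill in per node, e.g.
        #   self["Pat:Slave_started"] % "node1"
        #   => "node1 crmd:.*State transition.*-> S_NOT_DC"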
"BadRegexes" : ( r"ERROR:", r"CRIT:", r"Shutting down\.", r"Forcing shutdown\.", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r", exiting\.", r"WARN.*Ignoring HA message.*vote.*not in our membership list", r"pengine.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r"No need to invoke the TE", r"global_timer_callback:", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", ), }) del self["Standby"] if self.Env["DoBSC"]: del self["Pat:They_stopped"] del self["Pat:Logd_stopped"] self.Env["use_logd"] = 0 self.check_transitions = 0 self.check_elections = 0 self.CIBsync = {} self.default_cts_cib=CIB(self).cib() self.debug(self.default_cts_cib) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end '''Return list of errors which are known and very noisey should be ignored''' if 1: return [ "ERROR: Message hist queue is filling up", "stonithd: .*CRIT: external_hostlist: 'vmware gethosts' returned an empty hostlist", "stonithd: .*ERROR: Could not list nodes for stonith RA external/vmware.", "pengine: Preventing .* from re-starting", ] return [] def install_config(self, node): if not self.ns.WaitForNodeToComeUp(node): self.log("Node %s is not up." % node) return None if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1: self.CIBsync[node] = 1 self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.last") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig.last") # Only install the CIB on the first node, all the other ones will pick it up from there if self.cib_installed == 1: return None self.cib_installed = 1 if self.Env["CIBfilename"] == None: self.debug("Installing Generated CIB on node %s" %(node)) warnings.filterwarnings("ignore") cib_file=os.tmpnam() warnings.resetwarnings() os.system("rm -f "+cib_file) self.debug("Creating new CIB for " + node + " in: " + cib_file) os.system("echo \'" + self.default_cts_cib + "\' > " + cib_file) if 0!=self.rsh.echo_cp(None, cib_file, node, "@HA_VARLIBDIR@/heartbeat/crm/cib.xml"): raise ValueError("Can not create CIB on %s "%node) os.system("rm -f "+cib_file) else: self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node)) if 0!=self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)): raise ValueError("Can not scp file to %s "%node) self.rsh.remote_py(node, "os", "system", "chown @HA_CCMUSER@ @HA_VARLIBDIR@/heartbeat/crm/cib.xml") def prepare(self): '''Finish the Initialization process. 
    def prepare(self):
        '''Finish the Initialization process.  Prepare to test...'''

        for node in self.Env["nodes"]:
            self.ShouldBeStatus[node] = ""
            self.StataCM(node)

    def test_node_CM(self, node):
        '''Report the status of the cluster manager on a given node'''

        watchpats = [ ]
        watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)")
        watchpats.append(self["Pat:Slave_started"]%node)

        idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats)
        idle_watch.setwatch()

        out=self.rsh.readaline(node, self["StatusCmd"]%node)
        self.debug("Node %s status: '%s'" %(node, out))

        if not out or string.find(out, 'ok') < 0:
            if self.ShouldBeStatus[node] == self["up"]:
                self.log(
                    "Node status for %s is %s but we think it should be %s"
                    %(node, self["down"], self.ShouldBeStatus[node]))
            self.ShouldBeStatus[node]=self["down"]
            return 0

        if self.ShouldBeStatus[node] == self["down"]:
            self.log(
                "Node status for %s is %s but we think it should be %s: %s"
                %(node, self["up"], self.ShouldBeStatus[node], out))

        self.ShouldBeStatus[node]=self["up"]

        # check the output first - because syslog-ng loses messages
        if string.find(out, 'S_NOT_DC') != -1:
            # Up and stable
            return 2
        if string.find(out, 'S_IDLE') != -1:
            # Up and stable
            return 2

        # fall back to syslog-ng and wait
        if not idle_watch.look():
            # just up
            self.debug("Warn: Node %s is unstable: %s" %(node, out))
            return 1

        # Up and stable
        return 2

    # Is the node up or is the node down
    def StataCM(self, node):
        '''Report the status of the cluster manager on a given node'''

        if self.test_node_CM(node) > 0:
            return 1
        return None

    # Being up and being stable is not the same question...
    def node_stable(self, node):
        '''Report the status of the cluster manager on a given node'''

        if self.test_node_CM(node) == 2:
            return 1
        self.log("Warn: Node %s not stable" %(node))
        return None

    def cluster_stable(self, timeout=None):
        watchpats = [ ]
        watchpats.append("Current ping state: S_IDLE")
        watchpats.append(self["Pat:DC_IDLE"])
        self.debug("Waiting for cluster stability...")

        if timeout == None:
            timeout = self["DeadTime"]

        idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout)
        idle_watch.setwatch()

        any_up = 0
        for node in self.Env["nodes"]:
            # have each node dump its current state
            if self.ShouldBeStatus[node] == self["up"]:
                self.rsh.readaline(node, (self["StatusCmd"] %node) )
                any_up = 1

        if any_up == 0:
            self.debug("Cluster is inactive")
            return 1

        ret = idle_watch.look()
        if ret:
            self.debug(ret)
            return 1

        self.log("Warn: Cluster Master not IDLE after %ds" % timeout)
        return None
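    # Note on the status protocol above (illustrative): test_node_CM()
    # returns 0 -> down, 1 -> up but unstable, 2 -> up and stable, which
    # is why StataCM() treats any value > 0 as "up" while node_stable()
    # insists on exactly 2.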
#self.debug("RscOp '%s' on %s: %d" % (cmd, node, rc)) #for line in lines: # self.debug("RscOp: "+line) return rc def ResourceLocation(self, rid): ResourceNodes = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == self["up"]: dummy = 0 rc = self.ResourceOp(rid, "monitor", node) # Strange error codes from remote_py # 65024 == not installed # 2048 == 8 # 1792 == 7 # 0 == 0 if rc == 65024: dummy = 1 #self.debug("%s is not installed on %s: %d" % (rid, node, rc)) elif rc == 0 or rc == 2048 or rc == 8: ResourceNodes.append(node) elif rc == 7 or rc == 1792: dummy = 1 #self.debug("%s is not running on %s: %d" % (rid, node, rc)) else: # not active on this node? self.log("Unknown rc code for %s on %s: %d" % (rid, node, rc)) return ResourceNodes def isolate_node(self, node, allowlist): '''isolate the communication between the nodes''' rc = self.rsh(node, self["BreakCommCmd2"]%allowlist) if rc == 0: return 1 else: self.log("Could not break the communication from node: %s",node) return None def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == self["up"]: partition = self.rsh.readaline(node, self["ParitionCmd"]) if not partition: self.log("no partition details for %s" %node) elif len(partition) > 2: partition = partition[:-1] found=0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug("Adding partition from %s: %s" %(node, partition)) ccm_partitions.append(partition) else: self.debug("Partition '%s' is consistent with existing entries" %(partition)) else: self.log("bad partition details for %s" %node) else: self.debug("Node %s is down... skipping" %node) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == self["up"]: quorum = self.rsh.readaline(node, self["QuorumCmd"]) if string.find(quorum, "1") != -1: return 1 elif string.find(quorum, "0") != -1: return 0 else: self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum) return 0 def Components(self): complist = [] common_ignore = [ "Pending action:", "ERROR: crm_log_message_adv:", "ERROR: MSG: No message to dump", "pending LRM operations at shutdown", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Sending message to CIB service FAILED", "crmd: .*Action A_RECOVER .* not supported", "ERROR: stonithd_op_result_ready: not signed on", "send_ipc_message: IPC Channel to .* is not connected", "unconfirmed_actions: Waiting on .* unconfirmed actions", "cib_native_msgready: Message pending on command channel", "crmd:.*do_exit: Performing A_EXIT_1 - forcefully exiting the CRMd", "verify_stopped: Resource .* was active at shutdown. 
You may ignore this error if it is unmanaged.", ] stonith_ignore = [ "ERROR: stonithd_signon: ", "update_failcount: Updating failcount for child_DoFencing", "ERROR: te_connect_stonith: Sign-in failed: triggered a retry", ] stonith_ignore.extend(common_ignore) ccm = Process("ccm", 0, [ "State transition S_IDLE", "CCM connection appears to have failed", "crmd: .*Action A_RECOVER .* not supported", "crmd: .*Input I_TERMINATE from do_recover", "Exiting to recover from CCM connection failure", "crmd:.*do_exit: Could not recover from internal error", "crmd: .*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)", # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "A new node joined the cluster", # "Processing I_NODE_JOIN:.* cause=C_HA_MESSAGE", # "State transition S_.* -> S_INTEGRATION.*input=I_NODE_JOIN", "State transition S_STARTING -> S_PENDING", ], [], common_ignore, self.fastfail, self) - cib = Process("cib", 0, [ "State transition S_IDLE", "Lost connection to the CIB service", "Connection to the CIB terminated...", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*I_ERROR.*crmd_cib_connection_destroy", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, self.fastfail, self) lrmd = Process("lrmd", 0, [ "State transition S_IDLE", "LRM Connection failed", "crmd: .*I_ERROR.*lrm_connection_destroy", "State transition S_STARTING -> S_PENDING", ".*crmd .*exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, self.fastfail, self) crmd = Process("crmd", 0, [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "State transition .* S_IDLE", "State transition S_STARTING -> S_PENDING", ], [ - "pengine: .*ERROR: subsystem_msg_dispatch: The server .* has left us: Shutting down...NOW", ], common_ignore, self.fastfail, self) pengine = Process("pengine", 1, [ "State transition S_IDLE", ".*crmd .*exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", - "crmd:.*do_exit: Could not recover from internal error", + "crmd: .*do_exit: Could not recover from internal error", + "crmd: .*CRIT: pe_connection_destroy: Connection to the Policy Engine failed", + "crmd: .*I_ERROR.*pe_connection_destroy", + "crmd: .*I_ERROR.*crmdManagedChildDied", + "crmd: .*ERROR: crmdManagedChildDied: The pengine subsystem terminated unexpectedly", ], [], common_ignore, self.fastfail, self) if self.Env["DoFencing"] == 1 : complist.append(Process("stonithd", 0, [], [ - "tengine_stonith_connection_destroy: Fencing daemon has left us", + "crmd: .*CRIT: tengine_stonith_connection_destroy: Fencing daemon connection failed", "Attempting connection to fencing daemon", "te_connect_stonith: Connected", ], stonith_ignore, 0, self)) # complist.append(Process("heartbeat", 0, [], [], [], None, self)) if self.fastfail == 0: ccm.pats.extend([ "ERROR: Client .*attrd exited with return code 1", "ERROR: Respawning client .*attrd", "ERROR: Client .*cib exited with return code 2", "ERROR: Respawning client .*cib", "ERROR: Client .*crmd exited with return code 2", "ERROR: Respawning client .*crmd" ]) cib.pats.extend([ "ERROR: Client .*attrd exited with return code 1", "ERROR: Respawning client .*attrd", "ERROR: Client .*crmd exited with return code 2", "ERROR: Respawning client 
.*crmd" ]) lrmd.pats.extend([ "ERROR: Client .*crmd exited with return code 2", "ERROR: Respawning client .*crmd" ]) pengine.pats.extend([ "ERROR: Client .*crmd exited with return code 2", "ERROR: Respawning client .*crmd" ]) complist.append(ccm) complist.append(cib) complist.append(lrmd) complist.append(crmd) complist.append(pengine) return complist def NodeUUID(self, node): lines = self.rsh.readlines(node, self["UUIDQueryCmd"]) for line in lines: self.debug("UUIDLine:"+ line) m = re.search(r'%s.+\((.+)\)' % node, line) if m: return m.group(1) return "" def StandbyStatus(self, node): out=self.rsh.readaline(node, self["StandbyQueryCmd"]%node) if not out: return "off" out = out[:-1] self.debug("Standby result: "+out) return out # status == "on" : Enter Standby mode # status == "off": Enter Active mode def SetStandbyMode(self, node, status): current_status = self.StandbyStatus(node) cmd = self["StandbyCmd"] % (node, status) ret = self.rsh(node, cmd) return True ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass diff --git a/cts/CTStests.py.in b/cts/CTStests.py.in index 7c2d77a83c..96b567863b 100644 --- a/cts/CTStests.py.in +++ b/cts/CTStests.py.in @@ -1,2471 +1,2469 @@ #!@PYTHON@ '''CTS: Cluster Testing System: Tests module There are a few things we want to do here: ''' __copyright__=''' Copyright (C) 2000, 2001 Alan Robertson Licensed under the GNU GPL. Add RecourceRecover testcase Zhao Kai ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # SPECIAL NOTE: # # Tests may NOT implement any cluster-manager-specific code in them. # EXTEND the ClusterManager object to provide the base capabilities # the test needs if you need to do something that the current CM classes # do not. Otherwise you screw up the whole point of the object structure # in CTS. # # Thank you. # import CTS from CM_hb import HBConfig import CTSaudits import time, os, re, types, string, tempfile, sys from CTSaudits import * from stat import * # List of all class objects for tests which we ought to # consider running. class RandomTests: ''' A collection of tests which are run at random. 
''' def __init__(self, scenario, cm, tests, Audits): self.CM = cm self.Env = cm.Env self.Scenario = scenario self.Tests = [] self.Audits = [] self.ns=CTS.NodeStatus(self.Env) for test in tests: if not issubclass(test.__class__, CTSTest): raise ValueError("Init value must be a subclass of CTSTest") if test.is_applicable(): self.Tests.append(test) if not scenario.IsApplicable(): raise ValueError("Scenario not applicable in" " given Environment") self.Stats = {"success":0, "failure":0, "BadNews":0} self.IndividualStats= {} for audit in Audits: if not issubclass(audit.__class__, ClusterAudit): raise ValueError("Init value must be a subclass of ClusterAudit") if audit.is_applicable(): self.Audits.append(audit) def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def audit(self, BadNews, test): errcount=0 BadNewsDebug=0 #BadNews.debug=1 ignorelist = [] ignorelist.append(" CTS: ") ignorelist.append("BadNews:") ignorelist.extend(self.CM.errorstoignore()) if test: ignorelist.extend(test.errorstoignore()) while errcount < 1000: if BadNewsDebug: print "Looking for BadNews" match=BadNews.look(0) if match: if BadNewsDebug: print "BadNews found: "+match add_err = 1 for ignore in ignorelist: if add_err == 1 and re.search(ignore, match): if BadNewsDebug: print "Ignoring based on pattern: ("+ignore+")" add_err = 0 if add_err == 1: self.CM.log("BadNews: " + match) self.incr("BadNews") errcount=errcount+1 else: break else: self.CM.log("Big problems. Shutting down.") self.CM.stopall() self.summarize() raise ValueError("Looks like we hit the jackpot! :-)") for audit in self.Audits: if not audit(): self.CM.log("Audit " + audit.name() + " FAILED.") self.incr("auditfail") if test: test.incr("auditfail") def summarize(self): self.CM.log("****************") self.CM.log("Overall Results:" + repr(self.Stats)) self.CM.log("****************") self.CM.log("Detailed Results") for test in self.Tests: self.CM.log("Test %s: \t%s" %(test.name, repr(test.Stats))) self.CM.log("<<<<<<<<<<<<<<<< TESTS COMPLETED") def run(self, max=1): ( ''' Set up the given scenario, then run the selected tests at random for the selected number of iterations. ''') BadNews=CTS.LogWatcher(self.CM["LogFileName"], self.CM["BadRegexes"] , timeout=0) BadNews.setwatch() self.CM.ns.WaitForAllNodesToComeUp(self.CM.Env["nodes"]) self.CM.oprofileStop() self.CM.oprofileStart() if not self.CM.Env["DoBSC"]: audit = LogAudit(self.CM) if not audit(): self.CM.log("Audit " + audit.name() + " FAILED.") return None else: self.CM.log("Audit " + audit.name() + " passed.") audit = DiskAudit(self.CM) if not audit(): self.CM.log("Audit " + audit.name() + " FAILED.") return None else: self.CM.log("Audit " + audit.name() + " passed.") if not self.Scenario.SetUp(self.CM): return None self.CM.oprofileSave(0) testcount=1 time.sleep(30) # This makes sure everything is stabilized before starting... self.audit(BadNews, None) while testcount <= max: test = self.Env.RandomGen.choice(self.Tests) # Some tests want a node as an argument. 
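# --- Editor's sketch (illustration only; is_ignorable is a hypothetical
# helper, not part of CTS) ---
# The audit() method above applies a simple first-match-wins filter: a
# BadNews line is dropped if any ignore regex matches it.  Distilled:
#
#   import re
#   def is_ignorable(line, ignorelist):
#       for ignore in ignorelist:
#           if re.search(ignore, line):
#               return 1        # known-noisy, don't count it
#       return 0                # genuine BadNews
#
# The ignorelist is the union of the CM-wide errorstoignore() and the
# per-test errorstoignore(), so each test can whitelist its own side effects.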
nodechoice = self.Env.RandomNode() #logsize = os.stat(self.CM["LogFileName"])[ST_SIZE] #self.CM.log("Running test %s (%s) \t[%d : %d]" # % (test.name, nodechoice, testcount, logsize)) self.CM.log(("Running test %s" % test.name).ljust(35) + (" (%s) " % nodechoice).ljust(15) +"["+ ("%d" % testcount).rjust(3) +"]") starttime=time.time() test.starttime=starttime ret=test(nodechoice) stoptime=time.time() self.CM.oprofileSave(testcount) testcount = testcount + 1 if ret: self.incr("success") else: self.incr("failure") self.CM.log("Test %s (%s) \t[FAILED]" %(test.name,nodechoice)) # Better get the current info from the cluster... self.CM.statall() elapsed_time = stoptime - starttime test_time = stoptime - test.starttime if not test.has_key("min_time"): test["elapsed_time"] = elapsed_time test["min_time"] = test_time test["max_time"] = test_time else: test["elapsed_time"] = test["elapsed_time"] + elapsed_time if test_time < test["min_time"]: test["min_time"] = test_time if test_time > test["max_time"]: test["max_time"] = test_time self.audit(BadNews, test) self.Scenario.TearDown(self.CM) self.CM.oprofileSave(testcount) self.CM.oprofileStop() self.audit(BadNews, None) for test in self.Tests: self.IndividualStats[test.name] = test.Stats return self.Stats, self.IndividualStats AllTestClasses = [ ] class CTSTest: ''' A Cluster test. We implement the basic set of properties and behaviors for a generic cluster test. Cluster tests track their own statistics. We keep each of the kinds of counts we track as separate {name,value} pairs. ''' def __init__(self, cm): #self.name="the unnamed test" self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} # if not issubclass(cm.__class__, ClusterManager): # raise ValueError("Must be a ClusterManager object") self.CM = cm self.timeout=120 self.starttime=0 def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def failure(self, reason="none"): '''Increment the failure count''' self.incr("failure") self.CM.log("Test " + self.name + " failed [reason:" + reason + "]") return None def success(self): '''Increment the success count''' self.incr("success") return 1 def skipped(self): '''Increment the skipped count''' self.incr("skipped") return 1 def __call__(self, node): '''Perform the given test''' raise ValueError("Abstract Class member (__call__)") self.incr("calls") return self.failure() def is_applicable(self): '''Return TRUE if we are applicable in the current test configuration''' raise ValueError("Abstract Class member (is_applicable)") return 1 def canrunnow(self): '''Return TRUE if we can meaningfully run right now''' return 1 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [] ################################################################### class StopTest(CTSTest): ################################################################### '''Stop (deactivate) the cluster manager on a node''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="Stop" def __call__(self, node): '''Perform the 'stop' test. 
''' self.incr("calls") if self.CM.ShouldBeStatus[node] != self.CM["up"]: return self.skipped() patterns = [] # Technically we should always be able to notice ourselves stopping patterns.append(self.CM["Pat:We_stopped"] % node) if self.CM.Env["use_logd"]: patterns.append(self.CM["Pat:Logd_stopped"] % node) # Any active node needs to notice this one left # NOTE: This wont work if we have multiple partitions for other in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[other] == self.CM["up"] and other != node: patterns.append(self.CM["Pat:They_stopped"] %(other, node)) #self.debug("Checking %s will notice %s left"%(other, node)) watch = CTS.LogWatcher( self.CM["LogFileName"], patterns, self.CM["DeadTime"]) watch.setwatch() if node == self.CM.OurNode: self.incr("us") else: if self.CM.upcount() <= 1: self.incr("all") else: self.incr("them") self.CM.StopaCM(node) watch_result = watch.lookforall() failreason=None UnmatchedList = "||" if watch.unmatched: (rc, output) = self.CM.rsh.remote_py(node, "os", "system", "/bin/ps axf") for line in output: self.CM.debug(line) for regex in watch.unmatched: self.CM.log ("ERROR: Shutdown pattern not found: %s" % (regex)) UnmatchedList += regex + "||"; failreason="Missing shutdown pattern" self.CM.cluster_stable(self.CM["DeadTime"]) if not watch.unmatched or self.CM.upcount() == 0: return self.success() if len(watch.unmatched) >= self.CM.upcount(): return self.failure("no match against (%s)" % UnmatchedList) if failreason == None: return self.success() else: return self.failure(failreason) # # We don't register StopTest because it's better when called by # another test... # ################################################################### class StartTest(CTSTest): ################################################################### '''Start (activate) the cluster manager on a node''' def __init__(self, cm, debug=None): CTSTest.__init__(self,cm) self.name="start" self.debug = debug def __call__(self, node): '''Perform the 'start' test. ''' self.incr("calls") if self.CM.upcount() == 0: self.incr("us") else: self.incr("them") if self.CM.ShouldBeStatus[node] != self.CM["down"]: return self.skipped() elif self.CM.StartaCM(node): return self.success() else: return self.failure("Startup %s on node %s failed" %(self.CM["Name"], node)) def is_applicable(self): '''StartTest is always applicable''' return 1 # # We don't register StartTest because it's better when called by # another test... # ################################################################### class FlipTest(CTSTest): ################################################################### '''If it's running, stop it. If it's stopped start it. Overthrow the status quo... ''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Flip" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'Flip' test. ''' self.incr("calls") if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.incr("stopped") ret = self.stop(node) type="up->down" # Give the cluster time to recognize it's gone... 
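# --- Editor's sketch (illustration only; stop_verdict is hypothetical) ---
# StopTest above reaches its verdict from the shutdown-pattern watch with
# roughly this decision table (upcount() is the number of nodes still up):
#
#   def stop_verdict(unmatched, upcount):
#       if not unmatched or upcount == 0:
#           return "success"                  # all patterns seen, or no witnesses left
#       if len(unmatched) >= upcount:
#           return "failure: no match"        # not even one survivor saw us stop
#       return "failure: missing shutdown pattern"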
time.sleep(self.CM["StableTime"]) elif self.CM.ShouldBeStatus[node] == self.CM["down"]: self.incr("started") ret = self.start(node) type="down->up" else: return self.skipped() self.incr(type) if ret: return self.success() else: return self.failure("%s failure" % type) def is_applicable(self): '''FlipTest is always applicable''' return 1 # Register FlipTest as a good test to run AllTestClasses.append(FlipTest) ################################################################### class RestartTest(CTSTest): ################################################################### '''Stop and restart a node''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Restart" self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, node): '''Perform the 'restart' test. ''' self.incr("calls") self.incr("node:" + node) ret1 = 1 if self.CM.StataCM(node): self.incr("WasStopped") if not self.start(node): return self.failure("start (setup) failure: "+node) self.starttime=time.time() if not self.stop(node): return self.failure("stop failure: "+node) if not self.start(node): return self.failure("start failure: "+node) return self.success() def is_applicable(self): '''RestartTest is always applicable''' return 1 # Register RestartTest as a good test to run AllTestClasses.append(RestartTest) ################################################################### class StonithTest(CTSTest): ################################################################### '''Reboot a node by whacking it with stonith.''' def __init__(self, cm, timeout=900): CTSTest.__init__(self,cm) self.name="Stonith" self.theystopped = self.CM["Pat:They_dead"] self.allstopped = self.CM["Pat:All_stopped"] self.usstart = self.CM["Pat:Master_started"] self.themstart = self.CM["Pat:Slave_started"] self.timeout = timeout self.ssherror = False def _reset(self, node): StonithWorked=False for tries in 1,2,3,4,5: if self.CM.Env.ResetNode(node): StonithWorked=True break return StonithWorked def setup(self, target_node): # nothing to do return 1 def __call__(self, node): '''Perform the 'stonith' test. 
(whack the node)''' self.incr("calls") stopwatch = 0 rc = 0 if not self.setup(node): return self.failure("Setup failed") # Figure out what log message to look for when/if it goes down # # Any active node needs to notice this one left # NOTE: This won't work if we have multiple partitions stop_patterns = [] for other in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[other] == self.CM["up"] and other != node: stop_patterns.append(self.CM["Pat:They_stopped"] %(other, node)) stopwatch = 1 #self.debug("Checking %s will notice %s left"%(other, node)) if self.CM.ShouldBeStatus[node] == self.CM["down"]: # actually no-one will notice this node die since HA isn't running stopwatch = 0 # Figure out what log message to look for when it comes up if self.CM.upcount() == 1 and self.CM.ShouldBeStatus[node] == self.CM["up"]: uppat = (self.usstart % node) else: uppat = (self.themstart % node) upwatch = CTS.LogWatcher(self.CM["LogFileName"], [uppat] , timeout=self.timeout) if stopwatch == 1: watch = CTS.LogWatcher(self.CM["LogFileName"], stop_patterns , timeout=self.CM["DeadTime"]+10) watch.setwatch() # Reset (stonith) the node self.CM.debug("Resetting: "+node) StonithWorked = self._reset(node) if not StonithWorked: return self.failure("Stonith didn't work") if self.ssherror == True: self.CM.log("NOTE: Stonith command reported success but node %s did not restart (atd, reboot or ssh error)" % node) return self.success() upwatch.setwatch() # Look() and see if the machine went down if stopwatch == 0: # Allow time for the node to die time.sleep(self.CM["DeadTime"]+10) elif not watch.lookforall(): if watch.unmatched: for regex in watch.unmatched: self.CM.log("Warn: STONITH pattern not found: %s"%regex) # !!no-one!! saw this node die if len(watch.unmatched) == len(stop_patterns): return self.failure("No-one saw %s die" %node) # else: syslog* lost a message # Alas, I don't think this check is plausible (beekhof) # # Check it really stopped... #self.CM.ShouldBeStatus[node] = self.CM["down"] #if self.CM.StataCM(node) == 1: # ret1=0 # Look() and see if the machine came back up rc=0 if upwatch.look(): self.CM.debug("Startup pattern found: %s" %uppat) rc=1 else: self.CM.log("Warn: Startup pattern not found: %s" %uppat) # Check it really started... self.CM.ShouldBeStatus[node] = self.CM["up"] if rc == 0 and self.CM.StataCM(node) == 1: rc=1 # wait for the cluster to stabilize self.CM.cluster_stable() if node in self.CM.Env["oprofile"]: self.CM.oprofileStart(node) # return case processing if rc == 0: return self.failure("Node %s did not restart" %node) else: return self.success() def is_applicable(self): '''StonithTest is applicable unless suppressed by CM.Env["DoStonith"] == FALSE''' # for v2, stonithd test is a better test to run.
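# --- Editor's sketch (illustration only; pick_up_pattern is hypothetical) ---
# The reset-watch setup above picks its startup pattern by asking whether the
# victim was the last node up: if so, only the victim itself can report the
# restart (Pat:Master_started); otherwise a peer will see it join
# (Pat:Slave_started).  In outline:
#
#   def pick_up_pattern(cm, node):
#       if cm.upcount() == 1 and cm.ShouldBeStatus[node] == cm["up"]:
#           return cm["Pat:Master_started"] % node   # victim is on its own
#       return cm["Pat:Slave_started"] % node        # a peer will notice it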
if self.CM["Name"] == "heartbeat": return self.CM.Env["DoStonith"] else: return None # Register StonithTest as a good test to run AllTestClasses.append(StonithTest) ################################################################### class StonithdTest(StonithTest): ################################################################### def __init__(self, cm, timeout=600): StonithTest.__init__(self, cm, timeout=600) self.name="Stonithd" self.startall = SimulStartLite(cm) self.start = StartTest(cm) self.stop = StopTest(cm) self.init_node = None def _reset(self, target_node): if len(self.CM.Env["nodes"]) < 2: return self.skipped() StonithWorked = False SshNotWork = 0 for tries in range(1,5): # For some unknown reason, every now and then the ssh plugin just # can't kill the target_node - everything works fine with stonithd # and the plugin, but atd, reboot or ssh (or maybe something else) # doesn't do its job and target_node remains alive. So look for # the indicative messages and bubble-up the error via ssherror watchpats = [] watchpats.append("Initiating ssh-reset") watchpats.append("CRIT: still able to ping") watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+30) watch.setwatch() fail_reasons = [] if self.CM.Env.ResetNode2(self.init_node, target_node, fail_reasons): StonithWorked = True break if watch.lookforall(): SshNotWork = SshNotWork + 1 continue for reason in fail_reasons: self.CM.log(reason) if StonithWorked == False and SshNotWork == tries: StonithWorked = True self.ssherror = True return StonithWorked def setup(self, target_node): if len(self.CM.Env["nodes"]) < 2: return 1 self.init_node = self.CM.Env.RandomNode() while self.init_node == target_node: self.init_node = self.CM.Env.RandomNode() if not self.startall(None): return self.failure("Test setup failed") return 1 def errorstoignore(self): '''Return list of errors which should be ignored''' return [ # """stonithd: .*CRIT: command ssh -q -x -n -l root.*/sbin/reboot -nf .* SHELL=/bin/sh at now.* failed""" ] def is_applicable(self): if self.CM["Name"] != "heartbeat": if self.CM.Env.has_key("DoStonith"): return self.CM.Env["DoStonith"] return 1 return 0 AllTestClasses.append(StonithdTest) ################################################################### class IPaddrtest(CTSTest): ################################################################### '''Find the machine supporting a particular IP address, and knock it down. [Hint: This code isn't finished yet...] ''' def __init__(self, cm, IPaddrs): CTSTest.__init__(self,cm) self.name="IPaddrtest" self.IPaddrs = IPaddrs self.start = StartTest(cm) self.stop = StopTest(cm) def __call__(self, IPaddr): ''' Perform the IPaddr test... ''' self.incr("calls") node = self.CM.Env.RandomNode() self.incr("node:" + node) if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.incr("WasStopped") self.start(node) ret1 = self.stop(node) # Give the cluster time to recognize we're gone... 
time.sleep(self.CM["StableTime"]) ret2 = self.start(node) if not ret1: return self.failure("Could not stop") if not ret2: return self.failure("Could not start") return self.success() def is_applicable(self): '''IPaddrtest is always applicable (but shouldn't be)''' return 1 ################################################################### class StartOnebyOne(CTSTest): ################################################################### '''Start all the nodes ~ one by one''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StartOnebyOne" self.stopall = SimulStopLite(cm) self.start = StartTest(cm) self.ns=CTS.NodeStatus(cm.Env) def __call__(self, dummy): '''Perform the 'StartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Test setup failed") failed=[] self.starttime=time.time() for node in self.CM.Env["nodes"]: if not self.start(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to start: " + repr(failed)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''StartOnebyOne is always applicable''' return 1 # Register StartOnebyOne as a good test to run AllTestClasses.append(StartOnebyOne) ################################################################### class SimulStart(CTSTest): ################################################################### '''Start all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStart" self.stopall = SimulStopLite(cm) self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'SimulStart' test. ''' self.incr("calls") # We ignore the "node" parameter... # Shut down all the nodes... ret = self.stopall(None) if not ret: return self.failure("Setup failed") self.CM.clear_all_caches() if not self.startall(None): return self.failure("Startall failed") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''SimulStart is always applicable''' return 1 # Register SimulStart as a good test to run AllTestClasses.append(SimulStart) ################################################################### class SimulStop(CTSTest): ################################################################### '''Stop all the nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStop" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, dummy): '''Perform the 'SimulStop' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.stopall(None): return self.failure("Stopall failed") return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''SimulStop is always applicable''' return 1 # Register SimulStop as a good test to run AllTestClasses.append(SimulStop) ################################################################### class StopOnebyOne(CTSTest): ################################################################### '''Stop all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="StopOnebyOne" self.startall = SimulStartLite(cm) self.stop = StopTest(cm) def __call__(self, dummy): '''Perform the 'StopOnebyOne' test. 
''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") failed=[] self.starttime=time.time() for node in self.CM.Env["nodes"]: if not self.stop(node): failed.append(node) if len(failed) > 0: return self.failure("Some node failed to stop: " + repr(failed)) self.CM.clear_all_caches() return self.success() def is_applicable(self): '''StopOnebyOne is always applicable''' return 1 # Register StopOnebyOne as a good test to run AllTestClasses.append(StopOnebyOne) ################################################################### class RestartOnebyOne(CTSTest): ################################################################### '''Restart all the nodes in order''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="RestartOnebyOne" self.startall = SimulStartLite(cm) def __call__(self, dummy): '''Perform the 'RestartOnebyOne' test. ''' self.incr("calls") # We ignore the "node" parameter... # Start up all the nodes... ret = self.startall(None) if not ret: return self.failure("Setup failed") did_fail=[] self.starttime=time.time() self.restart = RestartTest(self.CM) for node in self.CM.Env["nodes"]: if not self.restart(node): did_fail.append(node) if did_fail: return self.failure("Could not restart %d nodes: %s" %(len(did_fail), repr(did_fail))) return self.success() def is_applicable(self): '''RestartOnebyOne is always applicable''' return 1 # Register StopOnebyOne as a good test to run AllTestClasses.append(RestartOnebyOne) ################################################################### class PartialStart(CTSTest): ################################################################### '''Start a node - but tell it to stop before it finishes starting up''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="PartialStart" self.startall = SimulStartLite(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'PartialStart' test. ''' self.incr("calls") ret = self.stopall(None) if not ret: return self.failure("Setup failed") # FIXME! This should use the CM class to get the pattern # then it would be applicable in general watchpats = [] watchpats.append("Starting crmd") watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats, timeout=self.CM["DeadTime"]+10) watch.setwatch() self.CM.StartaCMnoBlock(node) ret = watch.lookforall() if not ret: self.CM.log("Patterns not found: " + repr(watch.unmatched)) return self.failure("Setup of %s failed" % node) ret = self.stopall(None) if not ret: return self.failure("%s did not stop in time" % node) return self.success() def is_applicable(self): '''Partial is always applicable''' if self.CM["Name"] != "heartbeat": return 1 else: return 0 # Register StopOnebyOne as a good test to run AllTestClasses.append(PartialStart) ################################################################### class StandbyTest(CTSTest): ################################################################### '''Put a node in standby mode''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="standby" self.successpat = self.CM["Pat:StandbyOK"] self.nostandbypat = self.CM["Pat:StandbyNONE"] self.transient = self.CM["Pat:StandbyTRANSIENT"] def __call__(self, node): '''Perform the 'standby' test. 
''' self.incr("calls") if self.CM.ShouldBeStatus[node] == self.CM["down"]: return self.skipped() if self.CM.upcount() < 2: self.incr("nostandby") pat = self.nostandbypat else: self.incr("standby") pat = self.successpat # # You could make a good argument that the cluster manager # ought to give us good clues on when it's a bad time to # switch over to the other side, but heartbeat doesn't... # It could also queue the request. But, heartbeat # doesn't do that either :-) # retrycount=0 while (retrycount < 10): watch = CTS.LogWatcher(self.CM["LogFileName"] , [pat, self.transient] , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.CM.rsh(node, self.CM["Standby"]) match = watch.look() if match: if re.search(self.transient, match): self.incr("retries") time.sleep(2) retrycount=retrycount+1 else: return self.success() else: break # No point in retrying... return self.failure("did not find pattern " + pat) def is_applicable(self): '''StandbyTest is applicable when the CM has a Standby command''' if not self.CM.has_key("Standby"): return None else: #if self.CM.Env.has_key("DoStandby"): #flag=self.CM.Env["DoStandby"] #if type(flag) == types.IntType: #return flag #if not re.match("[yt]", flag, re.I): #return None # # We need to strip off everything after the first blank # cmd=self.CM["Standby"] cmd = cmd.split()[0] if not os.access(cmd, os.X_OK): return None cf = self.CM.cf if not cf.Parameters.has_key("auto_failback"): return None elif cf.Parameters["auto_failback"][0] == "legacy": return None return 1 # Register StandbyTest as a good test to run AllTestClasses.append(StandbyTest) ####################################################################### class StandbyTest2(CTSTest): ####################################################################### '''Standby with CRM of HA release 2''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="standby2" self.start = StartTest(cm) self.startall = SimulStartLite(cm) # make sure the node is active # set the node to standby mode # check resources: no resources should be running on the node # set the node to active mode # check resources: resources should have been migrated back (SHOULD THEY?)
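# --- Editor's sketch (illustration only; standby_cycle is hypothetical) ---
# The steps listed above amount to a three-phase assertion on
# active_resources(node):
#
#   def standby_cycle(cm, node):
#       assert cm.StandbyStatus(node) == "off"       # phase 1: node active
#       cm.SetStandbyMode(node, "on")
#       assert cm.active_resources(node) == []       # phase 2: node drained
#       cm.SetStandbyMode(node, "off")
#       cm.cluster_stable()                          # phase 3: back in service
#
# The (SHOULD THEY?) question stands: depending on stickiness, resources may
# or may not migrate back once the node returns to active mode.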
def __call__(self, node): self.incr("calls") ret=self.startall(None) if not ret: return self.failure("Start all nodes failed") self.CM.debug("Make sure node %s is active" % node) if self.CM.StandbyStatus(node) != "off": if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) self.CM.debug("Getting resources running on node %s" % node) rsc_on_node = self.CM.active_resources(node) self.CM.debug("Setting node %s to standby mode" % node) if not self.CM.SetStandbyMode(node, "on"): return self.failure("can't set node %s to standby mode" % node) time.sleep(30) # Allow time for the update to be applied and cause something self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "on": return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status)) self.CM.debug("Checking resources") bad_run = self.CM.active_resources(node) if len(bad_run) > 0: return self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run))) self.CM.debug("Setting node %s to active mode" % node) if not self.CM.SetStandbyMode(node, "off"): return self.failure("can't set node %s to active mode" % node) time.sleep(30) # Allow time for the update to be applied and cause something self.CM.cluster_stable() status = self.CM.StandbyStatus(node) if status != "off": return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status)) return self.success() def is_applicable(self): if self.CM["Name"] != "heartbeat": return 1 return 0 AllTestClasses.append(StandbyTest2) ####################################################################### class Fastdetection(CTSTest): ####################################################################### '''Test how quickly the surviving nodes detect that another node has been killed''' def __init__(self,cm,timeout=60): CTSTest.__init__(self, cm) self.name = "DetectionTime" self.they_stopped = self.CM["Pat:They_stopped"] self.timeout = timeout self.start = StartTest(cm) self.startall = SimulStartLite(cm) self.standby = StandbyTest(cm) self.__setitem__("min", 0) self.__setitem__("max", 0) self.__setitem__("totaltime", 0) def __call__(self, node): '''Perform the fast failure detection test''' self.incr("calls") ret=self.startall(None) if not ret: return self.failure("Test setup failed") if self.CM.upcount() < 2: return self.skipped() # Make sure they're not holding any resources ret = self.standby(node) if not ret: return ret stoppat = (self.they_stopped % ("", node)) stopwatch = CTS.LogWatcher(self.CM["LogFileName"], [stoppat], timeout=self.timeout) stopwatch.setwatch() # # This test is CM-specific - FIXME!! # if self.CM.rsh(node, "killall -9 heartbeat")==0: Starttime = os.times()[4] if stopwatch.look(): Stoptime = os.times()[4] # This test is CM-specific - FIXME!!
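# --- Editor's sketch (illustration only) ---
# The timing above uses os.times()[4], i.e. elapsed real (wall-clock) time,
# so the detection latency is simply the gap between issuing the kill and
# the "they stopped" pattern appearing in the log:
#
#   Starttime = os.times()[4]          # taken just after killall succeeds
#   ...                                # stopwatch.look() blocks until a match
#   Stoptime = os.times()[4]
#   detectms = int((Stoptime - Starttime) * 1000 + 0.5)   # round to ms
#
# Note the measured resolution is bounded by syslog delivery latency as much
# as by the cluster manager itself.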
self.CM.rsh(node, "killall -9 @libdir@/heartbeat/ccm @libdir@/heartbeat/ipfail >/dev/null 2>&1; true") Detectiontime = Stoptime-Starttime detectms = int(Detectiontime*1000+0.5) self.CM.log("...failure detection time: %d ms" % detectms) self.Stats["totaltime"] = self.Stats["totaltime"] + Detectiontime if self.Stats["min"] == 0: self.Stats["min"] = Detectiontime if Detectiontime > self.Stats["max"]: self.Stats["max"] = Detectiontime if Detectiontime < self.Stats["min"]: self.Stats["min"] = Detectiontime self.CM.ShouldBeStatus[node] = self.CM["down"] self.start(node) return self.success() else: # This test is CM-specific - FIXME!! self.CM.rsh(node, "killall -9 @libdir@/heartbeat/ccm @libdir@/heartbeat/ipfail >/dev/null 2>&1; true") self.CM.ShouldBeStatus[node] = self.CM["down"] ret=self.start(node) return self.failure("Didn't find the log message") else: return self.failure("Couldn't kill cluster manager") def is_applicable(self): '''This test is applicable when auto_failback != legacy''' return self.standby.is_applicable() # This test is CM-specific - FIXME!! def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ "ccm.*ERROR: ccm_control_process:failure to send protoversion request" , "ccm.*ERROR: Lost connection to heartbeat service. Need to bail out" ] AllTestClasses.append(Fastdetection) ############################################################################## class BandwidthTest(CTSTest): ############################################################################## # Tests should not be cluster-manager-specific # If you need to find out cluster manager configuration to do this, then # it should be added to the generic cluster manager API. '''Test the bandwidth which heartbeat uses''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name = "Bandwidth" self.start = StartTest(cm) self.__setitem__("min",0) self.__setitem__("max",0) self.__setitem__("totalbandwidth",0) self.tempfile = tempfile.mktemp(".cts") self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform the Bandwidth test''' self.incr("calls") if self.CM.upcount()<1: return self.skipped() Path = self.CM.InternalCommConfig() if "ip" not in Path["mediatype"]: return self.skipped() port = Path["port"][0] port = int(port) ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(5) # We get extra messages right after startup. fstmpfile = "/var/run/band_estimate" dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \ % (port, fstmpfile) rc = self.CM.rsh(node, dumpcmd) if rc == 0: farfile = "root@%s:%s" % (node, fstmpfile) self.CM.rsh.cp(farfile, self.tempfile) Bandwidth = self.countbandwidth(self.tempfile) if not Bandwidth: self.CM.log("Could not compute bandwidth.") return self.success() intband = int(Bandwidth + 0.5) self.CM.log("...bandwidth: %d bits/sec" % intband) self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth if self.Stats["min"] == 0: self.Stats["min"] = Bandwidth if Bandwidth > self.Stats["max"]: self.Stats["max"] = Bandwidth if Bandwidth < self.Stats["min"]: self.Stats["min"] = Bandwidth self.CM.rsh(node, "rm -f %s" % fstmpfile) os.unlink(self.tempfile) return self.success() else: return self.failure("no response from tcpdump command [%d]!" 
% rc) def countbandwidth(self, file): fp = open(file, "r") fp.seek(0) count = 0 sum = 0 while 1: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count=count+1 linesplit = string.split(line," ") for j in range(len(linesplit)-1): if linesplit[j]=="udp": break if linesplit[j]=="length:": break try: sum = sum + int(linesplit[j+1]) except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T1 = linesplit[0] timesplit = string.split(T1,":") time2split = string.split(timesplit[2],".") time1 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 break while count < 100: line = fp.readline() if not line: return None if re.search("udp",line) or re.search("UDP,", line): count = count+1 linessplit = string.split(line," ") for j in range(len(linessplit)-1): if linessplit[j] =="udp": break if linesplit[j]=="length:": break try: sum=int(linessplit[j+1])+sum except ValueError: self.CM.log("Invalid tcpdump line: %s" % line) return None T2 = linessplit[0] timesplit = string.split(T2,":") time2split = string.split(timesplit[2],".") time2 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001 time = time2-time1 if (time <= 0): return 0 return (sum*8)/time def is_applicable(self): '''BandwidthTest is always applicable''' return 0 AllTestClasses.append(BandwidthTest) ########################################################################## class RedundantpathTest(CTSTest): ########################################################################## '''In heartbeat, it has redundant path to communicate between the cluster''' # # Tests should not be cluster-manager specific # One needs to isolate what you need from the cluster manager and then # add a (new) API to do it. # def __init__(self,cm,timeout=60): CTSTest.__init__(self,cm) self.name = "RedundantpathTest" self.timeout = timeout def PathCount(self): '''Return number of communication paths''' Path = self.CM.InternalCommConfig() cf = self.CM.cf eths = [] serials = [] num = 0 for interface in Path["interface"]: if re.search("eth",interface): eths.append(interface) num = num + 1 if re.search("/dev",interface): serials.append(interface) num = num + 1 return (num, eths, serials) def __call__(self,node): '''Perform redundant path test''' self.incr("calls") if self.CM.ShouldBeStatus[node]!=self.CM["up"]: return self.skipped() (num, eths, serials) = self.PathCount() for eth in eths: if self.CM.rsh(node,"ifconfig %s down" % eth)==0: PathDown = "OK" break if PathDown != "OK": for serial in serials: if self.CM.rsh(node,"setserial %s uart none" % serial)==0: PathDown = "OK" break if PathDown != "OK": return self.failure("Cannot break the path") time.sleep(self.timeout) for audit in CTSaudits.AuditList(self.CM): if not audit(): for eth in eths: self.CM.rsh(node,"ifconfig %s up" % eth) for serial in serials: self.CM.rsh(node,"setserial %s uart 16550" % serial) return self.failure("Redundant path fail") for eth in eths: self.CM.rsh(node,"ifconfig %s up" % eth) for serial in serials: self.CM.rsh(node,"setserial %s uart 16550" % serial) return self.success() def is_applicable(self): '''It is applicable when you have more than one connection''' return self.PathCount()[0] > 1 # FIXME!! Why is this one commented out? 
#AllTestClasses.append(RedundantpathTest) ########################################################################## class DRBDTest(CTSTest): ########################################################################## '''In heartbeat, it provides replicated storage.''' def __init__(self,cm, timeout=10): CTSTest.__init__(self,cm) self.name = "DRBD" self.timeout = timeout def __call__(self, dummy): '''Perform the 'DRBD' test.''' self.incr("calls") for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["down"]: return self.skipped() # Note: All these special cases with Start/Stop/StatusDRBD # should be reworked to use resource objects instead of # being hardwired to bypass the objects here. for node in self.CM.Env["nodes"]: done=time.time()+self.timeout+1 while (time.time()done: return self.failure("Can't start drbd, please check it") device={} for node in self.CM.Env["nodes"]: device[node]=self.getdevice(node) node = self.CM.Env["nodes"][0] done=time.time()+self.timeout+1 while 1: if (time.time()>done): return self.failure("the drbd could't sync") self.CM.rsh(node,"cp /proc/drbd /var/run >/dev/null 2>&1") if self.CM.rsh.cp("%s:/var/run/drbd" % node,"/var/run"): line = open("/tmp/var/run").readlines()[2] p = line.find("Primary") s1 = line.find("Secondary") s2 = line.rfind("Secondary") if s1!=s2: if self.CM.rsh(node,"drbdsetup %s primary" % device[node]): pass if p!=-1: if p/dev/null" % (self.rid, node)) watch.lookforall() self.CM.cluster_stable() recovernode=self.CM.ResourceLocation(self.rid) if len(recovernode)==1: self.CM.debug("Recovered: %s is running on %s" %(self.rid, recovernode[0])) if not watch.unmatched: return self.success() else: return self.failure("Patterns not found: %s" % repr(watch.unmatched)) elif len(recovernode)==0: return self.failure("%s was not recovered and is inactive" % self.rid) else: return self.failure("%s is now active on more than one node: %s" %(self.rid, str(recovernode))) def is_applicable(self): '''ResourceRecover is applicable only when there are resources running on our cluster and environment is linux-ha-v2''' if self.CM["Name"] != "heartbeat": return 1 return 0 def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """Updating failcount for %s""" % self.rid, """Unknown operation: fail""", """ERROR: sending stonithRA op to stonithd failed.""", """ERROR: process_lrm_event: LRM operation %s_%s_%d""" % (self.rid, self.action, self.interval), """ERROR: process_graph_event: Action %s_%s_%d .* initiated outside of a transition""" % (self.rid, self.action, self.interval), ] AllTestClasses.append(ResourceRecover) ################################################################### class ComponentFail(CTSTest): ################################################################### def __init__(self, cm): CTSTest.__init__(self,cm) self.name="ComponentFail" self.startall = SimulStartLite(cm) self.complist = cm.Components() self.patterns = [] self.okerrpatterns = [] def __call__(self, node): '''Perform the 'ComponentFail' test. 
''' self.incr("calls") self.patterns = [] self.okerrpatterns = [] # start all nodes ret = self.startall(None) if not ret: return self.failure("Setup failed") if not self.CM.cluster_stable(self.CM["StableTime"]): return self.failure("Setup failed - unstable") node_is_dc = self.CM.is_node_dc(node, None) # select a component to kill chosen = self.CM.Env.RandomGen.choice(self.complist) while chosen.dc_only == 1 and node_is_dc == 0: chosen = self.CM.Env.RandomGen.choice(self.complist) self.CM.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot)) self.incr(chosen.name) - self.patterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) - self.patterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) + if self.CM["Name"] != "linux-ha-v2" or chosen.name != "pengine": + self.patterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) + self.patterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) + self.patterns.extend(chosen.pats) if node_is_dc: self.patterns.extend(chosen.dc_pats) # Make sure the node goes down and then comes back up if it should reboot... if chosen.triggersreboot: for other in self.CM.Env["nodes"]: if other != node: self.patterns.append(self.CM["Pat:They_stopped"] %(other, node)) self.patterns.append(self.CM["Pat:Slave_started"] % node) self.patterns.append(self.CM["Pat:Local_started"] % node) # In an ideal world, this next stuff should be in the "chosen" object as a member function if self.CM["Name"] == "linux-ha-v2": if chosen.triggersreboot: if chosen.dc_only: # Sometimes these will be in the log, and sometimes they won't... self.okerrpatterns.append("%s crmd:.*Process %s:.* exited" %(node, chosen.name)) self.okerrpatterns.append("%s crmd:.*I_ERROR.*crmdManagedChildDied" %node) self.okerrpatterns.append("%s crmd:.*The %s subsystem terminated unexpectedly" %(node, chosen.name)) self.okerrpatterns.append("ERROR: Client .* exited with return code") else: # Sometimes this won't be in the log... self.okerrpatterns.append(self.CM["Pat:ChildKilled"] %(node, chosen.name)) self.okerrpatterns.append(self.CM["Pat:ChildRespawn"] %(node, chosen.name)) self.okerrpatterns.append(self.CM["Pat:ChildExit"]) - else: - self.patterns.append("%s crmd:.*Process %s:.* exited" %(node, chosen.name)) - self.patterns.append("%s crmd:.*I_ERROR.*crmdManagedChildDied" %node) - self.patterns.append("%s crmd:.*The %s subsystem terminated unexpectedly" %(node, chosen.name)) # supply a copy so self.patterns doesnt end up empty tmpPats = [] tmpPats.extend(self.patterns) self.patterns.extend(chosen.badnews_ignore) # set the watch for stable watch = CTS.LogWatcher( self.CM["LogFileName"], tmpPats, self.CM["DeadTime"] + self.CM["StableTime"] + self.CM["StartTime"]) watch.setwatch() # kill the component chosen.kill(node) # check to see Heartbeat noticed matched = watch.lookforall() if not matched: self.CM.log("Patterns not found: " + repr(watch.unmatched)) self.CM.cluster_stable(self.CM["StartTime"]) return self.failure("Didn't find all expected patterns") self.CM.debug("Found: "+ repr(matched)) # now watch it recover... 
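# --- Editor's sketch (illustration only) ---
# Recovery is declared by polling cluster_stable() a bounded number of times
# rather than with one long watch; each call already waits up to StartTime
# internally, so the effective budget is about 5 * StartTime:
#
#   for attempt in (1, 2, 3, 4, 5):            # mirrors the loop below
#       if cm.cluster_stable(cm["StartTime"]):
#           return "recovered"
#   return "failure: cluster did not become stable"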
for attempt in (1, 2, 3, 4, 5): self.CM.debug("Waiting for the cluster to recover...") if self.CM.cluster_stable(self.CM["StartTime"]): return self.success() return self.failure("Cluster did not become stable") def is_applicable(self): if self.CM["Name"] != "heartbeat": return 1 return 0 def errorstoignore(self): '''Return list of errors which should be ignored''' # Note that okerrpatterns refers to the last time we ran this test # The good news is that this works fine for us... self.okerrpatterns.extend(self.patterns) return self.okerrpatterns AllTestClasses.append(ComponentFail) #################################################################### class Split_brainTest2(CTSTest): #################################################################### '''Test split-brain: break the communication path between the nodes, then check whether both partitions try to take over the resources''' def __init__(self,cm): CTSTest.__init__(self,cm) self.name = "Split_brain2" self.start = StartTest(cm) self.startall = SimulStartLite(cm) def __call__(self, node): '''Perform split-brain test''' self.incr("calls") ret = self.startall(None) if not ret: return self.failure("Setup failed") count1 = self.CM.Env.RandomGen.randint(1,len(self.CM.Env["nodes"])-1) partition1 = [] while len(partition1) < count1: select = self.CM.Env.RandomGen.choice(self.CM.Env["nodes"]) if not select in partition1: partition1.append(select) partition2 = [] for member in self.CM.Env["nodes"]: if not member in partition1: partition2.append(member) allownodes1 = "" for member in partition1: allownodes1 += member + " " allownodes2 = "" for member in partition2: allownodes2 += member + " " self.CM.log("Partition1: " + str(partition1)) self.CM.log("Partition2: " + str(partition2)) '''Isolate the nodes; look for the "node is dead" messages''' watchdeadpats = [ ] deadpat = self.CM["Pat:They_dead"] for member in self.CM.Env["nodes"]: thispat = (deadpat % member) watchdeadpats.append(thispat) watchdead = CTS.LogWatcher(self.CM["LogFileName"], watchdeadpats\ , timeout=self.CM["DeadTime"]+60) watchdead.ReturnOnlyMatch() watchdead.setwatch() for member in partition1: if float(self.CM.Env["XmitLoss"])!=0 or float(self.CM.Env["RecvLoss"])!=0 : self.CM.savecomm_node(node) if not self.CM.isolate_node(member,allownodes1): return self.failure("Could not isolate the nodes") for member in partition2: if float(self.CM.Env["XmitLoss"])!=0 or float(self.CM.Env["RecvLoss"])!=0 : self.CM.savecomm_node(node) if not self.CM.isolate_node(member,allownodes2): return self.failure("Could not isolate the nodes") if not watchdead.lookforall(): for member in self.CM.Env["nodes"]: self.CM.unisolate_node(member) self.CM.log("Patterns not found: " + repr(watchdead.unmatched)) return self.failure("Didn't find the log 'dead' message") dcnum=0 while dcnum < 2: dcnum = 0 for member in self.CM.Env["nodes"]: if self.CM.is_node_dc(member): dcnum += 1 time.sleep(1) ''' Unisolate the nodes, look for the "return to partition" messages and check whether they restart ''' watchpartitionpats = [self.CM["Pat:DC_IDLE"]] partitionpat = self.CM["Pat:Return_partition"] for member in self.CM.Env["nodes"]: thispat = (partitionpat % member) watchpartitionpats.append(thispat) watchpartition = CTS.LogWatcher(self.CM["LogFileName"], watchpartitionpats\ , timeout=self.CM["DeadTime"]+60) watchpartition.setwatch() for member in self.CM.Env["nodes"]: if float(self.CM.Env["XmitLoss"])!=0 or float(self.CM.Env["RecvLoss"])!=0 : self.CM.restorecomm_node(node) self.CM.unisolate_node(member) if not watchpartition.lookforall():
self.CM.log("Patterns not found: " + repr(watchpartition.unmatched)) return self.failure("Didn't find return from partition messages") return self.success() def is_applicable(self): if self.CM["Name"] != "heartbeat": return 1 return 0 def errorstoignore(self): '''Return list of errors which are 'normal' and should be ignored''' return [ "ERROR:.*Both machines own.*resources" , "ERROR:.*lost a lot of packets!" , "ERROR: Cannot rexmit pkt .*: seqno too low" , "ERROR: Irretrievably lost packet: node" ] #AllTestClasses.append(Split_brainTest2) #################################################################### class MemoryTest(CTSTest): #################################################################### '''Check to see if anyone is leaking memory''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Memory" # self.test = ElectionMemoryTest(cm) self.test = ResourceRecover(cm) self.startall = SimulStartLite(cm) self.before = {} self.after = {} def __call__(self, node): ps_command='''ps -eo ucomm,pid,pmem,tsiz,dsiz,rss,vsize | grep -e ccm -e ha_logd -e cib -e crmd -e lrmd -e tengine -e pengine''' memory_error = [ "", "", "", "Code", "Data", "Resident", "Total" ] ret = self.startall(None) if not ret: return self.failure("Test setup failed") time.sleep(10) for node in self.CM.Env["nodes"]: self.before[node] = {} rsh_pipe = self.CM.rsh.popen(node, ps_command) rsh_pipe.tochild.close() result = rsh_pipe.fromchild.readline() while result: tokens = result.split() self.before[node][tokens[1]] = result result = rsh_pipe.fromchild.readline() rsh_pipe.fromchild.close() self.lastrc = rsh_pipe.wait() # do something... if not self.test(node): return self.failure("Underlying test failed") time.sleep(10) for node in self.CM.Env["nodes"]: self.after[node] = {} rsh_pipe = self.CM.rsh.popen(node, ps_command) rsh_pipe.tochild.close() result = rsh_pipe.fromchild.readline() while result: tokens = result.split() self.after[node][tokens[1]] = result result = rsh_pipe.fromchild.readline() rsh_pipe.fromchild.close() self.lastrc = rsh_pipe.wait() failed_nodes = [] for node in self.CM.Env["nodes"]: failed = 0 for process in self.before[node]: messages = [] before_line = self.before[node][process] after_line = self.after[node][process] if not after_line: self.CM.log("%s %s[%s] exited during the test" %(node, before_tokens[0], before_tokens[1])) continue before_tokens = before_line.split() after_tokens = after_line.split() # 3 : Code size # 4 : Data size # 5 : Resident size # 6 : Total size for index in [ 3, 4, 6 ]: mem_before = int(before_tokens[index]) mem_after = int(after_tokens[index]) mem_diff = mem_after - mem_before mem_allow = mem_before * 0.01 # for now... 
mem_allow = 0 if mem_diff > mem_allow: failed = 1 messages.append("%s size grew by %dkB (%dkB)" %(memory_error[index], mem_diff, mem_after)) elif mem_diff < 0: messages.append("%s size shrank by %dkB (%dkB)" %(memory_error[index], mem_diff, mem_after)) if len(messages) > 0: self.CM.log("Process %s[%s] on %s: %s" %(before_tokens[0], before_tokens[1], node, repr(messages))) self.CM.debug("%s Before: %s[%s] (%s%%):\tcode=%skB, data=%skB, resident=%skB, total=%skB" %(node, before_tokens[0], before_tokens[1], before_tokens[2], before_tokens[3], before_tokens[4], before_tokens[5], before_tokens[6])) self.CM.debug("%s After: %s[%s] (%s%%):\tcode=%skB, data=%skB, resident=%skB, total=%skB" %(node, after_tokens[0], after_tokens[1], after_tokens[2], after_tokens[3], after_tokens[4], after_tokens[5], after_tokens[6])) if failed == 1: failed_nodes.append(node) if len(failed_nodes) > 0: return self.failure("Memory leaked on: " + repr(failed_nodes)) return self.success() def errorstoignore(self): '''Return list of errors which should be ignored''' return [ """ERROR: .* LRM operation.*monitor on .*: not running""", """pengine:.*Handling failed """] def is_applicable(self): if self.CM["Name"] != "heartbeat": return 1 return 0 #AllTestClasses.append(MemoryTest) #################################################################### class ElectionMemoryTest(CTSTest): #################################################################### '''Check to see if anyone is leaking memory''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="Election" def __call__(self, node): self.rsh.readaline(node, self.CM["ElectionCmd"]%node) if self.CM.cluster_stable(): return self.success() return self.failure("Cluster not stable") def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): '''Never applicable, only for use by the memory test''' return 0 AllTestClasses.append(ElectionMemoryTest) #################################################################### class SpecialTest1(CTSTest): #################################################################### '''Set up a custom test to cause quorum failure issues for Andrew''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SpecialTest1" self.startall = SimulStartLite(cm) self.restart1 = RestartTest(cm) self.stopall = SimulStopLite(cm) def __call__(self, node): '''Perform the 'SpecialTest1' test for Andrew. ''' self.incr("calls") # Shut down all the nodes... ret = self.stopall(None) if not ret: return ret # Start the selected node ret = self.restart1(node) if not ret: return ret # Start all remaining nodes ret = self.startall(None) return ret def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): return 1 AllTestClasses.append(SpecialTest1) ################################################################### class NearQuorumPointTest(CTSTest): ################################################################### ''' This test brings larger clusters near the quorum point (50%). In addition, it will test doing starts and stops at the same time. Here is how I think it should work: - loop over the nodes and decide randomly which will be up and which will be down Use a 50% probability for each of up/down. - figure out what to do to get into that state from the current state - in parallel, bring up those going up and bring those going down. 
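For example, with four nodes a fair coin per node lands the cluster exactly at two up / two down in 6 of the 16 equally likely outcomes (37.5% of runs), right at the quorum boundary this test is meant to probe.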
''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="NearQuorumPoint" def __call__(self, dummy): '''Perform the 'NearQuorumPoint' test. ''' self.incr("calls") startset = [] stopset = [] #decide what to do with each node for node in self.CM.Env["nodes"]: action = self.CM.Env.RandomGen.choice(["start","stop"]) #action = self.CM.Env.RandomGen.choice(["start","stop","no change"]) if action == "start" : startset.append(node) elif action == "stop" : stopset.append(node) self.CM.debug("start nodes:" + repr(startset)) self.CM.debug("stop nodes:" + repr(stopset)) #add search patterns watchpats = [ ] for node in stopset: if self.CM.ShouldBeStatus[node] == self.CM["up"]: watchpats.append(self.CM["Pat:We_stopped"] % node) for node in startset: if self.CM.ShouldBeStatus[node] == self.CM["down"]: #watchpats.append(self.CM["Pat:Slave_started"] % node) watchpats.append(self.CM["Pat:Local_started"] % node) if len(watchpats) == 0: return self.skipped() if len(startset) != 0: watchpats.append(self.CM["Pat:DC_IDLE"]) watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() #begin actions for node in stopset: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.CM.StopaCMnoBlock(node) for node in startset: if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.CM.StartaCMnoBlock(node) #get the result if watch.lookforall(): self.CM.cluster_stable() return self.success() self.CM.log("Warn: Patterns not found: " + repr(watch.unmatched)) #get the "bad" nodes upnodes = [] for node in stopset: if self.CM.StataCM(node) == 1: upnodes.append(node) downnodes = [] for node in startset: if self.CM.StataCM(node) == 0: downnodes.append(node) if upnodes == [] and downnodes == []: self.CM.cluster_stable() return self.success() if len(upnodes) > 0: self.CM.log("Warn: Unstoppable nodes: " + repr(upnodes)) if len(downnodes) > 0: self.CM.log("Warn: Unstartable nodes: " + repr(downnodes)) return self.failure() def errorstoignore(self): '''Return list of errors which should be ignored''' return [] def is_applicable(self): if self.CM["Name"] != "heartbeat": return 1 return 0 AllTestClasses.append(NearQuorumPointTest) ################################################################### class BSC_AddResource(CTSTest): ################################################################### '''Add a resource to the cluster''' def __init__(self, cm): CTSTest.__init__(self, cm) self.name="AddResource" self.resource_offset = 0 self.cib_cmd="""@sbindir@/cibadmin -C -o %s -X '%s' """ def __call__(self, node): self.resource_offset = self.resource_offset + 1 r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset) start_pat = "crmd.*%s_start_0.*complete" patterns = [] patterns.append(start_pat % r_id) watch = CTS.LogWatcher( self.CM["LogFileName"], patterns, self.CM["DeadTime"]) watch.setwatch() fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"] = ip if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip): return self.failure("Make resource %s failed" % r_id) failed = 0 watch_result = watch.lookforall() if watch.unmatched: for regex in watch.unmatched: self.CM.log ("Warn: Pattern not found: %s" % (regex)) failed = 1 if failed: return self.failure("Resource pattern(s) not found") if not self.CM.cluster_stable(self.CM["DeadTime"]): return self.failure("Unstable cluster") return self.success() def make_ip_resource(self, node, id, rclass, type, ip): self.CM.log("Creating %s::%s:%s (%s) on %s" % 
(rclass,type,id,ip,node)) rsc_xml=""" """ % (id, rclass, type, id, id, ip) node_constraint=""" """ % (id, id, id, id, node) rc = 0 (rc, lines) = self.CM.rsh.remote_py(node, "os", "system", self.cib_cmd % ("constraints", node_constraint)) if rc != 0: self.CM.log("Constraint creation failed: %d" % rc) return None (rc, lines) = self.CM.rsh.remote_py(node, "os", "system", self.cib_cmd % ("resources", rsc_xml)) if rc != 0: self.CM.log("Resource creation failed: %d" % rc) return None return 1 def is_applicable(self): if self.CM["Name"] != "heartbeat" and self.CM.Env["DoBSC"]: return 1 return None def TestList(cm): result = [] for testclass in AllTestClasses: bound_test = testclass(cm) if bound_test.is_applicable(): result.append(bound_test) return result class SimulStopLite(CTSTest): ################################################################### '''Stop any active nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStopLite" def __call__(self, dummy): '''Perform the 'SimulStopLite' setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.incr("WasStarted") watchpats.append(self.CM["Pat:All_stopped"] % node) if self.CM.Env["use_logd"]: watchpats.append(self.CM["Pat:Logd_stopped"] % node) if len(watchpats) == 0: self.CM.clear_all_caches() return self.skipped() # Stop all the nodes - at about the same time... watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.starttime=time.time() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.CM.StopaCMnoBlock(node) if watch.lookforall(): self.CM.clear_all_caches() return self.success() did_fail=0 up_nodes = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 1: did_fail=1 up_nodes.append(node) if did_fail: return self.failure("Active nodes exist: " + repr(up_nodes)) self.CM.log("Warn: All nodes stopped but CTS didnt detect: " + repr(watch.unmatched)) self.CM.clear_all_caches() return self.failure("Missing log message: "+repr(watch.unmatched)) def is_applicable(self): '''SimulStopLite is a setup test and never applicable''' return 0 ################################################################### class SimulStartLite(CTSTest): ################################################################### '''Start any stopped nodes ~ simultaneously''' def __init__(self, cm): CTSTest.__init__(self,cm) self.name="SimulStartLite" def __call__(self, dummy): '''Perform the 'SimulStartList' setup work. ''' self.incr("calls") self.CM.debug("Setup: " + self.name) # We ignore the "node" parameter... watchpats = [ ] uppat = self.CM["Pat:Slave_started"] if self.CM.upcount() == 0: uppat = self.CM["Pat:Local_started"] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.incr("WasStopped") watchpats.append(uppat % node) if len(watchpats) == 0: return self.skipped() watchpats.append(self.CM["Pat:DC_IDLE"]) # Start all the nodes - at about the same time... 
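# A minimal sketch of the IPBase increment performed in
# BSC_AddResource.__call__ above, assuming dotted-quad address strings;
# note that the last octet is bumped with no wrap check, so a long
# enough run can walk past .255:
#
#   def next_ip(ip):  # e.g. "10.0.0.41" -> "10.0.0.42"
#       fields = ip.split('.')
#       fields[3] = str(int(fields[3]) + 1)
#       return '.'.join(fields)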
watch = CTS.LogWatcher(self.CM["LogFileName"], watchpats , timeout=self.CM["DeadTime"]+10) watch.setwatch() self.starttime=time.time() for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["down"]: self.CM.StartaCMnoBlock(node) if watch.lookforall(): for attempt in (1, 2, 3, 4, 5): if self.CM.cluster_stable(): return self.success() return self.failure("Cluster did not stabilize") did_fail=0 unstable = [] for node in self.CM.Env["nodes"]: if self.CM.StataCM(node) == 0: did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstarted nodes exist: " + repr(unstable)) unstable = [] for node in self.CM.Env["nodes"]: if not self.CM.node_stable(node): did_fail=1 unstable.append(node) if did_fail: return self.failure("Unstable cluster nodes exist: " + repr(unstable)) self.CM.log("ERROR: All nodes started but CTS didnt detect: " + repr(watch.unmatched)) return self.failure() def is_applicable(self): '''SimulStartLite is a setup test and never applicable''' return 0 diff --git a/include/crm/crm.h b/include/crm/crm.h index baada9b34e..509097c620 100644 --- a/include/crm/crm.h +++ b/include/crm/crm.h @@ -1,328 +1,332 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef CRM__H #define CRM__H #include #include #include #include #undef MIN #undef MAX #include #include #include #define CRM_FEATURE_SET "3.0" #define MINIMUM_SCHEMA_VERSION "pacemaker-1.0" #define LATEST_SCHEMA_VERSION "pacemaker-"DTD_VERSION #define EOS '\0' #define DIMOF(a) ((int) (sizeof(a)/sizeof(a[0])) ) #define HAURL(url) HA_URLBASE url #ifndef CRM_DEV_BUILD # define CRM_DEV_BUILD 0 #endif #define CRM_DEPRECATED_SINCE_2_0_1 0 #define CRM_DEPRECATED_SINCE_2_0_2 0 #define CRM_DEPRECATED_SINCE_2_0_3 0 #define CRM_DEPRECATED_SINCE_2_0_4 0 #define CRM_DEPRECATED_SINCE_2_0_5 0 #define CRM_DEPRECATED_SINCE_2_0_6 1 #define CRM_DEPRECATED_SINCE_2_0_7 1 #define CRM_DEPRECATED_SINCE_2_0_8 1 #define CRM_DEPRECATED_SINCE_2_1_0 1 #define CRM_META "CRM_meta" #define crm_meta_name(field) CRM_META"_"field #define ipc_call_diff_max_ms 5000 #define action_diff_warn_ms 5000 #define action_diff_max_ms 20000 #define fsa_diff_warn_ms 10000 #define fsa_diff_max_ms 30000 #define CRM_ASSERT(expr) do { \ if((expr) == FALSE) { \ crm_abort(__FILE__, __PRETTY_FUNCTION__, __LINE__, #expr, TRUE, FALSE); \ } \ } while(0) #define CRM_DEV_ASSERT(expr) do { \ if((expr) == FALSE) { \ crm_abort(__FILE__,__PRETTY_FUNCTION__,__LINE__, #expr, FALSE, TRUE); \ } \ } while(0) #define CRM_CHECK(expr, failure_action) do { \ if((expr) == FALSE) { \ crm_abort(__FILE__,__PRETTY_FUNCTION__,__LINE__, #expr, FALSE, TRUE); \ failure_action; \ } \ } while(0) #define CRM_CHECK_AND_STORE(expr, failure_action) do { \ if((expr) == FALSE) { \ crm_abort(__FILE__,__PRETTY_FUNCTION__,__LINE__, #expr, TRUE, TRUE); \ failure_action; \ } \ } while(0) extern const char 
*crm_system_name; /* Clean these up at some point, some probably should be runtime options */ #define WORKING_DIR HA_VARLIBDIR"/heartbeat/crm" #define CRM_SOCK_DIR HA_VARRUNDIR"/heartbeat/crm" #define BIN_DIR HA_LIBDIR"/heartbeat" #define SOCKET_LEN 1024 #define APPNAME_LEN 256 #define MAX_IPC_FAIL 5 #define CIB_FILENAME WORKING_DIR"/cib.xml" #define CIB_BACKUP WORKING_DIR"/cib_backup.xml" #define MSG_LOG 1 #define DOT_FSA_ACTIONS 1 #define DOT_ALL_FSA_INPUTS 1 /* #define FSA_TRACE 1 */ #define INFINITY_S "INFINITY" #define MINUS_INFINITY_S "-INFINITY" #define INFINITY 1000000 /* Sub-systems */ #define CRM_SYSTEM_DC "dc" #define CRM_SYSTEM_DCIB "dcib" /* The master CIB */ #define CRM_SYSTEM_CIB "cib" #define CRM_SYSTEM_CRMD "crmd" #define CRM_SYSTEM_LRMD "lrmd" #define CRM_SYSTEM_PENGINE "pengine" #define CRM_SYSTEM_TENGINE "tengine" #define CRM_SYSTEM_STONITHD "stonithd" /* Valid operations */ #define CRM_OP_NOOP "noop" #define CRM_OP_JOIN_ANNOUNCE "join_announce" #define CRM_OP_JOIN_OFFER "join_offer" #define CRM_OP_JOIN_REQUEST "join_request" #define CRM_OP_JOIN_ACKNAK "join_ack_nack" #define CRM_OP_JOIN_CONFIRM "join_confirm" #define CRM_OP_DIE "die_no_respawn" #define CRM_OP_RETRIVE_CIB "retrieve_cib" #define CRM_OP_PING "ping" #define CRM_OP_VOTE "vote" #define CRM_OP_NOVOTE "no-vote" #define CRM_OP_HELLO "hello" #define CRM_OP_HBEAT "dc_beat" #define CRM_OP_PECALC "pe_calc" #define CRM_OP_ABORT "abort" #define CRM_OP_QUIT "quit" #define CRM_OP_LOCAL_SHUTDOWN "start_shutdown" #define CRM_OP_SHUTDOWN_REQ "req_shutdown" #define CRM_OP_SHUTDOWN "do_shutdown" #define CRM_OP_FENCE "stonith" #define CRM_OP_EVENTCC "event_cc" #define CRM_OP_TEABORT "te_abort" #define CRM_OP_TEABORTED "te_abort_confirmed" /* we asked */ #define CRM_OP_TE_HALT "te_halt" #define CRM_OP_TECOMPLETE "te_complete" #define CRM_OP_TETIMEOUT "te_timeout" #define CRM_OP_TRANSITION "transition" #define CRM_OP_REGISTER "register" #define CRM_OP_DEBUG_UP "debug_inc" #define CRM_OP_DEBUG_DOWN "debug_dec" #define CRM_OP_INVOKE_LRM "lrm_invoke" #define CRM_OP_LRM_REFRESH "lrm_refresh" #define CRM_OP_LRM_QUERY "lrm_query" #define CRM_OP_LRM_DELETE "lrm_delete" #define CRM_OP_LRM_FAIL "lrm_fail" #define CRM_OP_PROBED "probe_complete" #define CRM_OP_REPROBE "probe_again" #define CRMD_STATE_ACTIVE "member" #define CRMD_STATE_INACTIVE "down" #define CRMD_JOINSTATE_DOWN CRMD_STATE_INACTIVE #define CRMD_JOINSTATE_PENDING "pending" #define CRMD_JOINSTATE_MEMBER CRMD_STATE_ACTIVE #define CRMD_JOINSTATE_NACK "banned" #define CRMD_ACTION_DELETE "delete" #define CRMD_ACTION_CANCEL "cancel" #define CRMD_ACTION_MIGRATE "migrate_to" #define CRMD_ACTION_MIGRATED "migrate_from" #define CRMD_ACTION_START "start" #define CRMD_ACTION_STARTED "running" #define CRMD_ACTION_STOP "stop" #define CRMD_ACTION_STOPPED "stopped" #define CRMD_ACTION_PROMOTE "promote" #define CRMD_ACTION_PROMOTED "promoted" #define CRMD_ACTION_DEMOTE "demote" #define CRMD_ACTION_DEMOTED "demoted" #define CRMD_ACTION_NOTIFY "notify" #define CRMD_ACTION_NOTIFIED "notified" #define CRMD_ACTION_STATUS "monitor" /* short names */ #define RSC_DELETE CRMD_ACTION_DELETE #define RSC_CANCEL CRMD_ACTION_CANCEL #define RSC_MIGRATE CRMD_ACTION_MIGRATE #define RSC_MIGRATED CRMD_ACTION_MIGRATED #define RSC_START CRMD_ACTION_START #define RSC_STARTED CRMD_ACTION_STARTED #define RSC_STOP CRMD_ACTION_STOP #define RSC_STOPPED CRMD_ACTION_STOPPED #define RSC_PROMOTE CRMD_ACTION_PROMOTE #define RSC_PROMOTED CRMD_ACTION_PROMOTED #define RSC_DEMOTE CRMD_ACTION_DEMOTE #define RSC_DEMOTED 
CRMD_ACTION_DEMOTED #define RSC_NOTIFY CRMD_ACTION_NOTIFY #define RSC_NOTIFIED CRMD_ACTION_NOTIFIED #define RSC_STATUS CRMD_ACTION_STATUS typedef GList* GListPtr; #define slist_destroy(child_type, child, parent, a) \ { \ GListPtr __crm_iter_head = parent; \ child_type *child = NULL; \ while(__crm_iter_head != NULL) { \ child = (child_type *) __crm_iter_head->data; \ __crm_iter_head = __crm_iter_head->next; \ { a; } \ } \ g_list_free(parent); \ } #define slist_iter(child, child_type, parent, counter, a) \ { \ GListPtr __crm_iter_head = parent; \ child_type *child = NULL; \ int counter = 0; \ for(; __crm_iter_head != NULL; counter++) { \ child = (child_type *) __crm_iter_head->data; \ __crm_iter_head = __crm_iter_head->next; \ { a; } \ } \ } #define LOG_DEBUG_2 LOG_DEBUG+1 #define LOG_DEBUG_3 LOG_DEBUG+2 #define LOG_DEBUG_4 LOG_DEBUG+3 #define LOG_DEBUG_5 LOG_DEBUG+4 #define LOG_DEBUG_6 LOG_DEBUG+5 #define LOG_MSG LOG_DEBUG_3 /* * Throughout the macros below, note the leading, pre-comma, space in the * various ' , ##args' occurences to aid portability across versions of 'gcc'. * http://gcc.gnu.org/onlinedocs/cpp/Variadic-Macros.html#Variadic-Macros */ #define do_crm_log(level, fmt, args...) do { \ if(crm_log_level < (level)) { \ continue; \ } else if((level) > LOG_DEBUG) { \ cl_log(LOG_DEBUG, "debug%d: %s: " fmt, \ level-LOG_INFO, __PRETTY_FUNCTION__ , ##args); \ } else { \ cl_log(level, "%s: " fmt, \ __PRETTY_FUNCTION__ , ##args); \ } \ } while(0) #define do_crm_log_always(level, fmt, args...) cl_log(level, "%s: " fmt, __PRETTY_FUNCTION__ , ##args) #define crm_crit(fmt, args...) do_crm_log_always(LOG_CRIT, fmt , ##args) #define crm_err(fmt, args...) do_crm_log_always(LOG_ERR, fmt , ##args) #define crm_warn(fmt, args...) do_crm_log_always(LOG_WARNING, fmt , ##args) #define crm_notice(fmt, args...) do_crm_log_always(LOG_NOTICE, fmt , ##args) #define crm_info(fmt, args...) do_crm_log_always(LOG_INFO, fmt , ##args) #define crm_debug(fmt, args...) do_crm_log(LOG_DEBUG, fmt , ##args) #define crm_debug_2(fmt, args...) do_crm_log(LOG_DEBUG_2, fmt , ##args) #define crm_debug_3(fmt, args...) do_crm_log(LOG_DEBUG_3, fmt , ##args) #define crm_debug_4(fmt, args...) do_crm_log(LOG_DEBUG_4, fmt , ##args) #define crm_debug_5(fmt, args...) do_crm_log(LOG_DEBUG_5, fmt , ##args) #define crm_debug_6(fmt, args...) do_crm_log(LOG_DEBUG_6, fmt , ##args) +#define crm_perror(level, fmt, args...) 
do { \ + const char *err = strerror(errno); \ + cl_log(level, "%s: " fmt ": %s", __PRETTY_FUNCTION__, ##args, err); \ + } while(0) #include #define crm_log_xml(level, text, xml) if(crm_log_level >= (level)) { \ print_xml_formatted(level, __PRETTY_FUNCTION__, xml, text); \ } #define crm_log_xml_crit(xml, text) crm_log_xml(LOG_CRIT, text, xml) #define crm_log_xml_err(xml, text) crm_log_xml(LOG_ERR, text, xml) #define crm_log_xml_warn(xml, text) crm_log_xml(LOG_WARNING, text, xml) #define crm_log_xml_notice(xml, text) crm_log_xml(LOG_NOTICE, text, xml) #define crm_log_xml_info(xml, text) crm_log_xml(LOG_INFO, text, xml) #define crm_log_xml_debug(xml, text) crm_log_xml(LOG_DEBUG, text, xml) #define crm_log_xml_debug_2(xml, text) crm_log_xml(LOG_DEBUG_2, text, xml) #define crm_log_xml_debug_3(xml, text) crm_log_xml(LOG_DEBUG_3, text, xml) #define crm_log_xml_debug_4(xml, text) crm_log_xml(LOG_DEBUG_4, text, xml) #define crm_log_xml_debug_5(xml, text) crm_log_xml(LOG_DEBUG_5, text, xml) #define crm_str(x) (const char*)(x?x:"") #define crm_malloc0(malloc_obj, length) do { \ malloc_obj = malloc(length); \ if(malloc_obj == NULL) { \ crm_err("Failed allocation of %lu bytes", (unsigned long)length); \ CRM_ASSERT(malloc_obj != NULL); \ } \ memset(malloc_obj, 0, length); \ } while(0) #define crm_malloc(malloc_obj, length) do { \ malloc_obj = malloc(length); \ if(malloc_obj == NULL) { \ crm_err("Failed allocation of %lu bytes", (unsigned long)length); \ CRM_ASSERT(malloc_obj != NULL); \ } \ } while(0) #define crm_realloc(realloc_obj, length) do { \ realloc_obj = realloc(realloc_obj, length); \ CRM_ASSERT(realloc_obj != NULL); \ } while(0) #define crm_free(free_obj) do { free(free_obj); free_obj=NULL; } while(0) #define crm_msg_del(msg) do { if(msg != NULL) { ha_msg_del(msg); msg = NULL; } } while(0) #define crm_strdup(str) crm_strdup_fn(str, __FILE__, __PRETTY_FUNCTION__, __LINE__) #endif diff --git a/lib/pengine/utils.c b/lib/pengine/utils.c index c1d75fbb56..e778d6e856 100644 --- a/lib/pengine/utils.c +++ b/lib/pengine/utils.c @@ -1,1359 +1,1359 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
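/* A minimal usage sketch for the crm_perror() macro added above: the
 * macro expands the format first and appends strerror(errno), so call
 * it before anything that might clobber errno.  CIB_FILENAME comes
 * from crm.h; the function itself is hypothetical. */
#if 0
#include <fcntl.h>
#include <unistd.h>
static void read_cib_example(void)
{
	int fd = open(CIB_FILENAME, O_RDONLY);
	if(fd < 0) {
		/* logs e.g. "read_cib_example: Could not open
		 * /var/lib/heartbeat/crm/cib.xml: Permission denied" */
		crm_perror(LOG_ERR, "Could not open %s", CIB_FILENAME);
		return;
	}
	close(fd);
}
#endif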
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include extern xmlNode *get_object_root(const char *object_type,xmlNode *the_root); void print_str_str(gpointer key, gpointer value, gpointer user_data); gboolean ghash_free_str_str(gpointer key, gpointer value, gpointer user_data); void unpack_operation( action_t *action, xmlNode *xml_obj, pe_working_set_t* data_set); void pe_free_shallow(GListPtr alist) { pe_free_shallow_adv(alist, TRUE); } void pe_free_shallow_adv(GListPtr alist, gboolean with_data) { GListPtr item; GListPtr item_next = alist; if(with_data == FALSE && alist != NULL) { g_list_free(alist); return; } while(item_next != NULL) { item = item_next; item_next = item_next->next; if(with_data) { /* crm_debug_5("freeing %p", item->data); */ crm_free(item->data); } item->data = NULL; item->next = NULL; g_list_free_1(item); } } node_t * node_copy(node_t *this_node) { node_t *new_node = NULL; CRM_CHECK(this_node != NULL, return NULL); crm_malloc0(new_node, sizeof(node_t)); CRM_ASSERT(new_node != NULL); crm_debug_5("Copying %p (%s) to %p", this_node, this_node->details->uname, new_node); new_node->weight = this_node->weight; new_node->fixed = this_node->fixed; new_node->details = this_node->details; return new_node; } /* are the contents of list1 and list2 equal * nodes with weight < 0 are ignored if filter == TRUE * * slow but linear * */ gboolean node_list_eq(GListPtr list1, GListPtr list2, gboolean filter) { node_t *other_node; GListPtr lhs = list1; GListPtr rhs = list2; slist_iter( node, node_t, lhs, lpc, if(node == NULL || (filter && node->weight < 0)) { continue; } other_node = (node_t*) pe_find_node_id(rhs, node->details->id); if(other_node == NULL || other_node->weight < 0) { return FALSE; } ); lhs = list2; rhs = list1; slist_iter( node, node_t, lhs, lpc, if(node == NULL || (filter && node->weight < 0)) { continue; } other_node = (node_t*) pe_find_node_id(rhs, node->details->id); if(other_node == NULL || other_node->weight < 0) { return FALSE; } ); return TRUE; } /* any node in list1 or list2 and not in the other gets a score of -INFINITY */ GListPtr node_list_exclude(GListPtr list1, GListPtr list2) { node_t *other_node = NULL; GListPtr result = NULL; result = node_list_dup(list1, FALSE, FALSE); slist_iter( node, node_t, result, lpc, other_node = pe_find_node_id(list2, node->details->id); if(other_node == NULL) { node->weight = -INFINITY; } else { node->weight = merge_weights(node->weight, other_node->weight); } ); slist_iter( node, node_t, list2, lpc, other_node = pe_find_node_id(result, node->details->id); if(other_node == NULL) { node_t *new_node = node_copy(node); new_node->weight = -INFINITY; result = g_list_append(result, new_node); } ); return result; } /* the intersection of list1 and list2 */ GListPtr node_list_and(GListPtr list1, GListPtr list2, gboolean filter) { GListPtr result = NULL; unsigned lpc = 0; for(lpc = 0; lpc < g_list_length(list1); lpc++) { node_t *node = (node_t*)g_list_nth_data(list1, lpc); node_t *other_node = pe_find_node_id(list2, node->details->id); node_t *new_node = NULL; if(other_node != NULL) { new_node = node_copy(node); } if(new_node != NULL) { crm_debug_4("%s: %d + %d", node->details->uname, other_node->weight, new_node->weight); new_node->weight = merge_weights( new_node->weight, other_node->weight); crm_debug_3("New node 
weight for %s: %d", new_node->details->uname, new_node->weight); if(filter && new_node->weight < 0) { crm_free(new_node); new_node = NULL; } } if(new_node != NULL) { result = g_list_append(result, new_node); } } return result; } /* list1 - list2 */ GListPtr node_list_minus(GListPtr list1, GListPtr list2, gboolean filter) { GListPtr result = NULL; slist_iter( node, node_t, list1, lpc, node_t *other_node = pe_find_node_id(list2, node->details->id); node_t *new_node = NULL; if(node == NULL || other_node != NULL || (filter && node->weight < 0)) { continue; } new_node = node_copy(node); result = g_list_append(result, new_node); ); crm_debug_3("Minus result len: %d", g_list_length(result)); return result; } /* list1 + list2 - (intersection of list1 and list2) */ GListPtr node_list_xor(GListPtr list1, GListPtr list2, gboolean filter) { GListPtr result = NULL; slist_iter( node, node_t, list1, lpc, node_t *new_node = NULL; node_t *other_node = (node_t*) pe_find_node_id(list2, node->details->id); if(node == NULL || other_node != NULL || (filter && node->weight < 0)) { continue; } new_node = node_copy(node); result = g_list_append(result, new_node); ); slist_iter( node, node_t, list2, lpc, node_t *new_node = NULL; node_t *other_node = (node_t*) pe_find_node_id(list1, node->details->id); if(node == NULL || other_node != NULL || (filter && node->weight < 0)) { continue; } new_node = node_copy(node); result = g_list_append(result, new_node); ); crm_debug_3("Xor result len: %d", g_list_length(result)); return result; } GListPtr node_list_or(GListPtr list1, GListPtr list2, gboolean filter) { node_t *other_node = NULL; GListPtr result = NULL; gboolean needs_filter = FALSE; result = node_list_dup(list1, FALSE, filter); slist_iter( node, node_t, list2, lpc, if(node == NULL) { continue; } other_node = (node_t*)pe_find_node_id( result, node->details->id); if(other_node != NULL) { crm_debug_4("%s + %s: %d + %d", node->details->uname, other_node->details->uname, node->weight, other_node->weight); other_node->weight = merge_weights( other_node->weight, node->weight); if(filter && node->weight < 0) { needs_filter = TRUE; } } else if(filter == FALSE || node->weight >= 0) { node_t *new_node = node_copy(node); result = g_list_append(result, new_node); } ); /* not the neatest way, but the most expedient for now */ if(filter && needs_filter) { GListPtr old_result = result; result = node_list_dup(old_result, FALSE, filter); pe_free_shallow_adv(old_result, TRUE); } return result; } GListPtr node_list_dup(GListPtr list1, gboolean reset, gboolean filter) { GListPtr result = NULL; slist_iter( this_node, node_t, list1, lpc, node_t *new_node = NULL; if(filter && this_node->weight < 0) { continue; } new_node = node_copy(this_node); if(reset) { new_node->weight = 0; } if(new_node != NULL) { result = g_list_append(result, new_node); } ); return result; } void dump_node_scores(int level, resource_t *rsc, const char *comment, GListPtr nodes) { GListPtr list = nodes; if(rsc) { list = rsc->allowed_nodes; } slist_iter( node, node_t, list, lpc, if(level == 0) { if(rsc) { fprintf(stdout, "%s: %s allocation score on %s: %d\n", comment, rsc->id, node->details->uname, node->weight); } else { fprintf(stdout, "%s: %s = %d\n", comment, node->details->uname, node->weight); } } else { if(rsc) { do_crm_log(level, "%s: %s allocation score on %s: %d", comment, rsc->id, node->details->uname, node->weight); } else { do_crm_log(level, "%s: %s = %d", comment, node->details->uname, node->weight); } } ); if(rsc && rsc->children) { slist_iter( child, 
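/* Worked example for the node-list set operations above, with
 * hypothetical nodes and weights: given list1 = {A:1, B:2} and
 * list2 = {B:3, C:4}, node_list_and() yields {B:merge_weights(2,3)},
 * node_list_minus() yields {A:1}, node_list_xor() yields {A:1, C:4},
 * and node_list_or() yields {A:1, B:merge_weights(2,3), C:4}. */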
resource_t, rsc->children, lpc, dump_node_scores(level, child, comment, nodes); ); } } gint sort_rsc_index(gconstpointer a, gconstpointer b) { const resource_t *resource1 = (const resource_t*)a; const resource_t *resource2 = (const resource_t*)b; if(a == NULL && b == NULL) { return 0; } if(a == NULL) { return 1; } if(b == NULL) { return -1; } if(resource1->sort_index > resource2->sort_index) { return -1; } if(resource1->sort_index < resource2->sort_index) { return 1; } return 0; } gint sort_rsc_priority(gconstpointer a, gconstpointer b) { const resource_t *resource1 = (const resource_t*)a; const resource_t *resource2 = (const resource_t*)b; if(a == NULL && b == NULL) { return 0; } if(a == NULL) { return 1; } if(b == NULL) { return -1; } if(resource1->priority > resource2->priority) { return -1; } if(resource1->priority < resource2->priority) { return 1; } return 0; } action_t * custom_action(resource_t *rsc, char *key, const char *task, node_t *on_node, gboolean optional, gboolean save_action, pe_working_set_t *data_set) { action_t *action = NULL; GListPtr possible_matches = NULL; CRM_CHECK(key != NULL, return NULL); CRM_CHECK(task != NULL, return NULL); if(save_action && rsc != NULL) { possible_matches = find_actions(rsc->actions, key, on_node); } if(possible_matches != NULL) { crm_free(key); if(g_list_length(possible_matches) > 1) { pe_warn("Action %s for %s on %s exists %d times", task, rsc?rsc->id:"", on_node?on_node->details->uname:"", g_list_length(possible_matches)); } action = g_list_nth_data(possible_matches, 0); crm_debug_4("Found existing action (%d) %s for %s on %s", action->id, task, rsc?rsc->id:"", on_node?on_node->details->uname:""); g_list_free(possible_matches); } if(action == NULL) { if(save_action) { crm_debug_4("Creating%s action %d: %s for %s on %s", optional?"":" manditory", data_set->action_id, key, rsc?rsc->id:"", on_node?on_node->details->uname:""); } crm_malloc0(action, sizeof(action_t)); if(save_action) { action->id = data_set->action_id++; } else { action->id = 0; } action->rsc = rsc; CRM_ASSERT(task != NULL); action->task = crm_strdup(task); action->node = on_node; action->uuid = key; action->actions_before = NULL; action->actions_after = NULL; action->failure_is_fatal = TRUE; action->pseudo = FALSE; action->dumped = FALSE; action->runnable = TRUE; action->processed = FALSE; action->optional = optional; action->seen_count = 0; action->extra = g_hash_table_new_full( g_str_hash, g_str_equal, free, free); action->meta = g_hash_table_new_full( g_str_hash, g_str_equal, free, free); if(save_action) { data_set->actions = g_list_append( data_set->actions, action); } if(rsc != NULL) { action->op_entry = find_rsc_op_entry(rsc, key); unpack_operation( action, action->op_entry, data_set); if(save_action) { rsc->actions = g_list_append( rsc->actions, action); } } if(save_action) { crm_debug_4("Action %d created", action->id); } } if(optional == FALSE && action->optional) { crm_debug_2("Action %d (%s) marked manditory", action->id, action->uuid); action->optional = FALSE; } if(rsc != NULL) { enum action_tasks a_task = text2task(action->task); int warn_level = LOG_DEBUG_3; if(save_action) { warn_level = LOG_WARNING; } if(action->node != NULL && action->op_entry != NULL) { unpack_instance_attributes( action->op_entry, XML_TAG_ATTR_SETS, action->node->details->attrs, action->extra, NULL, FALSE, data_set->now); } if(action->pseudo) { /* leave untouched */ } else if(action->node == NULL) { action->runnable = FALSE; } else if(g_hash_table_lookup(action->meta, 
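/* Usage note for the comparators above (hypothetical call): both sort
 * in descending order and push NULL entries to the tail, so e.g.
 * g_list_sort(data_set->resources, sort_rsc_priority) leaves the
 * highest-priority resource at the head of the list. */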
XML_LRM_ATTR_INTERVAL) == NULL && is_not_set(rsc->flags, pe_rsc_managed)) { do_crm_log(LOG_DEBUG, "Action %s (unmanaged)", action->uuid); action->optional = TRUE; /* action->runnable = FALSE; */ } else if(action->node->details->online == FALSE) { action->runnable = FALSE; do_crm_log(warn_level, "Action %s on %s is unrunnable (offline)", action->uuid, action->node->details->uname); if(is_set(action->rsc->flags, pe_rsc_managed) && save_action && a_task == stop_rsc) { do_crm_log(warn_level, "Marking node %s unclean", action->node->details->uname); action->node->details->unclean = TRUE; } } else if(action->node->details->pending) { action->runnable = FALSE; do_crm_log(warn_level, "Action %s on %s is unrunnable (pending)", action->uuid, action->node->details->uname); } else if(action->needs == rsc_req_nothing) { crm_debug_3("Action %s doesnt require anything", action->uuid); action->runnable = TRUE; #if 0 /* * No point checking this * - if we dont have quorum we cant stonith anyway */ } else if(action->needs == rsc_req_stonith) { crm_debug_3("Action %s requires only stonith", action->uuid); action->runnable = TRUE; #endif } else if(is_set(data_set->flags, pe_flag_have_quorum) == FALSE && data_set->no_quorum_policy == no_quorum_stop) { action->runnable = FALSE; crm_debug("%s\t%s (cancelled : quorum)", action->node->details->uname, action->uuid); } else if(is_set(data_set->flags, pe_flag_have_quorum) == FALSE && data_set->no_quorum_policy == no_quorum_freeze) { crm_debug_3("Check resource is already active"); if(rsc->fns->active(rsc, TRUE) == FALSE) { action->runnable = FALSE; crm_debug("%s\t%s (cancelled : quorum freeze)", action->node->details->uname, action->uuid); } } else { crm_debug_3("Action %s is runnable", action->uuid); action->runnable = TRUE; } if(save_action) { switch(a_task) { case stop_rsc: set_bit(rsc->flags, pe_rsc_stopping); break; case start_rsc: clear_bit(rsc->flags, pe_rsc_starting); if(action->runnable) { set_bit(rsc->flags, pe_rsc_starting); } break; default: break; } } } return action; } void unpack_operation( action_t *action, xmlNode *xml_obj, pe_working_set_t* data_set) { int value_i = 0; unsigned long long interval = 0; unsigned long long start_delay = 0; char *value_ms = NULL; const char *class = NULL; const char *value = NULL; const char *field = NULL; CRM_CHECK(action->rsc != NULL, return); unpack_instance_attributes(data_set->op_defaults, XML_TAG_META_SETS, NULL, action->meta, NULL, FALSE, data_set->now); xml_prop_iter(xml_obj, name, value, if(value != NULL && g_hash_table_lookup(action->meta, name) == NULL) { g_hash_table_insert(action->meta, crm_strdup(name), crm_strdup(value)); } ); unpack_instance_attributes(xml_obj, XML_TAG_META_SETS, NULL, action->meta, NULL, FALSE, data_set->now); unpack_instance_attributes(xml_obj, XML_TAG_ATTR_SETS, NULL, action->meta, NULL, FALSE, data_set->now); g_hash_table_remove(action->meta, "id"); class = g_hash_table_lookup(action->rsc->meta, "class"); - value = g_hash_table_lookup(action->meta, "prereq"); + value = g_hash_table_lookup(action->meta, "requires"); if(value == NULL && safe_str_neq(action->task, CRMD_ACTION_START)) { /* todo: integrate stop as an option? 
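* For reference, the mapping computed below is: "nothing" ->
* rsc_req_nothing, "quorum" -> rsc_req_quorum, "fencing" ->
* rsc_req_stonith; with no explicit value, non-start actions,
* stonith-class resources, and a no-quorum-policy of ignore all
* default to nothing, no-quorum-policy=freeze with fencing enabled
* defaults to fencing, and everything else defaults to quorum.
* Further down, the interval-origin handling rolls the origin forward
* by whole intervals until it passes 'now'; e.g. an origin of 09:00
* with a 15-minute interval evaluated at 09:40 rolls to 09:45, a
* start delay of 5 minutes (300000ms).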
*/ action->needs = rsc_req_nothing; value = "nothing (default)"; } else if(safe_str_eq(value, "nothing")) { action->needs = rsc_req_nothing; } else if(safe_str_eq(value, "quorum")) { action->needs = rsc_req_quorum; } else if(safe_str_eq(value, "fencing")) { action->needs = rsc_req_stonith; } else if(data_set->no_quorum_policy == no_quorum_ignore || safe_str_eq(class, "stonith")) { action->needs = rsc_req_nothing; value = "nothing (default)"; } else if(data_set->no_quorum_policy == no_quorum_freeze && is_set(data_set->flags, pe_flag_stonith_enabled)) { action->needs = rsc_req_stonith; value = "fencing (default)"; } else { action->needs = rsc_req_quorum; value = "quorum (default)"; } if(safe_str_eq(class, "stonith")) { if(action->needs == rsc_req_stonith) { crm_config_err("Stonith resources (eg. %s) cannot require" " fencing to start", action->rsc->id); } action->needs = rsc_req_nothing; value = "nothing (fencing override)"; } crm_debug_3("\tAction %s requires: %s", action->task, value); value = g_hash_table_lookup(action->meta, XML_OP_ATTR_ON_FAIL); if(safe_str_eq(action->task, CRMD_ACTION_STOP) && safe_str_eq(value, "standby")) { crm_config_err("on-fail=standby is not allowed for stop actions: %s", action->rsc->id); value = NULL; } if(value == NULL) { } else if(safe_str_eq(value, "block")) { action->on_fail = action_fail_block; } else if(safe_str_eq(value, "fence")) { action->on_fail = action_fail_fence; value = "node fencing"; if(is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) { crm_config_err("Specifying on_fail=fence and" " stonith-enabled=false makes no sense"); action->on_fail = action_fail_stop; action->fail_role = RSC_ROLE_STOPPED; value = "stop resource"; } } else if(safe_str_eq(value, "standby")) { action->on_fail = action_fail_standby; value = "node standby"; } else if(safe_str_eq(value, "ignore") || safe_str_eq(value, "nothing")) { action->on_fail = action_fail_ignore; value = "ignore"; } else if(safe_str_eq(value, "migrate")) { action->on_fail = action_fail_migrate; value = "force migration"; } else if(safe_str_eq(value, "stop")) { action->on_fail = action_fail_stop; action->fail_role = RSC_ROLE_STOPPED; value = "stop resource"; } else if(safe_str_eq(value, "restart")) { action->on_fail = action_fail_recover; value = "restart (and possibly migrate)"; } else { pe_err("Resource %s: Unknown failure type (%s)", action->rsc->id, value); value = NULL; } /* defaults */ if(value == NULL && safe_str_eq(action->task, CRMD_ACTION_STOP)) { if(is_set(data_set->flags, pe_flag_stonith_enabled)) { action->on_fail = action_fail_fence; value = "resource fence (default)"; } else { action->on_fail = action_fail_block; value = "resource block (default)"; } } else if(value == NULL && safe_str_eq(action->task, CRMD_ACTION_MIGRATED)) { action->on_fail = action_migrate_failure; value = "atomic migration recovery (default)"; } else if(value == NULL) { action->on_fail = action_fail_recover; value = "restart (and possibly migrate) (default)"; } crm_debug_3("\t%s failure handling: %s", action->task, value); value = NULL; if(xml_obj != NULL) { value = g_hash_table_lookup(action->meta, "role_after_failure"); } if(value != NULL && action->fail_role == RSC_ROLE_UNKNOWN) { action->fail_role = text2role(value); } /* defaults */ if(action->fail_role == RSC_ROLE_UNKNOWN) { if(safe_str_eq(action->task, CRMD_ACTION_PROMOTE)) { action->fail_role = RSC_ROLE_SLAVE; } else { action->fail_role = RSC_ROLE_STARTED; } } crm_debug_3("\t%s failure results in: %s", action->task, role2text(action->fail_role)); field 
= XML_LRM_ATTR_INTERVAL; value = g_hash_table_lookup(action->meta, field); if(value != NULL) { interval = crm_get_interval(value); if(interval > 0) { value_ms = crm_itoa(interval); g_hash_table_replace(action->meta, crm_strdup(field), value_ms); } else { g_hash_table_remove(action->meta, field); } } field = XML_OP_ATTR_START_DELAY; value = g_hash_table_lookup(action->meta, field); if(value != NULL) { value_i = crm_get_msec(value); if(value_i < 0) { value_i = 0; } start_delay = value_i; value_ms = crm_itoa(value_i); g_hash_table_replace(action->meta, crm_strdup(field), value_ms); } else if(interval > 0 && g_hash_table_lookup(action->meta, XML_OP_ATTR_ORIGIN)) { char *date_str = NULL; char *date_str_mutable = NULL; ha_time_t *origin = NULL; value = g_hash_table_lookup(action->meta, XML_OP_ATTR_ORIGIN); date_str = crm_strdup(value); date_str_mutable = date_str; origin = parse_date(&date_str_mutable); crm_free(date_str); if(origin == NULL) { crm_config_err("Operation %s contained an invalid "XML_OP_ATTR_ORIGIN": %s", ID(xml_obj), value); } else { ha_time_t *delay = NULL; int rc = compare_date(origin, data_set->now); unsigned long long delay_s = 0; while(rc < 0) { add_seconds(origin, interval/1000); rc = compare_date(origin, data_set->now); } delay = subtract_time(origin, data_set->now); delay_s = date_in_seconds(delay); /* log_date(LOG_DEBUG_5, "delay", delay, ha_log_date|ha_log_time|ha_log_local); */ crm_info("Calculated a start delay of %llus for %s", delay_s, ID(xml_obj)); g_hash_table_replace(action->meta, crm_strdup(XML_OP_ATTR_START_DELAY), crm_itoa(delay_s * 1000)); start_delay = delay_s * 1000; free_ha_date(origin); free_ha_date(delay); } } field = XML_ATTR_TIMEOUT; value = g_hash_table_lookup(action->meta, field); if(value == NULL) { value = pe_pref( data_set->config_hash, "default-action-timeout"); } value_i = crm_get_msec(value); if(value_i < 0) { value_i = 0; } value_i += start_delay; value_ms = crm_itoa(value_i); g_hash_table_replace(action->meta, crm_strdup(field), value_ms); } xmlNode * find_rsc_op_entry(resource_t *rsc, const char *key) { int number = 0; const char *name = NULL; const char *value = NULL; const char *interval = NULL; char *match_key = NULL; xmlNode *op = NULL; xml_child_iter_filter( rsc->ops_xml, operation, "op", name = crm_element_value(operation, "name"); interval = crm_element_value(operation, XML_LRM_ATTR_INTERVAL); value = crm_element_value(operation, "disabled"); if(crm_is_true(value)) { crm_debug_2("%s disabled", ID(operation)); continue; } number = crm_get_interval(interval); if(number < 0) { continue; } match_key = generate_op_key(rsc->id, name, number); if(safe_str_eq(key, match_key)) { op = operation; } crm_free(match_key); if(op != NULL) { return op; } ); crm_debug_3("No match for %s", key); return op; } void print_node(const char *pre_text, node_t *node, gboolean details) { if(node == NULL) { crm_debug_4("%s%s: ", pre_text==NULL?"":pre_text, pre_text==NULL?"":": "); return; } crm_debug_4("%s%s%sNode %s: (weight=%d, fixed=%s)", pre_text==NULL?"":pre_text, pre_text==NULL?"":": ", node->details==NULL?"error ":node->details->online?"":"Unavailable/Unclean ", node->details->uname, node->weight, node->fixed?"True":"False"); if(details && node != NULL && node->details != NULL) { char *pe_mutable = crm_strdup("\t\t"); crm_debug_4("\t\t===Node Attributes"); g_hash_table_foreach(node->details->attrs, print_str_str, pe_mutable); crm_free(pe_mutable); crm_debug_4("\t\t=== Resources"); slist_iter( rsc, resource_t, node->details->running_rsc, lpc, 
print_resource(LOG_DEBUG_4, "\t\t", rsc, FALSE); ); } } /* * Used by the HashTable for-loop */ void print_str_str(gpointer key, gpointer value, gpointer user_data) { crm_debug_4("%s%s %s ==> %s", user_data==NULL?"":(char*)user_data, user_data==NULL?"":": ", (char*)key, (char*)value); } void print_resource( int log_level, const char *pre_text, resource_t *rsc, gboolean details) { long options = pe_print_log; if(rsc == NULL) { do_crm_log(log_level-1, "%s%s: ", pre_text==NULL?"":pre_text, pre_text==NULL?"":": "); return; } if(details) { options |= pe_print_details; } rsc->fns->print(rsc, pre_text, options, &log_level); } void pe_free_action(action_t *action) { if(action == NULL) { return; } pe_free_shallow(action->actions_before);/* action_warpper_t* */ pe_free_shallow(action->actions_after); /* action_warpper_t* */ g_hash_table_destroy(action->extra); g_hash_table_destroy(action->meta); crm_free(action->task); crm_free(action->uuid); crm_free(action); } GListPtr find_recurring_actions(GListPtr input, node_t *not_on_node) { const char *value = NULL; GListPtr result = NULL; CRM_CHECK(input != NULL, return NULL); slist_iter( action, action_t, input, lpc, value = g_hash_table_lookup(action->meta, XML_LRM_ATTR_INTERVAL); if(value == NULL) { /* skip */ } else if(safe_str_eq(value, "0")) { /* skip */ } else if(safe_str_eq(CRMD_ACTION_CANCEL, action->task)) { /* skip */ } else if(not_on_node == NULL) { crm_debug_5("(null) Found: %s", action->uuid); result = g_list_append(result, action); } else if(action->node == NULL) { /* skip */ } else if(action->node->details != not_on_node->details) { crm_debug_5("Found: %s", action->uuid); result = g_list_append(result, action); } ); return result; } GListPtr find_actions(GListPtr input, const char *key, node_t *on_node) { GListPtr result = NULL; CRM_CHECK(key != NULL, return NULL); slist_iter( action, action_t, input, lpc, crm_debug_5("Matching %s against %s", key, action->uuid); if(safe_str_neq(key, action->uuid)) { continue; } else if(on_node == NULL) { result = g_list_append(result, action); } else if(action->node == NULL) { /* skip */ crm_debug_2("While looking for %s action on %s, " "found an unallocated one. Assigning" " it to the requested node...", key, on_node->details->uname); action->node = on_node; result = g_list_append(result, action); } else if(safe_str_eq(on_node->details->id, action->node->details->id)) { result = g_list_append(result, action); } ); return result; } GListPtr find_actions_exact(GListPtr input, const char *key, node_t *on_node) { GListPtr result = NULL; CRM_CHECK(key != NULL, return NULL); slist_iter( action, action_t, input, lpc, crm_debug_5("Matching %s against %s", key, action->uuid); if(safe_str_neq(key, action->uuid)) { crm_debug_3("Key mismatch: %s vs. %s", key, action->uuid); continue; } else if(on_node == NULL || action->node == NULL) { crm_debug_3("on_node=%p, action->node=%p", on_node, action->node); continue; } else if(safe_str_eq(on_node->details->id, action->node->details->id)) { result = g_list_append(result, action); } crm_debug_2("Node mismatch: %s vs. %s", on_node->details->id, action->node->details->id); ); return result; } void set_id(xmlNode * xml_obj, const char *prefix, int child) { int id_len = 0; gboolean use_prefix = TRUE; gboolean use_child = TRUE; char *new_id = NULL; const char *id = crm_element_value(xml_obj, XML_ATTR_ID); id_len = 1 + strlen(id); if(child > 999) { pe_err("Are you insane?!?" 
" The CRM does not support > 1000 children per resource"); return; } else if(child < 0) { use_child = FALSE; } else { id_len += 4; /* child */ } if(prefix == NULL || safe_str_eq(id, prefix)) { use_prefix = FALSE; } else { id_len += (1 + strlen(prefix)); } crm_malloc0(new_id, id_len); if(use_child) { snprintf(new_id, id_len, "%s%s%s:%d", use_prefix?prefix:"", use_prefix?":":"", id, child); } else { snprintf(new_id, id_len, "%s%s%s", use_prefix?prefix:"", use_prefix?":":"", id); } crm_xml_add(xml_obj, XML_ATTR_ID, new_id); crm_free(new_id); } static void resource_node_score(resource_t *rsc, node_t *node, int score, const char *tag) { node_t *match = NULL; if(rsc->children) { slist_iter( child_rsc, resource_t, rsc->children, lpc, resource_node_score(child_rsc, node, score, tag); ); } crm_debug_2("Setting %s for %s on %s: %d", tag, rsc->id, node->details->uname, score); match = pe_find_node_id(rsc->allowed_nodes, node->details->id); if(match == NULL) { match = node_copy(node); match->weight = 0; rsc->allowed_nodes = g_list_append(rsc->allowed_nodes, match); } match->weight = merge_weights(match->weight, score); } void resource_location(resource_t *rsc, node_t *node, int score, const char *tag, pe_working_set_t *data_set) { if(node != NULL) { resource_node_score(rsc, node, score, tag); } else if(data_set != NULL) { slist_iter( node, node_t, data_set->nodes, lpc, resource_node_score(rsc, node, score, tag); ); } else { slist_iter( node, node_t, rsc->allowed_nodes, lpc, resource_node_score(rsc, node, score, tag); ); } if(node == NULL && score == -INFINITY) { if(rsc->allocated_to) { crm_info("Deallocating %s from %s", rsc->id, rsc->allocated_to->details->uname); crm_free(rsc->allocated_to); rsc->allocated_to = NULL; } } } #define sort_return(an_int) crm_free(a_uuid); crm_free(b_uuid); return an_int gint sort_op_by_callid(gconstpointer a, gconstpointer b) { char *a_uuid = NULL; char *b_uuid = NULL; const xmlNode *xml_a = a; const xmlNode *xml_b = b; const char *a_xml_id = crm_element_value_const(xml_a, XML_ATTR_ID); const char *b_xml_id = crm_element_value_const(xml_b, XML_ATTR_ID); const char *a_task_id = crm_element_value_const(xml_a, XML_LRM_ATTR_CALLID); const char *b_task_id = crm_element_value_const(xml_b, XML_LRM_ATTR_CALLID); const char *a_key = crm_element_value_const(xml_a, XML_ATTR_TRANSITION_MAGIC); const char *b_key = crm_element_value_const(xml_b, XML_ATTR_TRANSITION_MAGIC); int dummy = -1; int a_id = -1; int b_id = -1; int a_rc = -1; int b_rc = -1; int a_status = -1; int b_status = -1; int a_call_id = -1; int b_call_id = -1; if(safe_str_eq(a_xml_id, b_xml_id)) { /* We have duplicate lrm_rsc_op entries in the status * section which is unliklely to be a good thing * - we can handle it easily enough, but we need to get * to the bottom of why its happening. 
*/ pe_err("Duplicate lrm_rsc_op entries named %s", a_xml_id); sort_return(0); } CRM_CHECK(a_task_id != NULL && b_task_id != NULL, crm_err("a: %s, b: %s", crm_str(a_xml_id), crm_str(b_xml_id)); sort_return(0)); a_call_id = crm_parse_int(a_task_id, NULL); b_call_id = crm_parse_int(b_task_id, NULL); if(a_call_id == -1 && b_call_id == -1) { /* both are pending ops so it doesn't matter since * stops are never pending */ sort_return(0); } else if(a_call_id >= 0 && a_call_id < b_call_id) { crm_debug_4("%s (%d) < %s (%d) : call id", a_xml_id, a_call_id, b_xml_id, b_call_id); sort_return(-1); } else if(b_call_id >= 0 && a_call_id > b_call_id) { crm_debug_4("%s (%d) > %s (%d) : call id", a_xml_id, a_call_id, b_xml_id, b_call_id); sort_return(1); } crm_debug_5("%s (%d) == %s (%d) : continuing", a_xml_id, a_call_id, b_xml_id, b_call_id); /* now process pending ops */ CRM_CHECK(a_key != NULL && b_key != NULL, sort_return(0)); CRM_CHECK(decode_transition_magic( a_key, &a_uuid, &a_id, &dummy, &a_status, &a_rc, &dummy), sort_return(0)); CRM_CHECK(decode_transition_magic( b_key, &b_uuid, &b_id, &dummy, &b_status, &b_rc, &dummy), sort_return(0)); /* try to determine the relative age of the operation... * some pending operations (i.e. a start) may have been superseded * by a subsequent stop * * [a|b]_id == -1 means it's a shutdown operation and _always_ comes last */ if(safe_str_neq(a_uuid, b_uuid) || a_id == b_id) { /* * some of the logic in here may be redundant... * * if the UUID from the TE doesn't match then one better * be a pending operation. * pending operations don't survive between elections and joins * because we query the LRM directly */ CRM_CHECK(a_call_id == -1 || b_call_id == -1, crm_err("a: %s=%d, b: %s=%d", crm_str(a_xml_id), a_call_id, crm_str(b_xml_id), b_call_id); sort_return(0)); CRM_CHECK(a_call_id >= 0 || b_call_id >= 0, sort_return(0)); if(b_call_id == -1) { crm_debug_2("%s (%d) < %s (%d) : transition + call id", a_xml_id, a_call_id, b_xml_id, b_call_id); sort_return(-1); } if(a_call_id == -1) { crm_debug_2("%s (%d) > %s (%d) : transition + call id", a_xml_id, a_call_id, b_xml_id, b_call_id); sort_return(1); } } else if((a_id >= 0 && a_id < b_id) || b_id == -1) { crm_debug_3("%s (%d) < %s (%d) : transition", a_xml_id, a_id, b_xml_id, b_id); sort_return(-1); } else if((b_id >= 0 && a_id > b_id) || a_id == -1) { crm_debug_3("%s (%d) > %s (%d) : transition", a_xml_id, a_id, b_xml_id, b_id); sort_return(1); } /* we should never end up here */ crm_err("%s (%d:%d:%s) ??
%s (%d:%d:%s) : default", a_xml_id, a_call_id, a_id, a_uuid, b_xml_id, b_call_id, b_id, b_uuid); CRM_CHECK(FALSE, sort_return(0)); } time_t get_timet_now(pe_working_set_t *data_set) { time_t now = 0; if(data_set && data_set->now) { now = data_set->now->tm_now; } if(now == 0) { /* eventually we should convert data_set->now into time_tm * for now, its only triggered by PE regression tests */ now = time(NULL); crm_crit("Defaulting to 'now'"); if(data_set && data_set->now) { data_set->now->tm_now = now; } } return now; } int get_failcount(node_t *node, resource_t *rsc, int *last_failure, pe_working_set_t *data_set) { int last = 0; int fail_count = 0; resource_t *failed = rsc; char *fail_attr = crm_concat("fail-count", rsc->id, '-'); const char *value = g_hash_table_lookup(node->details->attrs, fail_attr); if(is_not_set(rsc->flags, pe_rsc_unique)) { failed = uber_parent(rsc); } if(value != NULL) { fail_count = char2score(value); crm_info("%s has failed %d times on %s", rsc->id, fail_count, node->details->uname); } crm_free(fail_attr); fail_attr = crm_concat("last-failure", rsc->id, '-'); value = g_hash_table_lookup(node->details->attrs, fail_attr); if(value != NULL && rsc->failure_timeout) { last = crm_parse_int(value, NULL); if(last_failure) { *last_failure = last; } if(last > 0) { time_t now = get_timet_now(data_set); if(now > (last + rsc->failure_timeout)) { crm_notice("Failcount for %s on %s has expired (limit was %ds)", failed->id, node->details->uname, rsc->failure_timeout); fail_count = 0; } } } crm_free(fail_attr); return fail_count; } diff --git a/tools/pingd.c b/tools/pingd.c index afea003a61..406dfdc828 100644 --- a/tools/pingd.c +++ b/tools/pingd.c @@ -1,925 +1,1184 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. 
* * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_SYS_SOCKET_H # include #endif #include #include #include #include #include #include #include #include #include #include +#ifdef ON_LINUX +#include +#include +# ifndef ICMP_FILTER +# define ICMP_FILTER 1 +struct icmp_filter { + uint32_t data; +}; +# endif +#endif + #ifdef HAVE_GETOPT_H # include #endif #include #include #ifdef HAVE_GETOPT_H # include #endif #if SUPPORT_HEARTBEAT # include ll_cluster_t *pingd_cluster = NULL; void do_node_walk(ll_cluster_t *hb_cluster); #endif /* GMainLoop *mainloop = NULL; */ -#define OPTARGS "V?p:a:d:s:S:h:Dm:N:" +#define OPTARGS "V?p:a:d:s:S:h:Dm:N:U" GListPtr ping_list = NULL; IPC_Channel *attrd = NULL; GMainLoop* mainloop = NULL; GHashTable *ping_nodes = NULL; const char *pingd_attr = "pingd"; gboolean do_filter = FALSE; gboolean need_shutdown = FALSE; gboolean stand_alone = FALSE; +gboolean do_updates = TRUE; const char *attr_set = NULL; const char *attr_section = NULL; const char *attr_dampen = NULL; int attr_multiplier = 1; int pings_per_host = 5; int ping_timeout = 5; int re_ping_interval = 10; -void pingd_nstatus_callback( - const char *node, const char *status, void *private_data); -void pingd_lstatus_callback( - const char *node, const char *link, const char *status, - void *private_data); -void send_update(int active); - int ident; /* our pid */ typedef struct ping_node_s { int fd; /* ping socket */ int iseq; /* sequence number */ gboolean type; + gboolean extra_filters; union { struct sockaddr raw; struct sockaddr_in v4; /* ipv4 ping addr */ struct sockaddr_in6 v6; /* ipv6 ping addr */ } addr; char dest[256]; char *host; } ping_node; +void pingd_nstatus_callback( + const char *node, const char *status, void *private_data); +void pingd_lstatus_callback( + const char *node, const char *link, const char *status, + void *private_data); +void send_update(int active); +int process_icmp_result(ping_node *node, struct sockaddr_in *whereto); + /* * in_cksum -- * Checksum routine for Internet Protocol family headers (C Version) * This function taken from Mike Muuss' ping program. */ static int in_cksum (u_short *addr, size_t len) { size_t nleft = len; u_short * w = addr; int sum = 0; u_short answer = 0; /* * The IP checksum algorithm is simple: using a 32 bit accumulator (sum) * add sequential 16 bit words to it, and at the end, folding back all * the carry bits from the top 16 bits into the lower 16 bits. 
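* Worked example with two hypothetical data words: 0xffff + 0x0003
* sums to 0x10002, overflowing 16 bits; folding gives
* (0x10002 >> 16) + (0x10002 & 0xffff) = 0x0003, and the one's
* complement truncated to 16 bits is 0xfffc, the checksum that is
* sent on the wire.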
*/ while (nleft > 1) { sum += *w++; nleft -= 2; } /* Mop up an odd byte, if necessary */ if (nleft == 1) { sum += *(u_char*)w; } /* Add back carry bits from top 16 bits to low 16 bits */ sum = (sum >> 16) + (sum & 0xffff); /* add hi 16 to low 16 */ sum += (sum >> 16); /* add carry */ answer = ~sum; /* truncate to 16 bits */ return answer; } +static const char *ping_desc(uint8_t type, uint8_t code) +{ + switch(type) { + case ICMP_ECHOREPLY: + return "Echo Reply"; + case ICMP_ECHO: + return "Echo Request"; + case ICMP_PARAMPROB: + return "Bad Parameter"; + case ICMP_SOURCEQUENCH: + return "Packet lost, slow down"; + case ICMP_TSTAMP: + return "Timestamp Request"; + case ICMP_TSTAMPREPLY: + return "Timestamp Reply"; + case ICMP_IREQ: + return "Information Request"; + case ICMP_IREQREPLY: + return "Information Reply"; + + case ICMP_UNREACH: + switch(code) { + case ICMP_UNREACH_NET: + return "Unreachable Network"; + case ICMP_UNREACH_HOST: + return "Unreachable Host"; + case ICMP_UNREACH_PROTOCOL: + return "Unreachable Protocol"; + case ICMP_UNREACH_PORT: + return "Unreachable Port"; + case ICMP_UNREACH_NEEDFRAG: + return "Unreachable: Fragmentation needed"; + case ICMP_UNREACH_SRCFAIL: + return "Unreachable Source Route"; + case ICMP_UNREACH_NET_UNKNOWN: + return "Unknown Network"; + case ICMP_UNREACH_HOST_UNKNOWN: + return "Unknown Host"; + case ICMP_UNREACH_ISOLATED: + return "Unreachable: Isolated"; + case ICMP_UNREACH_NET_PROHIB: + return "Prohibited network"; + case ICMP_UNREACH_HOST_PROHIB: + return "Prohibited host"; + case ICMP_UNREACH_FILTER_PROHIB: + return "Unreachable: Prohibited filter"; + case ICMP_UNREACH_TOSNET: + return "Unreachable: Type of Service and Network"; + case ICMP_UNREACH_TOSHOST: + return "Unreachable: Type of Service and Host"; + case ICMP_UNREACH_HOST_PRECEDENCE: + return "Unreachable: Precedence violation"; + case ICMP_UNREACH_PRECEDENCE_CUTOFF: + return "Unreachable: Precedence cutoff"; + default: + crm_err("Unreachable: Unknown subtype: %d", code); + return "Unreachable: Unknown Subtype"; + } + break; + + case ICMP_REDIRECT: + switch(code) { + case ICMP_REDIRECT_NET: + return "Redirect: Network"; + case ICMP_REDIRECT_HOST: + return "Redirect: Host"; + case ICMP_REDIRECT_TOSNET: + return "Redirect: Type of Service and Network"; + case ICMP_REDIRECT_TOSHOST: + return "Redirect: Type of Service and Host"; + default: + crm_err("Redirect: Unknown subtype: %d", code); + return "Redirect: Unknown Subtype"; + } + + case ICMP_TIMXCEED: + switch(code) { + case ICMP_TIMXCEED_INTRANS: + return "Timeout: TTL"; + case ICMP_TIMXCEED_REASS: + return "Timeout: Fragmentation reassembly"; + default: + crm_err("Timeout: Unknown subtype: %d", code); + return "Timeout: Unknown Subtype"; + } + break; + + default: + crm_err("Unknown type: %d", type); + return "Unknown type"; + } +} + +#ifdef ON_LINUX +# define MAX_HOST 1024 +int process_icmp_result(ping_node *node, struct sockaddr_in *whereto) +{ + int rc = 0; + char buf[512]; + struct iovec iov; + struct msghdr msg; + struct icmphdr icmph; + struct sockaddr_in target; + struct cmsghdr *cmsg = NULL; + struct sock_extended_err *s_err = NULL; + + iov.iov_base = &icmph; + iov.iov_len = sizeof(icmph); + msg.msg_name = (void*)&target; + msg.msg_namelen = sizeof(target); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + msg.msg_flags = 0; + msg.msg_control = buf; + msg.msg_controllen = sizeof(buf); + + /* drain one message from the socket's error queue without blocking */ + rc = recvmsg(node->fd, &msg, MSG_ERRQUEUE|MSG_DONTWAIT); + if (rc < 0 || rc < sizeof(icmph)) { + crm_debug("No error message"); + return -1; + } + + for (cmsg =
CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) { + if (cmsg->cmsg_level == SOL_IP && cmsg->cmsg_type == IP_RECVERR) { + s_err = (struct sock_extended_err *)CMSG_DATA(cmsg); + } + } + + CRM_ASSERT(s_err != NULL); + + if (s_err->ee_origin == SO_EE_ORIGIN_LOCAL) { + if (s_err->ee_errno == EMSGSIZE) { + crm_info("local error: Message too long, mtu=%u", s_err->ee_info); + } else { + crm_info("local error: %s", strerror(s_err->ee_errno)); + } + return 0; + + } else if (s_err->ee_origin == SO_EE_ORIGIN_ICMP) { + char ping_host[MAX_HOST]; + struct sockaddr_in *sin = (struct sockaddr_in*)(s_err+1); + + const char *ping_result = ping_desc(s_err->ee_type, s_err->ee_code); + + if (ntohs(icmph.un.echo.id) != ident) { + /* Result was not for us */ + crm_info("Not our error (ident): %d %d", ntohs(icmph.un.echo.id), ident); + return -1; + } else if (icmph.type != ICMP_ECHO) { + /* Not an error */ + crm_info("Not an error"); + return -1; + + } else { + char *target_s = inet_ntoa(*(struct in_addr *)&(target.sin_addr.s_addr)); + char *whereto_s = inet_ntoa(*(struct in_addr *)&(whereto->sin_addr.s_addr)); + if (safe_str_neq(target_s, whereto_s)) { + /* Result was not for us */ + crm_info("Not our error (addr): %s %s", target_s, whereto_s); + return -1; + } + } + + /* snprintf(ping_host, MAX_HOST, "%s", inet_ntoa(*(struct in_addr *)&(sin->sin_addr.s_addr))); */ + snprintf(ping_host, MAX_HOST, "%s", inet_ntoa(sin->sin_addr)); + + if (node->extra_filters == FALSE) { + /* Now that we got some sort of reply, add extra filters to + * ensure we keep getting the _right_ replies for dead hosts + */ + struct icmp_filter filt; + crm_debug("Installing additional ICMP filters"); + node->extra_filters = TRUE; /* only try once */ + + filt.data = ~((1<<ICMP_ECHOREPLY)); /* discard everything except echo replies */ + if (setsockopt(node->fd, SOL_RAW, ICMP_FILTER, (char*)&filt, sizeof(filt)) == -1) { + crm_perror(LOG_WARNING, "setsockopt failed: Cannot install ICMP filters for %s", ping_host); + } + } + + crm_info("From %s icmp_seq=%u %s", ping_host, ntohs(icmph.un.echo.sequence), ping_result); + + } else { + crm_debug("Unexpected error origin: %d", s_err->ee_origin); + } + + return 0; +} +#else +int process_icmp_result(ping_node *node, struct sockaddr_in *whereto) +{ + /* dummy function */ + return 0; +} +#endif + static ping_node *ping_new(const char *host) { ping_node *node; crm_malloc0(node, sizeof(ping_node)); if(strstr(host, ":")) { node->type = AF_INET6; } else { node->type = AF_INET; } node->host = crm_strdup(host); return node; } static gboolean ping_open(ping_node *node) { int ret_ga; char *hostname; struct addrinfo *res; struct addrinfo hints; /* getaddrinfo */ bzero(&hints, sizeof(struct addrinfo)); hints.ai_flags = AI_CANONNAME; hints.ai_family = node->type; hints.ai_socktype = SOCK_RAW; if(node->type == AF_INET6) { hints.ai_protocol = IPPROTO_ICMPV6; } else { hints.ai_protocol = IPPROTO_ICMP; } ret_ga = getaddrinfo(node->host, NULL, &hints, &res); if (ret_ga) { - crm_err("getaddrinfo: %s", gai_strerror(ret_ga)); - return -1; + crm_warn("getaddrinfo: %s", gai_strerror(ret_ga)); + return FALSE; } if (res->ai_canonname) hostname = res->ai_canonname; else hostname = node->host; crm_debug("Got address %s for %s", hostname, node->host); - if (!res->ai_addr) { - fprintf(stderr, "getaddrinfo failed"); - exit(1); + if(!res->ai_addr) { + crm_warn("getaddrinfo failed: no address"); + return FALSE; } memcpy(&(node->addr.raw), res->ai_addr, res->ai_addrlen); node->fd = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol); /* node->fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol); */ if(node->fd
< 0) { - cl_perror("Can't open socket to %s", hostname); + crm_perror(LOG_WARNING, "Can't open socket to %s", hostname); return FALSE; } if(node->type == AF_INET6) { int sockopt; inet_ntop(node->type, &node->addr.v6.sin6_addr, node->dest, sizeof(node->dest)); /* set recv buf for broadcast pings */ sockopt = 48 * 1024; setsockopt(node->fd, SOL_SOCKET, SO_RCVBUF, (char *) &sockopt, sizeof(sockopt)); } else { inet_ntop(node->type, &node->addr.v4.sin_addr, node->dest, sizeof(node->dest)); } if(ping_timeout > 0) { struct timeval timeout_opt; timeout_opt.tv_sec = ping_timeout; timeout_opt.tv_usec = 0; setsockopt(node->fd, SOL_SOCKET, SO_RCVTIMEO, (char *) &timeout_opt, sizeof(timeout_opt)); } - + +#ifdef ON_LINUX + { + int dummy = 1; + struct icmp_filter filt; + filt.data = ~((1<fd, SOL_RAW, ICMP_FILTER, (char*)&filt, sizeof(filt)) == -1) { + crm_perror(LOG_WARNING, "setsockopt failed: Cannot install ICMP filters for %s", node->dest); + } + + setsockopt(node->fd, SOL_IP, IP_RECVERR, (char *)&dummy, sizeof(dummy)); + } +#endif crm_debug("Opened connection to %s", node->dest); return TRUE; } static gboolean ping_close(ping_node *node) { int tmp_fd = node->fd; node->fd = -1; if (tmp_fd >= 0) { if(close(tmp_fd) < 0) { cl_perror("Could not close ping socket"); } else { tmp_fd = -1; crm_debug("Closed connection to %s", node->dest); } } return (tmp_fd == -1); } #define MAXPACKETLEN 131072 #define ICMP6ECHOLEN 8 /* icmp echo header len excluding time */ #define ICMP6ECHOTMLEN 20 #define DEFDATALEN ICMP6ECHOTMLEN #define EXTRA 256 /* for AH and various other headers. weird. */ #define IP6LEN 40 -static gboolean +static int dump_v6_echo(ping_node *node, u_char *buf, int bytes, struct msghdr *hdr) { int fromlen; char dest[1024]; struct icmp6_hdr *icp; struct sockaddr *from; if (!hdr || !hdr->msg_name || hdr->msg_namelen != sizeof(struct sockaddr_in6) || ((struct sockaddr *)hdr->msg_name)->sa_family != AF_INET6) { crm_warn("Invalid echo peer"); - return FALSE; + return -1; } fromlen = hdr->msg_namelen; from = (struct sockaddr *)hdr->msg_name; getnameinfo(from, fromlen, dest, sizeof(dest), NULL, 0, NI_NUMERICHOST | NI_NUMERICSERV); if (bytes < (int)sizeof(struct icmp6_hdr)) { crm_warn("Invalid echo packet (too short: %d bytes) from %s", bytes, dest); - return FALSE; + return -1; } icp = (struct icmp6_hdr *)buf; if (icp->icmp6_type == ICMP6_ECHO_REPLY && ntohs(icp->icmp6_id) == ident) { u_int16_t seq = ntohs(icp->icmp6_seq); crm_debug("%d bytes from %s, icmp_seq=%u: %s", bytes, dest, seq, (char*)(buf + ICMP6ECHOLEN)); - return TRUE; + return 1; } crm_warn("Bad echo (%d): type=%d, code=%d, seq=%d, id=%d, check=%d", ICMP6_ECHO_REPLY, icp->icmp6_type, icp->icmp6_code, ntohs(icp->icmp6_seq), icp->icmp6_id, icp->icmp6_cksum); - return FALSE; + return 0; } -static gboolean +static int dump_v4_echo(ping_node *node, u_char *buf, int bytes, struct msghdr *hdr) { int iplen, fromlen; - char dest[1024]; + char from_host[1024]; struct ip *ip; struct icmp *icp; struct sockaddr *from; - if (hdr == NULL || !hdr->msg_name || hdr->msg_namelen != sizeof(struct sockaddr_in) + if (hdr == NULL + || !hdr->msg_name + || hdr->msg_namelen != sizeof(struct sockaddr_in) || ((struct sockaddr *)hdr->msg_name)->sa_family != AF_INET) { crm_warn("Invalid echo peer"); - return FALSE; + return -1; } fromlen = hdr->msg_namelen; from = (struct sockaddr *)hdr->msg_name; - getnameinfo(from, fromlen, dest, sizeof(dest), NULL, 0, NI_NUMERICHOST | NI_NUMERICSERV); + getnameinfo(from, fromlen, from_host, sizeof(from_host), NULL, 0, NI_NUMERICHOST | 
NI_NUMERICSERV); ip = (struct ip*)buf; iplen = ip->ip_hl * 4; if (bytes < (iplen + sizeof(struct icmp))) { - crm_warn("Invalid echo packet (too short: %d bytes) from %s", bytes, dest); - return FALSE; + crm_warn("Invalid echo packet (too short: %d bytes) from %s", bytes, from_host); + return -1; } /* Check the IP header */ icp = (struct icmp*)(buf + iplen); - if (icp->icmp_type == ICMP_ECHOREPLY && ntohs(icp->icmp_id) == ident) { - crm_debug("%d bytes from %s, icmp_seq=%u: %s", - bytes, dest, ntohs(icp->icmp_seq), icp->icmp_data); - return TRUE; + if(icp->icmp_type == ICMP_ECHO) { + /* Filter out the echo request when pinging the local host */ + return -1; } + + crm_debug("Echo from %s (exp=%d, seq=%d, id=%d, dest=%s, data=%s): %s", + from_host, node->iseq, ntohs(icp->icmp_seq), + ntohs(icp->icmp_id), node->dest, icp->icmp_data, + ping_desc(icp->icmp_type, icp->icmp_code)); - crm_warn("Bad echo (%d): type=%d, code=%d, seq=%d, id=%d, check=%d", - ICMP_ECHOREPLY, icp->icmp_type, - icp->icmp_code, ntohs(icp->icmp_seq), icp->icmp_id, icp->icmp_cksum); + if (icp->icmp_type == ICMP_ECHOREPLY + && node->iseq == ntohs(icp->icmp_seq)) { + crm_debug("%d bytes from %s, icmp_seq=%u: %s", + bytes, from_host, ntohs(icp->icmp_seq), icp->icmp_data); + return 1; + } - return FALSE; + return process_icmp_result(node, (struct sockaddr_in*)from); } static int ping_read(ping_node *node, int *lenp) { int bytes; - int fromlen; + char fromaddr[128]; struct msghdr m; struct cmsghdr *cm; u_char buf[1024]; struct iovec iov[2]; int packlen; u_char *packet; packlen = DEFDATALEN + IP6LEN + ICMP6ECHOLEN + EXTRA; crm_malloc0(packet, packlen); - if(node->type == AF_INET6) { - fromlen = sizeof(struct sockaddr_in6); - } else { - fromlen = sizeof(struct sockaddr_in); - } - - m.msg_name = (caddr_t)&node->addr; - m.msg_namelen = fromlen; + + retry: + m.msg_name = &fromaddr; + m.msg_namelen = sizeof(fromaddr); memset(&iov, 0, sizeof(iov)); iov[0].iov_base = (caddr_t)packet; iov[0].iov_len = packlen; m.msg_iov = iov; m.msg_iovlen = 1; cm = (struct cmsghdr *)buf; m.msg_control = (caddr_t)buf; m.msg_controllen = sizeof(buf); - crm_debug_2("reading..."); + bytes = recvmsg(node->fd, &m, 0); crm_debug_2("Got %d bytes", bytes); if (bytes > 0) { + int rc = 0; if(node->type == AF_INET6) { - return dump_v6_echo(node, packet, bytes, &m); + rc = dump_v6_echo(node, packet, bytes, &m); } else { - return dump_v4_echo(node, packet, bytes, &m); + rc = dump_v4_echo(node, packet, bytes, &m); } + + if(rc < 0) { + crm_info("Retrying..."); + goto retry; + + } else if(rc > 0) { + return TRUE; + + } else { + return FALSE; + } } else if(bytes < 0) { - cl_perror("recvmsg failed"); + process_icmp_result(node, (struct sockaddr_in*)&fromaddr); } else { crm_err("Unexpected reply"); } return FALSE; } static int ping_write(ping_node *node, const char *data, size_t size) { struct iovec iov[2]; int rc, bytes, namelen; - static int ntransmitted = 5; + /* static int ntransmitted = 9; */ struct msghdr smsghdr; u_char outpack[MAXPACKETLEN]; - node->iseq = ntransmitted++; + node->iseq++; if(node->type == AF_INET6) { struct icmp6_hdr *icp; namelen = sizeof(struct sockaddr_in6); bytes = ICMP6ECHOLEN + DEFDATALEN; icp = (struct icmp6_hdr *)outpack; memset(icp, 0, sizeof(*icp)); icp->icmp6_code = 0; icp->icmp6_cksum = 0; icp->icmp6_type = ICMP6_ECHO_REQUEST; icp->icmp6_id = htons(ident); icp->icmp6_seq = ntohs(node->iseq); - memcpy(&outpack[ICMP6ECHOLEN], "beekhof-v6", 10); + memcpy(&outpack[ICMP6ECHOLEN], "pingd-v6", 8); } else { struct icmp *icp; namelen = 
sizeof(struct sockaddr_in); bytes = sizeof(struct icmp) + 11; icp = (struct icmp *)outpack; memset(icp, 0, sizeof(*icp)); icp->icmp_code = 0; icp->icmp_cksum = 0; icp->icmp_type = ICMP_ECHO; icp->icmp_id = htons(ident); icp->icmp_seq = ntohs(node->iseq); - memcpy(icp->icmp_data, "beekhof-v4", 10); + memcpy(icp->icmp_data, "pingd-v4", 8); icp->icmp_cksum = in_cksum((u_short *)icp, bytes); } memset(&smsghdr, 0, sizeof(smsghdr)); smsghdr.msg_name = (caddr_t)&(node->addr); smsghdr.msg_namelen = namelen; memset(&iov, 0, sizeof(iov)); iov[0].iov_base = (caddr_t)outpack; iov[0].iov_len = bytes; smsghdr.msg_iov = iov; smsghdr.msg_iovlen = 1; rc = sendmsg(node->fd, &smsghdr, 0); if (rc < 0 || rc != bytes) { - cl_perror("Wrote %d of %d chars", rc, bytes); + crm_perror(LOG_WARNING, "Wrote %d of %d chars", rc, bytes); } else { crm_debug("Sent %d bytes to %s", rc, node->dest); } return(0); } static gboolean pingd_shutdown(int nsig, gpointer unused) { need_shutdown = TRUE; send_update(-1); crm_info("Exiting"); if (mainloop != NULL && g_main_is_running(mainloop)) { g_main_quit(mainloop); } else { exit(0); } return FALSE; } static void usage(const char *cmd, int exit_status) { FILE *stream; stream = exit_status ? stderr : stdout; fprintf(stream, "usage: %s [-%s]\n", cmd, OPTARGS); fprintf(stream, "\t--%s (-%c) \t\t\tThis text\n", "help", '?'); fprintf(stream, "\t--%s (-%c) \t\tRun in daemon mode\n", "daemonize", 'D'); fprintf(stream, "\t--%s (-%c) \tFile in which to store the process' PID\n" "\t\t\t\t\t* Default=/tmp/pingd.pid\n", "pid-file", 'p'); fprintf(stream, "\t--%s (-%c) \tName of the node attribute to set\n" "\t\t\t\t\t* Default=pingd\n", "attr-name", 'a'); fprintf(stream, "\t--%s (-%c) \tName of the set in which to set the attribute\n" "\t\t\t\t\t* Default=cib-bootstrap-options\n", "attr-set", 's'); fprintf(stream, "\t--%s (-%c) \tWhich part of the CIB to put the attribute in\n" "\t\t\t\t\t* Default=status\n", "attr-section", 'S'); fprintf(stream, "\t--%s (-%c) \tMonitor a subset of the ping nodes listed in ha.cf (can be specified multiple times)\n", "node", 'N'); fprintf(stream, "\t--%s (-%c) \t\tHow long to wait for no further changes to occur before updating the CIB with a changed attribute\n", "attr-dampen", 'd'); fprintf(stream, "\t--%s (-%c) \tFor every connected node, add to the value set in the CIB\n" "\t\t\t\t\t\t* Default=1\n", "value-multiplier", 'm'); + fprintf(stream, "\t--%s (-%c) \t\tLog the attribute update instead of sending it to attrd\n", "no-updates", 'U'); fflush(stream); exit(exit_status); } #if SUPPORT_HEARTBEAT static gboolean pingd_ha_dispatch(IPC_Channel *channel, gpointer user_data) { gboolean stay_connected = TRUE; crm_debug_2("Invoked"); while(pingd_cluster != NULL && IPC_ISRCONN(channel)) { if(pingd_cluster->llc_ops->msgready(pingd_cluster) == 0) { crm_debug_2("no message ready yet"); break; } /* invoke the callbacks but don't block */ pingd_cluster->llc_ops->rcvmsg(pingd_cluster, 0); } if (pingd_cluster == NULL || channel->ch_status != IPC_CONNECT) { if(need_shutdown == FALSE) { crm_crit("Lost connection to heartbeat service."); } else { crm_info("Lost connection to heartbeat service."); } stay_connected = FALSE; } return stay_connected; } static void pingd_ha_connection_destroy(gpointer user_data) { crm_debug_3("Invoked"); if(need_shutdown) { /* we signed out, so this is expected */ crm_info("Heartbeat disconnection complete"); return; } crm_crit("Lost connection to heartbeat service!"); } static gboolean register_with_ha(void) { if(pingd_cluster == NULL) { pingd_cluster = ll_cluster_new("heartbeat"); } if(pingd_cluster == NULL) { crm_err("Cannot create heartbeat object"); return
FALSE; } crm_debug("Signing in with Heartbeat"); if (pingd_cluster->llc_ops->signon( pingd_cluster, crm_system_name) != HA_OK) { crm_err("Cannot sign on with heartbeat: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); crm_err("REASON: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); return FALSE; } do_node_walk(pingd_cluster); crm_debug_3("Be informed of Node Status changes"); if (HA_OK != pingd_cluster->llc_ops->set_nstatus_callback( pingd_cluster, pingd_nstatus_callback, NULL)) { crm_err("Cannot set nstatus callback: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); crm_err("REASON: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); return FALSE; } if (pingd_cluster->llc_ops->set_ifstatus_callback( pingd_cluster, pingd_lstatus_callback, NULL) != HA_OK) { cl_log(LOG_ERR, "Cannot set if status callback"); crm_err("REASON: %s", pingd_cluster->llc_ops->errmsg(pingd_cluster)); return FALSE; } crm_debug_3("Adding channel to mainloop"); G_main_add_IPC_Channel( G_PRIORITY_HIGH, pingd_cluster->llc_ops->ipcchan( pingd_cluster), FALSE, pingd_ha_dispatch, pingd_cluster, pingd_ha_connection_destroy); return TRUE; } void do_node_walk(ll_cluster_t *hb_cluster) { const char *ha_node = NULL; /* Async get client status information in the cluster */ crm_debug_2("Invoked"); crm_debug_3("Requesting an initial dump of CRMD client_status"); hb_cluster->llc_ops->client_status( hb_cluster, NULL, CRM_SYSTEM_CRMD, -1); crm_info("Requesting the list of configured nodes"); hb_cluster->llc_ops->init_nodewalk(hb_cluster); do { const char *ha_node_type = NULL; const char *ha_node_status = NULL; ha_node = hb_cluster->llc_ops->nextnode(hb_cluster); if(ha_node == NULL) { continue; } ha_node_type = hb_cluster->llc_ops->node_type( hb_cluster, ha_node); if(safe_str_neq("ping", ha_node_type)) { crm_debug("Node %s: skipping '%s'", ha_node, ha_node_type); continue; } if(do_filter && g_hash_table_lookup(ping_nodes, ha_node) == NULL) { crm_debug("Filtering: %s", ha_node); continue; } ha_node_status = hb_cluster->llc_ops->node_status( hb_cluster, ha_node); crm_debug("Adding: %s=%s", ha_node, ha_node_status); g_hash_table_replace(ping_nodes, crm_strdup(ha_node), crm_strdup(ha_node_status)); } while(ha_node != NULL); hb_cluster->llc_ops->end_nodewalk(hb_cluster); crm_debug_2("Complete"); send_update(-1); } #endif static gboolean stand_alone_ping(gpointer data) { int len = 0; int num_active = 0; crm_debug("Checking connectivity"); slist_iter( ping, ping_node, ping_list, num, - int lpc = 0; - int alive = 0; - - ping_open(ping); - for(;lpc < pings_per_host; lpc++) { - ping_write(ping, "test", 4); - if(ping_read(ping, &len)) { - alive++; - } - sleep(1); - } - if(alive) { - crm_info("Node %s is alive (%d)", ping->host, alive); - num_active++; + if(ping_open(ping)) { + + int lpc = 0; + for(;lpc < pings_per_host; lpc++) { + ping_write(ping, "test", 4); + if(ping_read(ping, &len)) { + crm_info("Node %s is alive", ping->host); + num_active++; + break; + } + sleep(1); + } } ping_close(ping); ); send_update(num_active); CRM_ASSERT(Gmain_timeout_add(re_ping_interval*1000, stand_alone_ping, NULL) > 0); return FALSE; } int main(int argc, char **argv) { int lpc; int argerr = 0; int flag; char *pid_file = NULL; gboolean daemonize = FALSE; ping_node *p = NULL; #ifdef HAVE_GETOPT_H int option_index = 0; static struct option long_options[] = { /* Top-level Options */ {"verbose", 0, 0, 'V'}, {"help", 0, 0, '?'}, {"pid-file", 1, 0, 'p'}, {"node", 1, 0, 'N'}, {"ping-host", 1, 0, 'h'}, /* legacy */ {"attr-name", 1, 0, 'a'}, {"attr-set", 1, 0, 
's'}, {"daemonize", 0, 0, 'D'}, {"attr-section", 1, 0, 'S'}, {"attr-dampen", 1, 0, 'd'}, {"value-multiplier", 1, 0, 'm'}, + {"no-updates", 0, 0, 'U'}, {0, 0, 0, 0} }; #endif pid_file = crm_strdup("/tmp/pingd.pid"); G_main_add_SignalHandler( G_PRIORITY_HIGH, SIGTERM, pingd_shutdown, NULL, NULL); ping_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); crm_log_init(basename(argv[0]), LOG_INFO, TRUE, FALSE, argc, argv); while (1) { #ifdef HAVE_GETOPT_H flag = getopt_long(argc, argv, OPTARGS, long_options, &option_index); #else flag = getopt(argc, argv, OPTARGS); #endif if (flag == -1) break; switch(flag) { case 'V': cl_log_enable_stderr(TRUE); alter_debug(DEBUG_INC); break; case 'p': pid_file = crm_strdup(optarg); break; case 'a': pingd_attr = crm_strdup(optarg); break; case 'N': case 'h': stand_alone = TRUE; crm_debug("Adding ping host %s", optarg); p = ping_new(crm_strdup(optarg)); ping_list = g_list_append(ping_list, p); break; case 's': attr_set = crm_strdup(optarg); break; case 'm': attr_multiplier = crm_parse_int(optarg, "1"); break; case 'S': attr_section = crm_strdup(optarg); break; case 'd': attr_dampen = crm_strdup(optarg); break; case 'n': pings_per_host = crm_atoi(optarg, NULL); break; case 't': ping_timeout = crm_atoi(optarg, NULL); break; case 'i': re_ping_interval = crm_atoi(optarg, NULL); break; case 'D': daemonize = TRUE; break; + case 'U': + do_updates = FALSE; + break; case '?': usage(crm_system_name, LSB_EXIT_GENERIC); break; default: printf("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); crm_err("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag); ++argerr; break; } } if (optind < argc) { crm_err("non-option ARGV-elements: "); printf("non-option ARGV-elements: "); while (optind < argc) { crm_err("%s ", argv[optind++]); printf("%s ", argv[optind++]); } printf("\n"); } if (argerr) { usage(crm_system_name, LSB_EXIT_GENERIC); } crm_make_daemon(crm_system_name, daemonize, pid_file); + ident = getpid(); + if(do_updates == FALSE) { + goto start_ping; + } + for(lpc = 0; attrd == NULL && lpc < 30; lpc++) { crm_debug("attrd registration attempt: %d", lpc); sleep(5); attrd = init_client_ipc_comms_nodispatch(T_ATTRD); } if(attrd == NULL) { crm_err("attrd registration failed"); cl_flush_logs(); exit(LSB_EXIT_GENERIC); } #if SUPPORT_AIS if(is_openais_cluster()) { stand_alone = TRUE; } #endif #if SUPPORT_HEARTBEAT if(stand_alone == FALSE && register_with_ha() == FALSE) { crm_err("HA registration failed"); cl_flush_logs(); exit(LSB_EXIT_GENERIC); } #endif + start_ping: if(stand_alone && ping_list == NULL) { crm_err("You must specify a list of hosts to monitor"); exit(LSB_EXIT_GENERIC); - } else if(stand_alone) { - CRM_ASSERT(Gmain_timeout_add(re_ping_interval*1000, stand_alone_ping, NULL) > 0); } crm_info("Starting %s", crm_system_name); mainloop = g_main_new(FALSE); + + if(stand_alone) { + stand_alone_ping(NULL); + } + g_main_run(mainloop); crm_info("Exiting %s", crm_system_name); return 0; } static void count_ping_nodes(gpointer key, gpointer value, gpointer user_data) { int *num_active = user_data; CRM_CHECK(num_active != NULL, return); if(need_shutdown) { return; } if(safe_str_eq(value, "ping")) { (*num_active)++; } else if(safe_str_eq(value, "up")) { (*num_active)++; } } void send_update(int num_active) { xmlNode *update = create_xml_node(NULL, __FUNCTION__); crm_xml_add(update, F_TYPE, T_ATTRD); crm_xml_add(update, F_ORIG, crm_system_name); crm_xml_add(update, F_ATTRD_TASK, "update"); crm_xml_add(update, 
F_ATTRD_ATTRIBUTE, pingd_attr); if(num_active < 0) { g_hash_table_foreach(ping_nodes, count_ping_nodes, &num_active); } crm_info("%d active ping nodes", num_active); crm_xml_add_int(update, F_ATTRD_VALUE, attr_multiplier*num_active); if(attr_set != NULL) { crm_xml_add(update, F_ATTRD_SET, attr_set); } if(attr_section != NULL) { crm_xml_add(update, F_ATTRD_SECTION, attr_section); } if(attr_dampen != NULL) { crm_xml_add(update, F_ATTRD_DAMPEN, attr_dampen); } - if(send_ipc_message(attrd, update) == FALSE) { + if(do_updates == FALSE) { + crm_log_xml_info(update, "pingd"); + + } else if(send_ipc_message(attrd, update) == FALSE) { crm_err("Could not send update"); exit(1); } free_xml(update); } void pingd_nstatus_callback( const char *node, const char * status, void* private_data) { crm_notice("Status update: Ping node %s now has status [%s]", node, status); if(g_hash_table_lookup(ping_nodes, node) != NULL) { g_hash_table_replace( ping_nodes, crm_strdup(node), crm_strdup(status)); send_update(-1); } } void pingd_lstatus_callback(const char *node, const char *lnk, const char *status, void *private) { crm_notice("Status update: Ping node %s now has status [%s]", node, status); pingd_nstatus_callback(node, status, private); }
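/* Usage sketch (illustrative only; the host addresses below are hypothetical):
 *
 *   pingd -D -a pingd -m 100 -h 192.168.1.1 -h 192.168.1.2
 *
 * daemonizes, pings both hosts every re_ping_interval seconds, and asks attrd
 * to set the "pingd" node attribute to attr_multiplier * <reachable hosts>
 * (at most 200 here). With the new -U/--no-updates flag, the update is only
 * logged via crm_log_xml_info() rather than sent to attrd, so connectivity
 * scoring can be tested without touching the CIB.
 */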