diff --git a/cib/callbacks.c b/cib/callbacks.c
index 1452ded809..28844b87b5 100644
--- a/cib/callbacks.c
+++ b/cib/callbacks.c
@@ -1,1713 +1,1725 @@
 /*
  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #include <crm_internal.h>
 
 #include <sys/param.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <unistd.h>
 
 #include <stdlib.h>
 #include <errno.h>
 #include <fcntl.h>
 
 #include <crm/crm.h>
 #include <crm/cib.h>
 #include <crm/msg_xml.h>
 #include <crm/cluster/internal.h>
 
 #include <crm/common/xml.h>
 
 #include <cibio.h>
 #include <callbacks.h>
 #include <cibmessages.h>
 #include <notify.h>
 #include "common.h"
 
 static unsigned long cib_local_bcast_num = 0;
 
 typedef struct cib_local_notify_s {
     xmlNode *notify_src;
     char *client_id;
     gboolean from_peer;
     gboolean sync_reply;
 } cib_local_notify_t;
 
 int next_client_id = 0;
 
 #if SUPPORT_PLUGIN
 gboolean legacy_mode = TRUE;
 #else
 gboolean legacy_mode = FALSE;
 #endif
 
 qb_ipcs_service_t *ipcs_ro = NULL;
 qb_ipcs_service_t *ipcs_rw = NULL;
 qb_ipcs_service_t *ipcs_shm = NULL;
 
 gint cib_GCompareFunc(gconstpointer a, gconstpointer b);
 gboolean can_write(int flags);
 void send_cib_replace(const xmlNode * sync_request, const char *host);
 void cib_process_request(xmlNode * request, gboolean privileged, gboolean force_synchronous,
                          gboolean from_peer, crm_client_t * cib_client);
 
 
 int cib_process_command(xmlNode * request, xmlNode ** reply,
                         xmlNode ** cib_diff, gboolean privileged);
 
 gboolean cib_common_callback(qb_ipcs_connection_t * c, void *data, size_t size,
                              gboolean privileged);
 
 static gboolean cib_read_legacy_mode(void)
 {
     static gboolean init = TRUE;
     static gboolean legacy = FALSE;
 
     if(init) {
         init = FALSE;
         legacy = daemon_option_enabled("cib", "legacy");
         if(legacy) {
             crm_notice("Enabled legacy mode");
         }
     }
 
     return legacy;
 }
 
 gboolean cib_legacy_mode(void)
 {
 #if SUPPORT_PLUGIN
     return TRUE;
 #endif
 
     if(cib_read_legacy_mode()) {
         return TRUE;
     }
     return legacy_mode;
 }
 
 
 static int32_t
 cib_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
 {
     crm_trace("Connection %p", c);
     if (cib_shutdown_flag) {
         crm_info("Ignoring new client [%d] during shutdown", crm_ipcs_client_pid(c));
         return -EPERM;
     }
 
     if (crm_client_new(c, uid, gid) == NULL) {
         return -EIO;
     }
     return 0;
 }
 
 static void
 cib_ipc_created(qb_ipcs_connection_t * c)
 {
     crm_trace("Connection %p", c);
 }
 
 static int32_t
 cib_ipc_dispatch_rw(qb_ipcs_connection_t * c, void *data, size_t size)
 {
     crm_client_t *client = crm_client_get(c);
 
     crm_trace("%p message from %s", c, client->id);
     return cib_common_callback(c, data, size, TRUE);
 }
 
 static int32_t
 cib_ipc_dispatch_ro(qb_ipcs_connection_t * c, void *data, size_t size)
 {
     crm_client_t *client = crm_client_get(c);
 
     crm_trace("%p message from %s", c, client->id);
     return cib_common_callback(c, data, size, FALSE);
 }
 
 /* Error code means? */
 static int32_t
 cib_ipc_closed(qb_ipcs_connection_t * c)
 {
     crm_client_t *client = crm_client_get(c);
 
     if (client == NULL) {
         return 0;
     }
     crm_trace("Connection %p", c);
     crm_client_destroy(client);
     return 0;
 }
 
 static void
 cib_ipc_destroy(qb_ipcs_connection_t * c)
 {
     crm_trace("Connection %p", c);
     cib_ipc_closed(c);
     if (cib_shutdown_flag) {
         cib_shutdown(0);
     }
 }
 
 struct qb_ipcs_service_handlers ipc_ro_callbacks = {
     .connection_accept = cib_ipc_accept,
     .connection_created = cib_ipc_created,
     .msg_process = cib_ipc_dispatch_ro,
     .connection_closed = cib_ipc_closed,
     .connection_destroyed = cib_ipc_destroy
 };
 
 struct qb_ipcs_service_handlers ipc_rw_callbacks = {
     .connection_accept = cib_ipc_accept,
     .connection_created = cib_ipc_created,
     .msg_process = cib_ipc_dispatch_rw,
     .connection_closed = cib_ipc_closed,
     .connection_destroyed = cib_ipc_destroy
 };
 
 void
 cib_common_callback_worker(uint32_t id, uint32_t flags, xmlNode * op_request,
                            crm_client_t * cib_client, gboolean privileged)
 {
     const char *op = crm_element_value(op_request, F_CIB_OPERATION);
 
     if (crm_str_eq(op, CRM_OP_REGISTER, TRUE)) {
         if (flags & crm_ipc_client_response) {
             xmlNode *ack = create_xml_node(NULL, __FUNCTION__);
 
             crm_xml_add(ack, F_CIB_OPERATION, CRM_OP_REGISTER);
             crm_xml_add(ack, F_CIB_CLIENTID, cib_client->id);
             crm_ipcs_send(cib_client, id, ack, flags);
             cib_client->request_id = 0;
             free_xml(ack);
         }
         return;
 
     } else if (crm_str_eq(op, T_CIB_NOTIFY, TRUE)) {
         /* Update the notify filters for this client */
         int on_off = 0;
         long long bit = 0;
         const char *type = crm_element_value(op_request, F_CIB_NOTIFY_TYPE);
 
         crm_element_value_int(op_request, F_CIB_NOTIFY_ACTIVATE, &on_off);
 
         crm_debug("Setting %s callbacks for %s (%s): %s",
                   type, cib_client->name, cib_client->id, on_off ? "on" : "off");
 
         if (safe_str_eq(type, T_CIB_POST_NOTIFY)) {
             bit = cib_notify_post;
 
         } else if (safe_str_eq(type, T_CIB_PRE_NOTIFY)) {
             bit = cib_notify_pre;
 
         } else if (safe_str_eq(type, T_CIB_UPDATE_CONFIRM)) {
             bit = cib_notify_confirm;
 
         } else if (safe_str_eq(type, T_CIB_DIFF_NOTIFY)) {
             bit = cib_notify_diff;
 
         } else if (safe_str_eq(type, T_CIB_REPLACE_NOTIFY)) {
             bit = cib_notify_replace;
         }
 
         if (on_off) {
             set_bit(cib_client->options, bit);
         } else {
             clear_bit(cib_client->options, bit);
         }
 
         if (flags & crm_ipc_client_response) {
             /* TODO - include rc */
             crm_ipcs_send_ack(cib_client, id, flags, "ack", __FUNCTION__, __LINE__);
         }
         return;
     }
 
     cib_process_request(op_request, FALSE, privileged, FALSE, cib_client);
 }
 
 int32_t
 cib_common_callback(qb_ipcs_connection_t * c, void *data, size_t size, gboolean privileged)
 {
     uint32_t id = 0;
     uint32_t flags = 0;
     int call_options = 0;
     crm_client_t *cib_client = crm_client_get(c);
     xmlNode *op_request = crm_ipcs_recv(cib_client, data, size, &id, &flags);
 
     if (op_request) {
         crm_element_value_int(op_request, F_CIB_CALLOPTS, &call_options);
     }
 
     if (op_request == NULL) {
         crm_trace("Invalid message from %p", c);
         crm_ipcs_send_ack(cib_client, id, flags, "nack", __FUNCTION__, __LINE__);
         return 0;
 
     } else if(cib_client == NULL) {
         crm_trace("Invalid client %p", c);
         return 0;
     }
 
     if (is_set(call_options, cib_sync_call)) {
         CRM_ASSERT(flags & crm_ipc_client_response);
         CRM_LOG_ASSERT(cib_client->request_id == 0);    /* This means the client has two synchronous events in-flight */
         cib_client->request_id = id;    /* Reply only to the last one */
     }
 
     if (cib_client->name == NULL) {
         const char *value = crm_element_value(op_request, F_CIB_CLIENTNAME);
 
         if (value == NULL) {
             cib_client->name = crm_itoa(cib_client->pid);
         } else {
             cib_client->name = strdup(value);
         }
     }
 
     crm_xml_add(op_request, F_CIB_CLIENTID, cib_client->id);
     crm_xml_add(op_request, F_CIB_CLIENTNAME, cib_client->name);
 
 #if ENABLE_ACL
     CRM_ASSERT(cib_client->user != NULL);
     crm_acl_get_set_user(op_request, F_CIB_USER, cib_client->user);
 #endif
 
     crm_log_xml_trace(op_request, "Client[inbound]");
 
     cib_common_callback_worker(id, flags, op_request, cib_client, privileged);
     free_xml(op_request);
 
     return 0;
 }
 
 static uint64_t ping_seq = 0;
 static char *ping_digest = NULL;
 static bool ping_modified_since = FALSE;
 int sync_our_cib(xmlNode * request, gboolean all);
 
 static gboolean
 cib_digester_cb(gpointer data)
 {
     if (cib_is_master) {
         char buffer[32];
         xmlNode *ping = create_xml_node(NULL, "ping");
 
         ping_seq++;
         free(ping_digest);
         ping_digest = NULL;
         ping_modified_since = FALSE;
         snprintf(buffer, 32, U64T, ping_seq);
         crm_trace("Requesting peer digests (%s)", buffer);
 
         crm_xml_add(ping, F_TYPE, "cib");
         crm_xml_add(ping, F_CIB_OPERATION, CRM_OP_PING);
         crm_xml_add(ping, F_CIB_PING_ID, buffer);
 
         crm_xml_add(ping, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
         send_cluster_message(NULL, crm_msg_cib, ping, TRUE);
 
         free_xml(ping);
     }
     return FALSE;
 }
 
 static void
 process_ping_reply(xmlNode *reply) 
 {
     uint64_t seq = 0;
     const char *host = crm_element_value(reply, F_ORIG);
 
     xmlNode *pong = get_message_xml(reply, F_CIB_CALLDATA);
     const char *seq_s = crm_element_value(pong, F_CIB_PING_ID);
     const char *digest = crm_element_value(pong, XML_ATTR_DIGEST);
 
     if (seq_s) {
         seq = crm_int_helper(seq_s, NULL);
     }
 
     if(digest == NULL) {
         crm_trace("Ignoring ping reply %s from %s with no digest", seq_s, host);
 
     } else if(seq != ping_seq) {
         crm_trace("Ignoring out of sequence ping reply %s from %s", seq_s, host);
 
     } else if(ping_modified_since) {
         crm_trace("Ignoring ping reply %s from %s: cib updated since", seq_s, host);
 
     } else {
         const char *version = crm_element_value(pong, XML_ATTR_CRM_VERSION);
 
         if(ping_digest == NULL) {
             crm_trace("Calculating new digest");
             ping_digest = calculate_xml_versioned_digest(the_cib, FALSE, TRUE, version);
         }
 
         crm_trace("Processing ping reply %s from %s (%s)", seq_s, host, digest);
         if(safe_str_eq(ping_digest, digest) == FALSE) {
             xmlNode *remote_cib = get_message_xml(pong, F_CIB_CALLDATA);
 
             crm_notice("Local CIB %s.%s.%s.%s differs from %s: %s.%s.%s.%s %p",
                        crm_element_value(the_cib, XML_ATTR_GENERATION_ADMIN),
                        crm_element_value(the_cib, XML_ATTR_GENERATION),
                        crm_element_value(the_cib, XML_ATTR_NUMUPDATES),
                        ping_digest, host,
                        remote_cib?crm_element_value(remote_cib, XML_ATTR_GENERATION_ADMIN):"_",
                        remote_cib?crm_element_value(remote_cib, XML_ATTR_GENERATION):"_",
                        remote_cib?crm_element_value(remote_cib, XML_ATTR_NUMUPDATES):"_",
                        digest, remote_cib);
 
             if(remote_cib && remote_cib->children) {
                 /* Additional debug */
                 xml_calculate_changes(the_cib, remote_cib);
                 xml_log_changes(LOG_INFO, __FUNCTION__, remote_cib);
                 crm_trace("End of differences");
             }
 
             free_xml(remote_cib);
             sync_our_cib(reply, FALSE);
         }
     }
 }
 
 static void
 do_local_notify(xmlNode * notify_src, const char *client_id,
                 gboolean sync_reply, gboolean from_peer)
 {
     /* send callback to originating child */
     crm_client_t *client_obj = NULL;
     int local_rc = pcmk_ok;
     int call_id = 0;
 
     crm_element_value_int(notify_src, F_CIB_CALLID, &call_id);
 
     if (client_id != NULL) {
         client_obj = crm_client_get_by_id(client_id);
     }
 
     if (client_obj == NULL) {
         local_rc = -ECONNRESET;
         crm_trace("No client to sent response %d to, F_CIB_CLIENTID not set.", call_id);
 
     } else {
         int rid = 0;
 
         if (sync_reply) {
             if (client_obj->ipcs) {
                 CRM_LOG_ASSERT(client_obj->request_id);
 
                 rid = client_obj->request_id;
                 client_obj->request_id = 0;
 
                 crm_trace("Sending response %d to %s %s",
                           rid, client_obj->name,
                           from_peer ? "(originator of delegated request)" : "");
             } else {
                 crm_trace("Sending response [call %d] to %s %s",
                           call_id, client_obj->name, from_peer ? "(originator of delegated request)" : "");
             }
 
         } else {
             crm_trace("Sending event %d to %s %s",
                       call_id, client_obj->name, from_peer ? "(originator of delegated request)" : "");
         }
 
         switch (client_obj->kind) {
             case CRM_CLIENT_IPC:
                 if (crm_ipcs_send(client_obj, rid, notify_src, sync_reply?crm_ipc_flags_none:crm_ipc_server_event) < 0) {
                     local_rc = -ENOMSG;
                 }
                 break;
 #ifdef HAVE_GNUTLS_GNUTLS_H
             case CRM_CLIENT_TLS:
 #endif
             case CRM_CLIENT_TCP:
                 crm_remote_send(client_obj->remote, notify_src);
                 break;
             default:
                 crm_err("Unknown transport %d for %s", client_obj->kind, client_obj->name);
         }
     }
 
     if (local_rc != pcmk_ok && client_obj != NULL) {
         crm_warn("%sSync reply to %s failed: %s",
                  sync_reply ? "" : "A-",
                  client_obj ? client_obj->name : "<unknown>", pcmk_strerror(local_rc));
     }
 }
 
 static void
 local_notify_destroy_callback(gpointer data)
 {
     cib_local_notify_t *notify = data;
 
     free_xml(notify->notify_src);
     free(notify->client_id);
     free(notify);
 }
 
 static void
 check_local_notify(int bcast_id)
 {
     cib_local_notify_t *notify = NULL;
 
     if (!local_notify_queue) {
         return;
     }
 
     notify = g_hash_table_lookup(local_notify_queue, GINT_TO_POINTER(bcast_id));
 
     if (notify) {
         do_local_notify(notify->notify_src, notify->client_id, notify->sync_reply,
                         notify->from_peer);
         g_hash_table_remove(local_notify_queue, GINT_TO_POINTER(bcast_id));
     }
 }
 
 static void
 queue_local_notify(xmlNode * notify_src, const char *client_id, gboolean sync_reply,
                    gboolean from_peer)
 {
     cib_local_notify_t *notify = calloc(1, sizeof(cib_local_notify_t));
 
     notify->notify_src = notify_src;
     notify->client_id = strdup(client_id);
     notify->sync_reply = sync_reply;
     notify->from_peer = from_peer;
 
     if (!local_notify_queue) {
         local_notify_queue = g_hash_table_new_full(g_direct_hash,
                                                    g_direct_equal, NULL,
                                                    local_notify_destroy_callback);
     }
 
     g_hash_table_insert(local_notify_queue, GINT_TO_POINTER(cib_local_bcast_num), notify);
 }
 
 static void
 parse_local_options_v1(crm_client_t * cib_client, int call_type, int call_options, const char *host,
                     const char *op, gboolean * local_notify, gboolean * needs_reply,
                     gboolean * process, gboolean * needs_forward)
 {
     if (cib_op_modifies(call_type)
         && !(call_options & cib_inhibit_bcast)) {
         /* we need to send an update anyway */
         *needs_reply = TRUE;
     } else {
         *needs_reply = FALSE;
     }
 
     if (host == NULL && (call_options & cib_scope_local)) {
         crm_trace("Processing locally scoped %s op from %s", op, cib_client->name);
         *local_notify = TRUE;
 
     } else if (host == NULL && cib_is_master) {
         crm_trace("Processing master %s op locally from %s", op, cib_client->name);
         *local_notify = TRUE;
 
     } else if (safe_str_eq(host, cib_our_uname)) {
         crm_trace("Processing locally addressed %s op from %s", op, cib_client->name);
         *local_notify = TRUE;
 
     } else if (stand_alone) {
         *needs_forward = FALSE;
         *local_notify = TRUE;
         *process = TRUE;
 
     } else {
         crm_trace("%s op from %s needs to be forwarded to %s",
                   op, cib_client->name, host ? host : "the master instance");
         *needs_forward = TRUE;
         *process = FALSE;
     }
 }
 
 static void
 parse_local_options_v2(crm_client_t * cib_client, int call_type, int call_options, const char *host,
                     const char *op, gboolean * local_notify, gboolean * needs_reply,
                     gboolean * process, gboolean * needs_forward)
 {
     if (cib_op_modifies(call_type)) {
         if(safe_str_eq(op, CIB_OP_MASTER) || safe_str_eq(op, CIB_OP_SLAVE)) {
             /* Always handle these locally */
             *process = TRUE;
             *needs_reply = FALSE;
             *local_notify = TRUE;
             *needs_forward = FALSE;
             return;
 
         } else {
             /* Redirect all other updates via CPG */
             *needs_reply = TRUE;
             *needs_forward = TRUE;
             *process = FALSE;
             crm_trace("%s op from %s needs to be forwarded to %s",
                       op, cib_client->name, host ? host : "the master instance");
             return;
         }
     }
 
 
     *process = TRUE;
     *needs_reply = FALSE;
     *local_notify = TRUE;
     *needs_forward = FALSE;
 
     if (stand_alone) {
         crm_trace("Processing %s op from %s (stand-alone)", op, cib_client->name);
 
     } else if (host == NULL) {
         crm_trace("Processing unaddressed %s op from %s", op, cib_client->name);
 
     } else if (safe_str_eq(host, cib_our_uname)) {
         crm_trace("Processing locally addressed %s op from %s", op, cib_client->name);
 
     } else {
         crm_trace("%s op from %s needs to be forwarded to %s", op, cib_client->name, host);
         *needs_forward = TRUE;
         *process = FALSE;
     }
 }
 
 static void
 parse_local_options(crm_client_t * cib_client, int call_type, int call_options, const char *host,
                     const char *op, gboolean * local_notify, gboolean * needs_reply,
                     gboolean * process, gboolean * needs_forward)
 {
     if(cib_legacy_mode()) {
         parse_local_options_v1(cib_client, call_type, call_options, host,
                                op, local_notify, needs_reply, process, needs_forward);
     } else {
         parse_local_options_v2(cib_client, call_type, call_options, host,
                                op, local_notify, needs_reply, process, needs_forward);
     }
 }
 
 static gboolean
 parse_peer_options_v1(int call_type, xmlNode * request,
                    gboolean * local_notify, gboolean * needs_reply, gboolean * process,
                    gboolean * needs_forward)
 {
     const char *op = NULL;
     const char *host = NULL;
     const char *delegated = NULL;
     const char *originator = crm_element_value(request, F_ORIG);
     const char *reply_to = crm_element_value(request, F_CIB_ISREPLY);
     const char *update = crm_element_value(request, F_CIB_GLOBAL_UPDATE);
 
     gboolean is_reply = safe_str_eq(reply_to, cib_our_uname);
 
     if (crm_is_true(update)) {
         *needs_reply = FALSE;
         if (is_reply) {
             *local_notify = TRUE;
             crm_trace("Processing global/peer update from %s"
                       " that originated from us", originator);
         } else {
             crm_trace("Processing global/peer update from %s", originator);
         }
         return TRUE;
     }
 
     crm_trace("Processing %s request sent by %s", op, originator);
     op = crm_element_value(request, F_CIB_OPERATION);
     if (safe_str_eq(op, "cib_shutdown_req")) {
         /* Always process these */
         *local_notify = FALSE;
         if (reply_to == NULL || is_reply) {
             *process = TRUE;
         }
         if (is_reply) {
             *needs_reply = FALSE;
         }
         return *process;
     }
 
     if (is_reply && safe_str_eq(op, CRM_OP_PING)) {
         process_ping_reply(request);
         return FALSE;
     }
 
     if (is_reply) {
         crm_trace("Forward reply sent from %s to local clients", originator);
         *process = FALSE;
         *needs_reply = FALSE;
         *local_notify = TRUE;
         return TRUE;
     }
 
     host = crm_element_value(request, F_CIB_HOST);
     if (host != NULL && safe_str_eq(host, cib_our_uname)) {
         crm_trace("Processing %s request sent to us from %s", op, originator);
         return TRUE;
 
     } else if(is_reply == FALSE && safe_str_eq(op, CRM_OP_PING)) {
         crm_trace("Processing %s request sent to %s by %s", op, host?host:"everyone", originator);
         *needs_reply = TRUE;
         return TRUE;
 
     } else if (host == NULL && cib_is_master == TRUE) {
         crm_trace("Processing %s request sent to master instance from %s", op, originator);
         return TRUE;
     }
 
     delegated = crm_element_value(request, F_CIB_DELEGATED);
     if (delegated != NULL) {
         crm_trace("Ignoring msg for master instance");
 
     } else if (host != NULL) {
         /* this is for a specific instance and we're not it */
         crm_trace("Ignoring msg for instance on %s", crm_str(host));
 
     } else if (reply_to == NULL && cib_is_master == FALSE) {
         /* this is for the master instance and we're not it */
         crm_trace("Ignoring reply to %s", crm_str(reply_to));
 
     } else if (safe_str_eq(op, "cib_shutdown_req")) {
         if (reply_to != NULL) {
             crm_debug("Processing %s from %s", op, host);
             *needs_reply = FALSE;
 
         } else {
             crm_debug("Processing %s reply from %s", op, host);
         }
         return TRUE;
 
     } else {
         crm_err("Nothing for us to do?");
         crm_log_xml_err(request, "Peer[inbound]");
     }
 
     return FALSE;
 }
 
 static gboolean
 parse_peer_options_v2(int call_type, xmlNode * request,
                    gboolean * local_notify, gboolean * needs_reply, gboolean * process,
                    gboolean * needs_forward)
 {
     const char *host = NULL;
     const char *delegated = crm_element_value(request, F_CIB_DELEGATED);
     const char *op = crm_element_value(request, F_CIB_OPERATION);
     const char *originator = crm_element_value(request, F_ORIG);
     const char *reply_to = crm_element_value(request, F_CIB_ISREPLY);
     const char *update = crm_element_value(request, F_CIB_GLOBAL_UPDATE);
 
     gboolean is_reply = safe_str_eq(reply_to, cib_our_uname);
 
     if(safe_str_eq(op, CIB_OP_REPLACE)) {
         /* sync_our_cib() sets F_CIB_ISREPLY */
         if (reply_to) {
             delegated = reply_to;
         }
         goto skip_is_reply;
 
     } else if(safe_str_eq(op, CIB_OP_SYNC)) {
 
     } else if (is_reply && safe_str_eq(op, CRM_OP_PING)) {
         process_ping_reply(request);
         return FALSE;
 
     } else if (safe_str_eq(op, CIB_OP_UPGRADE)) {
         /* Only the DC (node with the oldest software) should process
          * this operation if F_CIB_SCHEMA_MAX is unset
          *
          * If the DC is happy it will then send out another
          * CIB_OP_UPGRADE which will tell all nodes to do the actual
          * upgrade.
          *
          * Except this time F_CIB_SCHEMA_MAX will be set which puts a
          * limit on how far newer nodes will go
          */
         const char *max = crm_element_value(request, F_CIB_SCHEMA_MAX);
 
         crm_trace("Parsing %s operation%s for %s with max=%s",
                   op, is_reply?" reply":"", cib_is_master?"master":"slave", max);
         if(max == NULL && cib_is_master) {
             /* We are the DC, check if this upgrade is allowed */
             goto skip_is_reply;
 
         } else if(max) {
             /* Ok, go ahead and upgrade to 'max' */
             goto skip_is_reply;
 
         } else {
             return FALSE; /* Ignore */
         }
 
     } else if (crm_is_true(update)) {
         crm_info("Detected legacy %s global update from %s", op, originator);
         send_sync_request(NULL);
         legacy_mode = TRUE;
         return FALSE;
 
     } else if (is_reply && cib_op_modifies(call_type)) {
         crm_trace("Ignoring legacy %s reply sent from %s to local clients", op, originator);
         return FALSE;
 
     } else if (safe_str_eq(op, "cib_shutdown_req")) {
         /* Legacy handling */
         crm_debug("Legacy handling of %s message from %s", op, originator);
         *local_notify = FALSE;
         if (reply_to == NULL) {
             *process = TRUE;
         }
         return *process;
     }
 
     if(is_reply) {
         crm_trace("Handling %s reply sent from %s to local clients", op, originator);
         *process = FALSE;
         *needs_reply = FALSE;
         *local_notify = TRUE;
         return TRUE;
     }
 
   skip_is_reply:
     *process = TRUE;
     *needs_reply = FALSE;
 
     if(safe_str_eq(delegated, cib_our_uname)) {
         *local_notify = TRUE;
     } else {
         *local_notify = FALSE;
     }
 
     host = crm_element_value(request, F_CIB_HOST);
     if (host != NULL && safe_str_eq(host, cib_our_uname)) {
         crm_trace("Processing %s request sent to us from %s", op, originator);
         *needs_reply = TRUE;
         return TRUE;
 
     } else if (host != NULL) {
         /* this is for a specific instance and we're not it */
         crm_trace("Ignoring %s operation for instance on %s", op, crm_str(host));
         return FALSE;
 
     } else if(is_reply == FALSE && safe_str_eq(op, CRM_OP_PING)) {
         *needs_reply = TRUE;
     }
 
     crm_trace("Processing %s request sent to everyone by %s/%s on %s %s", op,
               crm_element_value(request, F_CIB_CLIENTNAME),
               crm_element_value(request, F_CIB_CALLID),
               originator, (*local_notify)?"(notify)":"");
     return TRUE;
 }
 
 static gboolean
 parse_peer_options(int call_type, xmlNode * request,
                    gboolean * local_notify, gboolean * needs_reply, gboolean * process,
                    gboolean * needs_forward)
 {
     /* TODO: What happens when an update comes in after node A
      * requests the CIB from node B, but before it gets the reply (and
      * sends out the replace operation)
      */
     if(cib_legacy_mode()) {
         return parse_peer_options_v1(
             call_type, request, local_notify, needs_reply, process, needs_forward);
     } else {
         return parse_peer_options_v2(
             call_type, request, local_notify, needs_reply, process, needs_forward);
     }
 }
 
 static void
 forward_request(xmlNode * request, crm_client_t * cib_client, int call_options)
 {
     const char *op = crm_element_value(request, F_CIB_OPERATION);
     const char *host = crm_element_value(request, F_CIB_HOST);
 
     crm_xml_add(request, F_CIB_DELEGATED, cib_our_uname);
 
     if (host != NULL) {
         crm_trace("Forwarding %s op to %s", op, host);
         send_cluster_message(crm_get_peer(0, host), crm_msg_cib, request, FALSE);
 
     } else {
         crm_trace("Forwarding %s op to master instance", op);
         send_cluster_message(NULL, crm_msg_cib, request, FALSE);
     }
 
     /* Return the request to its original state */
     xml_remove_prop(request, F_CIB_DELEGATED);
 
     if (call_options & cib_discard_reply) {
         crm_trace("Client not interested in reply");
     }
 }
 
 static gboolean
 send_peer_reply(xmlNode * msg, xmlNode * result_diff, const char *originator, gboolean broadcast)
 {
     CRM_ASSERT(msg != NULL);
 
     if (broadcast) {
         /* this (successful) call modified the CIB _and_ the
          * change needs to be broadcast...
          *   send via HA to other nodes
          */
         int diff_add_updates = 0;
         int diff_add_epoch = 0;
         int diff_add_admin_epoch = 0;
 
         int diff_del_updates = 0;
         int diff_del_epoch = 0;
         int diff_del_admin_epoch = 0;
 
         const char *digest = NULL;
         int format = 1;
 
         CRM_LOG_ASSERT(result_diff != NULL);
         digest = crm_element_value(result_diff, XML_ATTR_DIGEST);
         crm_element_value_int(result_diff, "format", &format);
 
         cib_diff_version_details(result_diff,
                                  &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates,
                                  &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates);
 
         crm_trace("Sending update diff %d.%d.%d -> %d.%d.%d %s",
                   diff_del_admin_epoch, diff_del_epoch, diff_del_updates,
                   diff_add_admin_epoch, diff_add_epoch, diff_add_updates, digest);
 
         crm_xml_add(msg, F_CIB_ISREPLY, originator);
         crm_xml_add(msg, F_CIB_GLOBAL_UPDATE, XML_BOOLEAN_TRUE);
         crm_xml_add(msg, F_CIB_OPERATION, CIB_OP_APPLY_DIFF);
 
         if (format == 1) {
             CRM_ASSERT(digest != NULL);
         }
 
         add_message_xml(msg, F_CIB_UPDATE_DIFF, result_diff);
         crm_log_xml_explicit(msg, "copy");
         return send_cluster_message(NULL, crm_msg_cib, msg, TRUE);
 
     } else if (originator != NULL) {
         /* send reply via HA to originating node */
         crm_trace("Sending request result to %s only", originator);
         crm_xml_add(msg, F_CIB_ISREPLY, originator);
         return send_cluster_message(crm_get_peer(0, originator), crm_msg_cib, msg, FALSE);
     }
 
     return FALSE;
 }
 
 void
 cib_process_request(xmlNode * request, gboolean force_synchronous, gboolean privileged,
                     gboolean unused, crm_client_t * cib_client)
 {
     int call_type = 0;
     int call_options = 0;
 
     gboolean process = TRUE;
     gboolean is_update = TRUE;
     gboolean from_peer = TRUE;
     gboolean needs_reply = TRUE;
     gboolean local_notify = FALSE;
     gboolean needs_forward = FALSE;
     gboolean global_update = crm_is_true(crm_element_value(request, F_CIB_GLOBAL_UPDATE));
 
     xmlNode *op_reply = NULL;
     xmlNode *result_diff = NULL;
 
     int rc = pcmk_ok;
     const char *op = crm_element_value(request, F_CIB_OPERATION);
     const char *originator = crm_element_value(request, F_ORIG);
     const char *host = crm_element_value(request, F_CIB_HOST);
     const char *target = NULL;
     const char *call_id = crm_element_value(request, F_CIB_CALLID);
     const char *client_id = crm_element_value(request, F_CIB_CLIENTID);
     const char *client_name = crm_element_value(request, F_CIB_CLIENTNAME);
     const char *reply_to = crm_element_value(request, F_CIB_ISREPLY);
 
     if (cib_client) {
         from_peer = FALSE;
     }
 
     cib_num_ops++;
     if (cib_num_ops == 0) {
         cib_num_fail = 0;
         cib_num_local = 0;
         cib_num_updates = 0;
         crm_info("Stats wrapped around");
     }
 
     crm_element_value_int(request, F_CIB_CALLOPTS, &call_options);
     if (force_synchronous) {
         call_options |= cib_sync_call;
     }
 
     if (host != NULL && strlen(host) == 0) {
         host = NULL;
     }
 
     if (host) {
         target = host;
 
     } else if (call_options & cib_scope_local) {
         target = "local host";
 
     } else {
         target = "master";
     }
 
     if (from_peer) {
         crm_trace("Processing peer %s operation from %s/%s on %s intended for %s (reply=%s)",
                   op, client_name, call_id, originator, target, reply_to);
     } else {
         crm_xml_add(request, F_ORIG, cib_our_uname);
         crm_trace("Processing local %s operation from %s/%s intended for %s", op, client_name, call_id, target);
     }
 
     rc = cib_get_operation_id(op, &call_type);
     if (rc != pcmk_ok) {
         /* TODO: construct error reply? */
         crm_err("Pre-processing of command failed: %s", pcmk_strerror(rc));
         return;
     }
 
     if (from_peer == FALSE) {
         parse_local_options(cib_client, call_type, call_options, host, op,
                             &local_notify, &needs_reply, &process, &needs_forward);
 
     } else if (parse_peer_options(call_type, request, &local_notify,
                                   &needs_reply, &process, &needs_forward) == FALSE) {
         return;
     }
 
     is_update = cib_op_modifies(call_type);
     if (is_update) {
         cib_num_updates++;
     }
 
     if (call_options & cib_discard_reply) {
         needs_reply = is_update;
         local_notify = FALSE;
     }
 
     if (needs_forward) {
         const char *host = crm_element_value(request, F_CIB_HOST);
         const char *section = crm_element_value(request, F_CIB_SECTION);
 
         crm_info("Forwarding %s operation for section %s to %s (origin=%s/%s/%s)",
                  op,
                  section ? section : "'all'",
                  host ? host : "master",
                  originator ? originator : "local",
                  client_name, call_id);
 
         forward_request(request, cib_client, call_options);
         return;
     }
 
     if (cib_status != pcmk_ok) {
         const char *call = crm_element_value(request, F_CIB_CALLID);
 
         rc = cib_status;
         crm_err("Operation ignored, cluster configuration is invalid."
                 " Please repair and restart: %s", pcmk_strerror(cib_status));
 
         op_reply = create_xml_node(NULL, "cib-reply");
         crm_xml_add(op_reply, F_TYPE, T_CIB);
         crm_xml_add(op_reply, F_CIB_OPERATION, op);
         crm_xml_add(op_reply, F_CIB_CALLID, call);
         crm_xml_add(op_reply, F_CIB_CLIENTID, client_id);
         crm_xml_add_int(op_reply, F_CIB_CALLOPTS, call_options);
         crm_xml_add_int(op_reply, F_CIB_RC, rc);
 
         crm_trace("Attaching reply output");
         add_message_xml(op_reply, F_CIB_CALLDATA, the_cib);
 
         crm_log_xml_explicit(op_reply, "cib:reply");
 
     } else if (process) {
         time_t finished = 0;
 
         int now = time(NULL);
         int level = LOG_INFO;
         const char *section = crm_element_value(request, F_CIB_SECTION);
 
         cib_num_local++;
         rc = cib_process_command(request, &op_reply, &result_diff, privileged);
 
         if (is_update == FALSE) {
             level = LOG_TRACE;
 
         } else if (global_update) {
             switch (rc) {
                 case pcmk_ok:
                     level = LOG_INFO;
                     break;
                 case -pcmk_err_old_data:
                 case -pcmk_err_diff_resync:
                 case -pcmk_err_diff_failed:
                     level = LOG_TRACE;
                     break;
                 default:
                     level = LOG_ERR;
             }
 
         } else if (rc != pcmk_ok && is_update) {
             cib_num_fail++;
             level = LOG_WARNING;
         }
 
         do_crm_log(level,
                    "Completed %s operation for section %s: %s (rc=%d, origin=%s/%s/%s, version=%s.%s.%s)",
                    op, section ? section : "'all'", pcmk_strerror(rc), rc,
                    originator ? originator : "local", client_name, call_id,
                    the_cib ? crm_element_value(the_cib, XML_ATTR_GENERATION_ADMIN) : "0",
                    the_cib ? crm_element_value(the_cib, XML_ATTR_GENERATION) : "0",
                    the_cib ? crm_element_value(the_cib, XML_ATTR_NUMUPDATES) : "0");
 
         finished = time(NULL);
         if (finished - now > 3) {
             crm_trace("%s operation took %ds to complete", op, finished - now);
             crm_write_blackbox(0, NULL);
         }
 
         if (op_reply == NULL && (needs_reply || local_notify)) {
             crm_err("Unexpected NULL reply to message");
             crm_log_xml_err(request, "null reply");
             needs_reply = FALSE;
             local_notify = FALSE;
         }
     }
 
     /* from now on we are the server */
     if(is_update && cib_legacy_mode() == FALSE) {
         crm_trace("Completed pre-sync update from %s/%s/%s%s",
                   originator ? originator : "local", client_name, call_id,
                   local_notify?" with local notification":"");
 
     } else if (needs_reply == FALSE || stand_alone) {
         /* nothing more to do...
          * this was a non-originating slave update
          */
         crm_trace("Completed slave update");
 
     } else if (cib_legacy_mode() &&
                rc == pcmk_ok && result_diff != NULL && !(call_options & cib_inhibit_bcast)) {
         gboolean broadcast = FALSE;
 
         cib_local_bcast_num++;
         crm_xml_add_int(request, F_CIB_LOCAL_NOTIFY_ID, cib_local_bcast_num);
         broadcast = send_peer_reply(request, result_diff, originator, TRUE);
 
         if (broadcast && client_id && local_notify && op_reply) {
 
             /* If we have been asked to sync the reply,
              * and a bcast msg has gone out, we queue the local notify
              * until we know the bcast message has been received */
             local_notify = FALSE;
             crm_trace("Queuing local %ssync notification for %s",
                       (call_options & cib_sync_call) ? "" : "a-", client_id);
 
             queue_local_notify(op_reply, client_id, (call_options & cib_sync_call), from_peer);
             op_reply = NULL;    /* the reply is queued, so don't free here */
         }
 
     } else if (call_options & cib_discard_reply) {
         crm_trace("Caller isn't interested in reply");
 
     } else if (from_peer) {
         if (is_update == FALSE || result_diff == NULL) {
             crm_trace("Request not broadcast: R/O call");
 
         } else if (call_options & cib_inhibit_bcast) {
             crm_trace("Request not broadcast: inhibited");
 
         } else if (rc != pcmk_ok) {
             crm_trace("Request not broadcast: call failed: %s", pcmk_strerror(rc));
 
         } else {
             crm_trace("Directing reply to %s", originator);
         }
 
         send_peer_reply(op_reply, result_diff, originator, FALSE);
     }
 
     if (local_notify && client_id) {
         crm_trace("Performing local %ssync notification for %s",
                   (call_options & cib_sync_call) ? "" : "a-", client_id);
         if (process == FALSE) {
             do_local_notify(request, client_id, call_options & cib_sync_call, from_peer);
         } else {
             do_local_notify(op_reply, client_id, call_options & cib_sync_call, from_peer);
         }
     }
 
     free_xml(op_reply);
     free_xml(result_diff);
 
     return;
 }
 
 int
 cib_process_command(xmlNode * request, xmlNode ** reply, xmlNode ** cib_diff, gboolean privileged)
 {
     xmlNode *input = NULL;
     xmlNode *output = NULL;
     xmlNode *result_cib = NULL;
     xmlNode *current_cib = NULL;
 
     int call_type = 0;
     int call_options = 0;
 
     const char *op = NULL;
     const char *section = NULL;
     const char *call_id = crm_element_value(request, F_CIB_CALLID);
 
     int rc = pcmk_ok;
     int rc2 = pcmk_ok;
 
     gboolean send_r_notify = FALSE;
     gboolean global_update = FALSE;
     gboolean config_changed = FALSE;
     gboolean manage_counters = TRUE;
 
     static mainloop_timer_t *digest_timer = NULL;
 
     CRM_ASSERT(cib_status == pcmk_ok);
 
     if(digest_timer == NULL) {
         digest_timer = mainloop_timer_add("digester", 5000, FALSE, cib_digester_cb, NULL);
     }
 
     *reply = NULL;
     *cib_diff = NULL;
     current_cib = the_cib;
 
     /* Start processing the request... */
     op = crm_element_value(request, F_CIB_OPERATION);
     crm_element_value_int(request, F_CIB_CALLOPTS, &call_options);
     rc = cib_get_operation_id(op, &call_type);
 
     if (rc == pcmk_ok && privileged == FALSE) {
         rc = cib_op_can_run(call_type, call_options, privileged, global_update);
     }
 
     rc2 = cib_op_prepare(call_type, request, &input, &section);
     if (rc == pcmk_ok) {
         rc = rc2;
     }
 
     if (rc != pcmk_ok) {
         crm_trace("Call setup failed: %s", pcmk_strerror(rc));
         goto done;
 
     } else if (cib_op_modifies(call_type) == FALSE) {
         rc = cib_perform_op(op, call_options, cib_op_func(call_type), TRUE,
                             section, request, input, FALSE, &config_changed,
                             current_cib, &result_cib, NULL, &output);
 
         CRM_CHECK(result_cib == NULL, free_xml(result_cib));
         goto done;
     }
 
     /* Handle a valid write action */
     global_update = crm_is_true(crm_element_value(request, F_CIB_GLOBAL_UPDATE));
     if (global_update) {
         /* legacy code */
         manage_counters = FALSE;
         call_options |= cib_force_diff;
         crm_trace("Global update detected");
 
         CRM_CHECK(call_type == 3 || call_type == 4, crm_err("Call type: %d", call_type);
                   crm_log_xml_err(request, "bad op"));
     }
 
     if (rc == pcmk_ok) {
         ping_modified_since = TRUE;
         if (call_options & cib_inhibit_bcast) {
             /* skip */
             crm_trace("Skipping update: inhibit broadcast");
             manage_counters = FALSE;
         }
 
         if (is_not_set(call_options, cib_dryrun) && safe_str_eq(section, XML_CIB_TAG_STATUS)) {
             /* Copying large CIBs accounts for a huge percentage of our CIB usage */
             call_options |= cib_zero_copy;
         } else {
             clear_bit(call_options, cib_zero_copy);
         }
 
         /* result_cib must not be modified after cib_perform_op() returns */
         rc = cib_perform_op(op, call_options, cib_op_func(call_type), FALSE,
                             section, request, input, manage_counters, &config_changed,
                             current_cib, &result_cib, cib_diff, &output);
 
         if (manage_counters == FALSE) {
             /* Legacy code
              * If the diff is NULL at this point, its because nothing changed
              */
             config_changed = cib_config_changed(NULL, NULL, cib_diff);
         }
 
         /* Always write to disk for replace ops,
          * this also negates the need to detect ordering changes
          */
         if (crm_str_eq(CIB_OP_REPLACE, op, TRUE)) {
             config_changed = TRUE;
         }
     }
 
     if (rc == pcmk_ok && is_not_set(call_options, cib_dryrun)) {
         if(is_not_set(call_options, cib_zero_copy)) {
             rc = activateCibXml(result_cib, config_changed, op);
         }
 
         if (rc == pcmk_ok && cib_internal_config_changed(*cib_diff)) {
             cib_read_config(config_hash, result_cib);
         }
 
         if (crm_str_eq(CIB_OP_REPLACE, op, TRUE)) {
             if (section == NULL) {
                 send_r_notify = TRUE;
 
             } else if (safe_str_eq(section, XML_TAG_CIB)) {
                 send_r_notify = TRUE;
 
             } else if (safe_str_eq(section, XML_CIB_TAG_NODES)) {
                 send_r_notify = TRUE;
 
             } else if (safe_str_eq(section, XML_CIB_TAG_STATUS)) {
                 send_r_notify = TRUE;
             }
 
         } else if (crm_str_eq(CIB_OP_ERASE, op, TRUE)) {
             send_r_notify = TRUE;
         }
 
         mainloop_timer_stop(digest_timer);
         mainloop_timer_start(digest_timer);
 
     } else if (rc == -pcmk_err_schema_validation) {
         CRM_ASSERT(is_not_set(call_options, cib_zero_copy));
 
         if (output != NULL) {
             crm_log_xml_info(output, "cib:output");
             free_xml(output);
         }
 
         output = result_cib;
 
     } else {
         if(is_not_set(call_options, cib_zero_copy)) {
             free_xml(result_cib);
         }
     }
 
     if ((call_options & cib_inhibit_notify) == 0) {
         const char *client = crm_element_value(request, F_CIB_CLIENTNAME);
 
         crm_trace("Sending notifications");
         cib_diff_notify(call_options, client, call_id, op, input, rc, *cib_diff);
     }
 
     if (send_r_notify) {
         const char *origin = crm_element_value(request, F_ORIG);
 
         cib_replace_notify(origin, the_cib, rc, *cib_diff);
     }
 
     xml_log_patchset(LOG_TRACE, "cib:diff", *cib_diff);
   done:
     if ((call_options & cib_discard_reply) == 0) {
         const char *caller = crm_element_value(request, F_CIB_CLIENTID);
 
         *reply = create_xml_node(NULL, "cib-reply");
         crm_xml_add(*reply, F_TYPE, T_CIB);
         crm_xml_add(*reply, F_CIB_OPERATION, op);
         crm_xml_add(*reply, F_CIB_CALLID, call_id);
         crm_xml_add(*reply, F_CIB_CLIENTID, caller);
         crm_xml_add_int(*reply, F_CIB_CALLOPTS, call_options);
         crm_xml_add_int(*reply, F_CIB_RC, rc);
 
         if (output != NULL) {
             crm_trace("Attaching reply output");
             add_message_xml(*reply, F_CIB_CALLDATA, output);
         }
 
         crm_log_xml_explicit(*reply, "cib:reply");
     }
 
     crm_trace("cleanup");
 
     if (cib_op_modifies(call_type) == FALSE && output != current_cib) {
         free_xml(output);
         output = NULL;
     }
 
     if (call_type >= 0) {
         cib_op_cleanup(call_type, call_options, &input, &output);
     }
 
     crm_trace("done");
     return rc;
 }
 
 gint
 cib_GCompareFunc(gconstpointer a, gconstpointer b)
 {
     const xmlNode *a_msg = a;
     const xmlNode *b_msg = b;
 
     int msg_a_id = 0;
     int msg_b_id = 0;
     const char *value = NULL;
 
     value = crm_element_value_const(a_msg, F_CIB_CALLID);
     msg_a_id = crm_parse_int(value, NULL);
 
     value = crm_element_value_const(b_msg, F_CIB_CALLID);
     msg_b_id = crm_parse_int(value, NULL);
 
     if (msg_a_id == msg_b_id) {
         return 0;
     } else if (msg_a_id < msg_b_id) {
         return -1;
     }
     return 1;
 }
 
 #if SUPPORT_HEARTBEAT
 void
 cib_ha_peer_callback(HA_Message * msg, void *private_data)
 {
     xmlNode *xml = convert_ha_message(NULL, msg, __FUNCTION__);
 
     cib_peer_callback(xml, private_data);
     free_xml(xml);
 }
 #endif
 
 void
 cib_peer_callback(xmlNode * msg, void *private_data)
 {
     const char *reason = NULL;
     const char *originator = crm_element_value(msg, F_ORIG);
 
     if (cib_legacy_mode() && (originator == NULL || crm_str_eq(originator, cib_our_uname, TRUE))) {
         /* message is from ourselves */
         int bcast_id = 0;
 
         if (!(crm_element_value_int(msg, F_CIB_LOCAL_NOTIFY_ID, &bcast_id))) {
             check_local_notify(bcast_id);
         }
         return;
 
     } else if (crm_peer_cache == NULL) {
         reason = "membership not established";
         goto bail;
     }
 
     if (crm_element_value(msg, F_CIB_CLIENTNAME) == NULL) {
         crm_xml_add(msg, F_CIB_CLIENTNAME, originator);
     }
 
     /* crm_log_xml_trace("Peer[inbound]", msg); */
     cib_process_request(msg, FALSE, TRUE, TRUE, NULL);
     return;
 
   bail:
     if (reason) {
         const char *seq = crm_element_value(msg, F_SEQ);
         const char *op = crm_element_value(msg, F_CIB_OPERATION);
 
         crm_warn("Discarding %s message (%s) from %s: %s", op, seq, originator, reason);
     }
 }
 
 #if SUPPORT_HEARTBEAT
 extern oc_ev_t *cib_ev_token;
 static void *ccm_library = NULL;
 int (*ccm_api_callback_done) (void *cookie) = NULL;
 int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL;
 
 void
 cib_client_status_callback(const char *node, const char *client, const char *status, void *private)
 {
     /* Heartbeat only */
     crm_node_t *peer = NULL;
 
     if (safe_str_eq(client, CRM_SYSTEM_CIB)) {
         peer = crm_get_peer(0, node);
         if (safe_str_neq(peer->state, CRM_NODE_MEMBER)) {
             crm_warn("This peer is not a ccm member (yet). "
                 "Status ignored: Client %s/%s announced status [%s]",
                 node, client, status);
             return;
         }
 
         crm_info("Status update: Client %s/%s now has status [%s]", node, client, status);
 
         if (safe_str_eq(status, JOINSTATUS)) {
             status = ONLINESTATUS;
 
         } else if (safe_str_eq(status, LEAVESTATUS)) {
             status = OFFLINESTATUS;
         }
 
         crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cib, status);
     }
     return;
 }
 
 int
 cib_ccm_dispatch(gpointer user_data)
 {
     int rc = 0;
     oc_ev_t *ccm_token = (oc_ev_t *) user_data;
 
     crm_trace("received callback");
 
     if (ccm_api_handle_event == NULL) {
         ccm_api_handle_event =
             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event", 1);
     }
 
     rc = (*ccm_api_handle_event) (ccm_token);
     if (0 == rc) {
         return 0;
     }
 
     crm_err("CCM connection appears to have failed: rc=%d.", rc);
 
     /* eventually it might be nice to recover and reconnect... but until then... */
     crm_err("Exiting to recover from CCM connection failure");
     return crm_exit(ENOTCONN);
 }
 
 int current_instance = 0;
 void
 cib_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data)
 {
     gboolean update_id = FALSE;
     const oc_ev_membership_t *membership = data;
 
     CRM_ASSERT(membership != NULL);
 
     crm_info("Processing CCM event=%s (id=%d)", ccm_event_name(event), membership->m_instance);
 
     if (current_instance > membership->m_instance) {
         crm_err("Membership instance ID went backwards! %d->%d",
                 current_instance, membership->m_instance);
         CRM_ASSERT(current_instance <= membership->m_instance);
     }
 
     switch (event) {
         case OC_EV_MS_NEW_MEMBERSHIP:
         case OC_EV_MS_INVALID:
             update_id = TRUE;
             break;
         case OC_EV_MS_PRIMARY_RESTORED:
             update_id = TRUE;
             break;
         case OC_EV_MS_NOT_PRIMARY:
             crm_trace("Ignoring transitional CCM event: %s", ccm_event_name(event));
             break;
         case OC_EV_MS_EVICTED:
             crm_err("Evicted from CCM: %s", ccm_event_name(event));
             break;
         default:
             crm_err("Unknown CCM event: %d", event);
     }
 
     if (update_id) {
         unsigned int lpc = 0;
 
         CRM_CHECK(membership != NULL, return);
 
         current_instance = membership->m_instance;
 
         for (lpc = 0; lpc < membership->m_n_out; lpc++) {
             crm_update_ccm_node(membership, lpc + membership->m_out_idx, CRM_NODE_LOST,
                                 current_instance);
         }
 
         for (lpc = 0; lpc < membership->m_n_member; lpc++) {
             crm_update_ccm_node(membership, lpc + membership->m_memb_idx, CRM_NODE_MEMBER,
                                 current_instance);
         }
         heartbeat_cluster->llc_ops->client_status(heartbeat_cluster, NULL, crm_system_name, 0);
     }
 
     if (ccm_api_callback_done == NULL) {
         ccm_api_callback_done =
             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done", 1);
     }
     (*ccm_api_callback_done) (cookie);
     return;
 }
 #endif
 
 gboolean
 can_write(int flags)
 {
     return TRUE;
 }
 
 static gboolean
 cib_force_exit(gpointer data)
 {
     crm_notice("Forcing exit!");
-    terminate_cib(__FUNCTION__, TRUE);
+    terminate_cib(__FUNCTION__, -1);
     return FALSE;
 }
 
 static void
 disconnect_remote_client(gpointer key, gpointer value, gpointer user_data)
 {
     crm_client_t *a_client = value;
 
     crm_err("Disconnecting %s... Not implemented", crm_str(a_client->name));
 }
 
 void
 cib_shutdown(int nsig)
 {
     struct qb_ipcs_stats srv_stats;
 
     if (cib_shutdown_flag == FALSE) {
         int disconnects = 0;
         qb_ipcs_connection_t *c = NULL;
 
         cib_shutdown_flag = TRUE;
 
         c = qb_ipcs_connection_first_get(ipcs_rw);
         while (c != NULL) {
             qb_ipcs_connection_t *last = c;
 
             c = qb_ipcs_connection_next_get(ipcs_rw, last);
 
             crm_debug("Disconnecting r/w client %p...", last);
             qb_ipcs_disconnect(last);
             qb_ipcs_connection_unref(last);
             disconnects++;
         }
 
         c = qb_ipcs_connection_first_get(ipcs_ro);
         while (c != NULL) {
             qb_ipcs_connection_t *last = c;
 
             c = qb_ipcs_connection_next_get(ipcs_ro, last);
 
             crm_debug("Disconnecting r/o client %p...", last);
             qb_ipcs_disconnect(last);
             qb_ipcs_connection_unref(last);
             disconnects++;
         }
 
         c = qb_ipcs_connection_first_get(ipcs_shm);
         while (c != NULL) {
             qb_ipcs_connection_t *last = c;
 
             c = qb_ipcs_connection_next_get(ipcs_shm, last);
 
             crm_debug("Disconnecting non-blocking r/w client %p...", last);
             qb_ipcs_disconnect(last);
             qb_ipcs_connection_unref(last);
             disconnects++;
         }
 
         disconnects += crm_hash_table_size(client_connections);
 
         crm_debug("Disconnecting %d remote clients", crm_hash_table_size(client_connections));
         g_hash_table_foreach(client_connections, disconnect_remote_client, NULL);
         crm_info("Disconnected %d clients", disconnects);
     }
 
     qb_ipcs_stats_get(ipcs_rw, &srv_stats, QB_FALSE);
 
     if (crm_hash_table_size(client_connections) == 0) {
         crm_info("All clients disconnected (%d)", srv_stats.active_connections);
         initiate_exit();
 
     } else {
         crm_info("Waiting on %d clients to disconnect (%d)",
                  crm_hash_table_size(client_connections), srv_stats.active_connections);
     }
 }
 
 void
 initiate_exit(void)
 {
     int active = 0;
     xmlNode *leaving = NULL;
 
     active = crm_active_peers();
     if (active < 2) {
-        terminate_cib(__FUNCTION__, FALSE);
+        terminate_cib(__FUNCTION__, 0);
         return;
     }
 
     crm_info("Sending disconnect notification to %d peers...", active);
 
     leaving = create_xml_node(NULL, "exit-notification");
     crm_xml_add(leaving, F_TYPE, "cib");
     crm_xml_add(leaving, F_CIB_OPERATION, "cib_shutdown_req");
 
     send_cluster_message(NULL, crm_msg_cib, leaving, TRUE);
     free_xml(leaving);
 
     g_timeout_add(crm_get_msec("5s"), cib_force_exit, NULL);
 }
 
 extern int remote_fd;
 extern int remote_tls_fd;
 
+/*
+ * \internal
+ * \brief Close remote sockets, free the global CIB and quit
+ *
+ * \param[in] caller           Name of calling function (for log message)
+ * \param[in] fast             If 1, skip disconnect; if -1, also exit error
+ */
 void
-terminate_cib(const char *caller, gboolean fast)
+terminate_cib(const char *caller, int fast)
 {
+    crm_info("%s: Exiting%s...", caller,
+             (fast < 0)? " fast" : mainloop ? " from mainloop" : "");
+
     if (remote_fd > 0) {
         close(remote_fd);
         remote_fd = 0;
     }
     if (remote_tls_fd > 0) {
         close(remote_tls_fd);
         remote_tls_fd = 0;
     }
 
-    if (!fast) {
-        crm_info("%s: Disconnecting from cluster infrastructure", caller);
-        crm_cluster_disconnect(&crm_cluster);
-    }
-
     uninitializeCib();
 
-    crm_info("%s: Exiting%s...", caller, fast ? " fast" : mainloop ? " from mainloop" : "");
+    if (fast < 0) {
+        /* Quit fast on error */
+        cib_ipc_servers_destroy(ipcs_ro, ipcs_rw, ipcs_shm);
+        crm_exit(EINVAL);
 
-    if (fast == FALSE && mainloop != NULL && g_main_is_running(mainloop)) {
+    } else if ((mainloop != NULL) && g_main_is_running(mainloop)) {
+        /* Quit via returning from the main loop. If fast == 1, we skip the
+         * disconnect here, and it will be done when the main loop returns
+         * (this allows the peer status callback to avoid messing with the
+         * peer caches).
+         */
+        if (fast == 0) {
+            crm_cluster_disconnect(&crm_cluster);
+        }
         g_main_quit(mainloop);
 
     } else {
-        qb_ipcs_destroy(ipcs_ro);
-        qb_ipcs_destroy(ipcs_rw);
-        qb_ipcs_destroy(ipcs_shm);
-
-        if (fast) {
-            crm_exit(EINVAL);
-        } else {
-            crm_exit(pcmk_ok);
-        }
+        /* Quit via clean exit. Even the peer status callback can disconnect
+         * here, because we're not returning control to the caller. */
+        crm_cluster_disconnect(&crm_cluster);
+        cib_ipc_servers_destroy(ipcs_ro, ipcs_rw, ipcs_shm);
+        crm_exit(pcmk_ok);
     }
 }
diff --git a/cib/callbacks.h b/cib/callbacks.h
index bca9992191..a49428e101 100644
--- a/cib/callbacks.h
+++ b/cib/callbacks.h
@@ -1,82 +1,82 @@
 /*
  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #include <stdio.h>
 #include <sys/types.h>
 #include <unistd.h>
 
 #include <crm/crm.h>
 #include <crm/cib.h>
 #include <crm/common/xml.h>
 #include <crm/cluster.h>
 #include <crm/common/ipcs.h>
 #include <crm/common/mainloop.h>
 #ifdef HAVE_GNUTLS_GNUTLS_H
 #  undef KEYFILE
 #  include <gnutls/gnutls.h>
 #endif
 
 extern gboolean cib_is_master;
 extern GHashTable *peer_hash;
 extern GHashTable *config_hash;
 
 /* *INDENT-OFF* */
 enum cib_notifications
 {
     cib_notify_pre     = 0x0001,
     cib_notify_post    = 0x0002,
     cib_notify_replace = 0x0004,
     cib_notify_confirm = 0x0008,
     cib_notify_diff    = 0x0010,
 };
 /* *INDENT-ON* */
 
 typedef struct cib_operation_s {
     const char *operation;
     gboolean modifies_cib;
     gboolean needs_privileges;
     gboolean needs_quorum;
     int (*prepare) (xmlNode *, xmlNode **, const char **);
     int (*cleanup) (int, xmlNode **, xmlNode **);
     int (*fn) (const char *, int, const char *, xmlNode *,
                xmlNode *, xmlNode *, xmlNode **, xmlNode **);
 } cib_operation_t;
 
 extern struct qb_ipcs_service_handlers ipc_ro_callbacks;
 extern struct qb_ipcs_service_handlers ipc_rw_callbacks;
 extern qb_ipcs_service_t *ipcs_ro;
 extern qb_ipcs_service_t *ipcs_rw;
 extern qb_ipcs_service_t *ipcs_shm;
 
 extern void cib_peer_callback(xmlNode * msg, void *private_data);
 extern void cib_client_status_callback(const char *node, const char *client,
                                        const char *status, void *private);
 extern void cib_common_callback_worker(uint32_t id, uint32_t flags, xmlNode * op_request,
                                        crm_client_t * cib_client, gboolean privileged);
 
 void cib_shutdown(int nsig);
 void initiate_exit(void);
-void terminate_cib(const char *caller, gboolean fast);
+void terminate_cib(const char *caller, int fast);
 
 extern gboolean cib_legacy_mode(void);
 
 #if SUPPORT_HEARTBEAT
 extern void cib_ha_peer_callback(HA_Message * msg, void *private_data);
 extern int cib_ccm_dispatch(gpointer user_data);
 extern void cib_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data);
 #endif
diff --git a/cib/main.c b/cib/main.c
index e20a2b6dff..cbaf7b515b 100644
--- a/cib/main.c
+++ b/cib/main.c
@@ -1,583 +1,584 @@
 /*
  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #include <crm_internal.h>
 
 #include <sys/param.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <sys/utsname.h>
 
 #include <stdlib.h>
 #include <errno.h>
 #include <fcntl.h>
 
 #include <crm/crm.h>
 #include <crm/cib/internal.h>
 #include <crm/msg_xml.h>
 #include <crm/cluster/internal.h>
 
 #include <crm/common/xml.h>
 
 #include <crm/common/mainloop.h>
 
 #include <cibio.h>
 #include <callbacks.h>
 #include <pwd.h>
 #include <grp.h>
 #include "common.h"
 
 #if HAVE_LIBXML2
 #  include <libxml/parser.h>
 #endif
 
 #ifdef HAVE_GETOPT_H
 #  include <getopt.h>
 #endif
 
 #if HAVE_BZLIB_H
 #  include <bzlib.h>
 #endif
 
 extern int init_remote_listener(int port, gboolean encrypted);
 gboolean cib_shutdown_flag = FALSE;
 int cib_status = pcmk_ok;
 
 crm_cluster_t crm_cluster;
 
 #if SUPPORT_HEARTBEAT
 oc_ev_t *cib_ev_token;
 ll_cluster_t *hb_conn = NULL;
 extern void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int);
 gboolean cib_register_ha(ll_cluster_t * hb_cluster, const char *client_name);
 #else
 void *hb_conn = NULL;
 #endif
 
-extern void terminate_cib(const char *caller, gboolean fast);
-
 GMainLoop *mainloop = NULL;
 const char *cib_root = NULL;
 char *cib_our_uname = NULL;
 gboolean preserve_status = FALSE;
 gboolean cib_writes_enabled = TRUE;
 int remote_fd = 0;
 int remote_tls_fd = 0;
 
 int cib_init(void);
 void cib_shutdown(int nsig);
 gboolean startCib(const char *filename);
 extern int write_cib_contents(gpointer p);
 
 GHashTable *config_hash = NULL;
 GHashTable *local_notify_queue = NULL;
 
 char *channel1 = NULL;
 char *channel2 = NULL;
 char *channel3 = NULL;
 char *channel4 = NULL;
 char *channel5 = NULL;
 
 #define OPTARGS	"maswr:V?"
 void cib_cleanup(void);
 
 static void
 cib_enable_writes(int nsig)
 {
     crm_info("(Re)enabling disk writes");
     cib_writes_enabled = TRUE;
 }
 
 static void
 log_cib_client(gpointer key, gpointer value, gpointer user_data)
 {
     crm_info("Client %s", crm_client_name(value));
 }
 
 /* *INDENT-OFF* */
 static struct crm_option long_options[] = {
     /* Top-level Options */
     {"help",    0, 0, '?', "\tThis text"},
     {"verbose", 0, 0, 'V', "\tIncrease debug output"},
 
     {"per-action-cib", 0, 0, 'a', "\tAdvanced use only"},
     {"stand-alone",    0, 0, 's', "\tAdvanced use only"},
     {"disk-writes",    0, 0, 'w', "\tAdvanced use only"},
     {"cib-root",       1, 0, 'r', "\tAdvanced use only"},
 
     {0, 0, 0, 0}
 };
 /* *INDENT-ON* */
 
 int
 main(int argc, char **argv)
 {
     int flag;
     int rc = 0;
     int index = 0;
     int argerr = 0;
     struct passwd *pwentry = NULL;
 
     crm_log_preinit(NULL, argc, argv);
     crm_set_options(NULL, "[options]",
                     long_options, "Daemon for storing and replicating the cluster configuration");
 
     crm_peer_init();
 
     mainloop_add_signal(SIGTERM, cib_shutdown);
     mainloop_add_signal(SIGPIPE, cib_enable_writes);
 
     cib_writer = mainloop_add_trigger(G_PRIORITY_LOW, write_cib_contents, NULL);
 
     while (1) {
         flag = crm_get_option(argc, argv, &index);
         if (flag == -1)
             break;
 
         switch (flag) {
             case 'V':
                 crm_bump_log_level(argc, argv);
                 break;
             case 's':
                 stand_alone = TRUE;
                 preserve_status = TRUE;
                 cib_writes_enabled = FALSE;
 
                 pwentry = getpwnam(CRM_DAEMON_USER);
                 CRM_CHECK(pwentry != NULL,
                           crm_perror(LOG_ERR, "Invalid uid (%s) specified", CRM_DAEMON_USER);
                           return 100);
 
                 rc = setgid(pwentry->pw_gid);
                 if (rc < 0) {
                     crm_perror(LOG_ERR, "Could not set group to %d", pwentry->pw_gid);
                     return 100;
                 }
 
                 rc = initgroups(CRM_DAEMON_GROUP, pwentry->pw_gid);
                 if (rc < 0) {
                     crm_perror(LOG_ERR, "Could not setup groups for user %d", pwentry->pw_uid);
                     return 100;
                 }
 
                 rc = setuid(pwentry->pw_uid);
                 if (rc < 0) {
                     crm_perror(LOG_ERR, "Could not set user to %d", pwentry->pw_uid);
                     return 100;
                 }
                 break;
             case '?':          /* Help message */
                 crm_help(flag, EX_OK);
                 break;
             case 'w':
                 cib_writes_enabled = TRUE;
                 break;
             case 'r':
                 cib_root = optarg;
                 break;
             case 'm':
                 cib_metadata();
                 return 0;
             default:
                 ++argerr;
                 break;
         }
     }
     if (argc - optind == 1 && safe_str_eq("metadata", argv[optind])) {
         cib_metadata();
         return 0;
     }
 
     if (optind > argc) {
         ++argerr;
     }
 
     if (argerr) {
         crm_help('?', EX_USAGE);
     }
 
     crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE);
     if (cib_root == NULL) {
         char *path = crm_strdup_printf("%s/cib.xml", CRM_CONFIG_DIR);
         char *legacy = crm_strdup_printf("%s/cib.xml", CRM_LEGACY_CONFIG_DIR);
 
         if (g_file_test(path, G_FILE_TEST_EXISTS)) {
             cib_root = CRM_CONFIG_DIR;
 
         } else if (g_file_test(legacy, G_FILE_TEST_EXISTS)) {
             cib_root = CRM_LEGACY_CONFIG_DIR;
             crm_notice("Using legacy config location: %s", cib_root);
 
         } else {
             cib_root = CRM_CONFIG_DIR;
             crm_notice("Using new config location: %s", cib_root);
         }
 
         free(legacy);
         free(path);
 
     } else {
         crm_notice("Using custom config location: %s", cib_root);
     }
 
     if (crm_is_writable(cib_root, NULL, CRM_DAEMON_USER, CRM_DAEMON_GROUP, FALSE) == FALSE) {
         crm_err("Bad permissions on %s. Terminating", cib_root);
         fprintf(stderr, "ERROR: Bad permissions on %s. See logs for details\n", cib_root);
         fflush(stderr);
         return 100;
     }
 
     /* read local config file */
     rc = cib_init();
 
     CRM_CHECK(crm_hash_table_size(client_connections) == 0,
               crm_warn("Not all clients gone at exit"));
     g_hash_table_foreach(client_connections, log_cib_client, NULL);
     cib_cleanup();
 
 #if SUPPORT_HEARTBEAT
     if (hb_conn) {
         hb_conn->llc_ops->delete(hb_conn);
     }
 #endif
 
     crm_info("Done");
     return rc;
 }
 
 void
 cib_cleanup(void)
 {
     crm_peer_destroy();
     if (local_notify_queue) {
         g_hash_table_destroy(local_notify_queue);
     }
     crm_client_cleanup();
     g_hash_table_destroy(config_hash);
     free(cib_our_uname);
     free(channel1);
     free(channel2);
     free(channel3);
     free(channel4);
     free(channel5);
 }
 
 unsigned long cib_num_ops = 0;
 const char *cib_stat_interval = "10min";
 unsigned long cib_num_local = 0, cib_num_updates = 0, cib_num_fail = 0;
 unsigned long cib_bad_connects = 0, cib_num_timeouts = 0;
 
 #if SUPPORT_HEARTBEAT
 gboolean ccm_connect(void);
 
 static void
 ccm_connection_destroy(gpointer user_data)
 {
     crm_err("CCM connection failed... blocking while we reconnect");
     CRM_ASSERT(ccm_connect());
     return;
 }
 
 static void *ccm_library = NULL;
 
 gboolean
 ccm_connect(void)
 {
     gboolean did_fail = TRUE;
     int num_ccm_fails = 0;
     int max_ccm_fails = 30;
     int ret;
     int cib_ev_fd;
 
     int (*ccm_api_register) (oc_ev_t ** token) =
         find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register", 1);
 
     int (*ccm_api_set_callback) (const oc_ev_t * token,
                                  oc_ev_class_t class,
                                  oc_ev_callback_t * fn,
                                  oc_ev_callback_t ** prev_fn) =
         find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback", 1);
 
     void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) =
         find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special", 1);
     int (*ccm_api_activate) (const oc_ev_t * token, int *fd) =
         find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate", 1);
     int (*ccm_api_unregister) (oc_ev_t * token) =
         find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister", 1);
 
     static struct mainloop_fd_callbacks ccm_fd_callbacks = {
         .dispatch = cib_ccm_dispatch,
         .destroy = ccm_connection_destroy,
     };
 
     while (did_fail) {
         did_fail = FALSE;
         crm_info("Registering with CCM...");
         ret = (*ccm_api_register) (&cib_ev_token);
         if (ret != 0) {
             did_fail = TRUE;
         }
 
         if (did_fail == FALSE) {
             crm_trace("Setting up CCM callbacks");
             ret = (*ccm_api_set_callback) (cib_ev_token, OC_EV_MEMB_CLASS,
                                            cib_ccm_msg_callback, NULL);
             if (ret != 0) {
                 crm_warn("CCM callback not set");
                 did_fail = TRUE;
             }
         }
         if (did_fail == FALSE) {
             (*ccm_api_special) (cib_ev_token, OC_EV_MEMB_CLASS, 0);
 
             crm_trace("Activating CCM token");
             ret = (*ccm_api_activate) (cib_ev_token, &cib_ev_fd);
             if (ret != 0) {
                 crm_warn("CCM Activation failed");
                 did_fail = TRUE;
             }
         }
 
         if (did_fail) {
             num_ccm_fails++;
             (*ccm_api_unregister) (cib_ev_token);
 
             if (num_ccm_fails < max_ccm_fails) {
                 crm_warn("CCM Connection failed %d times (%d max)", num_ccm_fails, max_ccm_fails);
                 sleep(3);
 
             } else {
                 crm_err("CCM Activation failed %d (max) times", num_ccm_fails);
                 return FALSE;
             }
         }
     }
 
     crm_debug("CCM Activation passed... all set to go!");
     mainloop_add_fd("heartbeat-ccm", G_PRIORITY_MEDIUM, cib_ev_fd, cib_ev_token, &ccm_fd_callbacks);
 
     return TRUE;
 }
 #endif
 
 #if SUPPORT_COROSYNC
 static void
 cib_cs_dispatch(cpg_handle_t handle,
                  const struct cpg_name *groupName,
                  uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
 {
     uint32_t kind = 0;
     xmlNode *xml = NULL;
     const char *from = NULL;
     char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from);
 
     if(data == NULL) {
         return;
     }
     if (kind == crm_class_cluster) {
         xml = string2xml(data);
         if (xml == NULL) {
             crm_err("Invalid XML: '%.120s'", data);
             free(data);
             return;
         }
         crm_xml_add(xml, F_ORIG, from);
         /* crm_xml_add_int(xml, F_SEQ, wrapper->id); */
         cib_peer_callback(xml, NULL);
     }
 
     free_xml(xml);
     free(data);
 }
 
 static void
 cib_cs_destroy(gpointer user_data)
 {
     if (cib_shutdown_flag) {
         crm_info("Corosync disconnection complete");
     } else {
         crm_err("Corosync connection lost!  Exiting.");
-        terminate_cib(__FUNCTION__, TRUE);
+        terminate_cib(__FUNCTION__, -1);
     }
 }
 #endif
 
 static void
 cib_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
 {
-    if ((type == crm_status_processes) && legacy_mode
-        && is_not_set(node->processes, crm_get_cluster_proc())) {
-        uint32_t old = 0;
-
-        if (data) {
-            old = *(const uint32_t *)data;
-        }
+    switch (type) {
+        case crm_status_processes:
+            if (legacy_mode && is_not_set(node->processes, crm_get_cluster_proc())) {
+                uint32_t old = data? *(const uint32_t *)data : 0;
+
+                if ((node->processes ^ old) & crm_proc_cpg) {
+                    crm_info("Attempting to disable legacy mode after %s left the cluster",
+                             node->uname);
+                    legacy_mode = FALSE;
+                }
+            }
+            break;
 
-        if ((node->processes ^ old) & crm_proc_cpg) {
-            crm_info("Attempting to disable legacy mode after %s left the cluster", node->uname);
-            legacy_mode = FALSE;
-        }
-    }
+        case crm_status_uname:
+        case crm_status_rstate:
+        case crm_status_nstate:
+            if (cib_shutdown_flag && (crm_active_peers() < 2)
+                && crm_hash_table_size(client_connections) == 0) {
 
-    if (cib_shutdown_flag && crm_active_peers() < 2 && crm_hash_table_size(client_connections) == 0) {
-        crm_info("No more peers");
-        /* @TODO
-         * terminate_cib() calls crm_cluster_disconnect() which calls
-         * crm_peer_destroy() which destroys the peer caches, which a peer
-         * status callback shouldn't do. For now, there is a workaround in
-         * crm_update_peer_proc(), but CIB should be refactored to avoid
-         * destroying the peer caches here.
-         */
-        terminate_cib(__FUNCTION__, FALSE);
+                crm_info("No more peers");
+                terminate_cib(__FUNCTION__, 1);
+            }
+            break;
     }
 }
 
 #if SUPPORT_HEARTBEAT
 static void
 cib_ha_connection_destroy(gpointer user_data)
 {
     if (cib_shutdown_flag) {
         crm_info("Heartbeat disconnection complete... exiting");
-        terminate_cib(__FUNCTION__, FALSE);
+        terminate_cib(__FUNCTION__, 0);
     } else {
         crm_err("Heartbeat connection lost!  Exiting.");
-        terminate_cib(__FUNCTION__, TRUE);
+        terminate_cib(__FUNCTION__, -1);
     }
 }
 #endif
 
 int
 cib_init(void)
 {
     if (is_openais_cluster()) {
 #if SUPPORT_COROSYNC
         crm_cluster.destroy = cib_cs_destroy;
         crm_cluster.cpg.cpg_deliver_fn = cib_cs_dispatch;
         crm_cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership;
 #endif
     } else if (is_heartbeat_cluster()) {
 #if SUPPORT_HEARTBEAT
         crm_cluster.hb_dispatch = cib_ha_peer_callback;
         crm_cluster.destroy = cib_ha_connection_destroy;
 #endif
     }
 
     config_hash =
         g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str);
 
     if (startCib("cib.xml") == FALSE) {
         crm_crit("Cannot start CIB... terminating");
         crm_exit(ENODATA);
     }
 
     if (stand_alone == FALSE) {
         if (crm_cluster_connect(&crm_cluster) == FALSE) {
             crm_crit("Cannot sign in to the cluster... terminating");
             crm_exit(DAEMON_RESPAWN_STOP);
         }
         cib_our_uname = crm_cluster.uname;
         if (is_openais_cluster()) {
             crm_set_status_callback(&cib_peer_update_callback);
         }
 #if SUPPORT_HEARTBEAT
         if (is_heartbeat_cluster()) {
 
             gboolean was_error = FALSE;
 
             hb_conn = crm_cluster.hb_conn;
             if (was_error == FALSE) {
                 if (HA_OK !=
                     hb_conn->llc_ops->set_cstatus_callback(hb_conn, cib_client_status_callback,
                                                            hb_conn)) {
 
                     crm_err("Cannot set cstatus callback: %s", hb_conn->llc_ops->errmsg(hb_conn));
                     was_error = TRUE;
                 }
             }
 
             if (was_error == FALSE) {
                 was_error = (ccm_connect() == FALSE);
             }
 
             if (was_error == FALSE) {
                 /* Async get client status information in the cluster */
                 crm_info("Requesting the list of configured nodes");
                 hb_conn->llc_ops->client_status(hb_conn, NULL, CRM_SYSTEM_CIB, -1);
             }
         }
 #endif
 
     } else {
         cib_our_uname = strdup("localhost");
     }
 
     cib_ipc_servers_init(&ipcs_ro,
                          &ipcs_rw,
                          &ipcs_shm,
                          &ipc_ro_callbacks,
                          &ipc_rw_callbacks);
 
     if (stand_alone) {
         cib_is_master = TRUE;
     }
 
     /* Create the mainloop and run it... */
     mainloop = g_main_new(FALSE);
     crm_info("Starting %s mainloop", crm_system_name);
-
     g_main_run(mainloop);
+
+    /* If main loop returned, clean up and exit. We disconnect in case
+     * terminate_cib() was called with fast=1.
+     */
+    crm_cluster_disconnect(&crm_cluster);
     cib_ipc_servers_destroy(ipcs_ro, ipcs_rw, ipcs_shm);
 
     return crm_exit(pcmk_ok);
 }
 
 gboolean
 startCib(const char *filename)
 {
     gboolean active = FALSE;
     xmlNode *cib = readCibXmlFile(cib_root, filename, !preserve_status);
 
     CRM_ASSERT(cib != NULL);
 
     if (activateCibXml(cib, TRUE, "start") == 0) {
         int port = 0;
         const char *port_s = NULL;
 
         active = TRUE;
 
         cib_read_config(config_hash, cib);
 
         port_s = crm_element_value(cib, "remote-tls-port");
         if (port_s) {
             port = crm_parse_int(port_s, "0");
             remote_tls_fd = init_remote_listener(port, TRUE);
         }
 
         port_s = crm_element_value(cib, "remote-clear-port");
         if (port_s) {
             port = crm_parse_int(port_s, "0");
             remote_fd = init_remote_listener(port, FALSE);
         }
 
         crm_info("CIB Initialization completed successfully");
     }
 
     return active;
 }
diff --git a/cib/messages.c b/cib/messages.c
index 363562c086..eca63b9986 100644
--- a/cib/messages.c
+++ b/cib/messages.c
@@ -1,569 +1,569 @@
 /*
  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #include <crm_internal.h>
 
 #include <stdio.h>
 #include <unistd.h>
 #include <stdlib.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <time.h>
 
 #include <sys/param.h>
 #include <sys/types.h>
 
 #include <crm/crm.h>
 #include <crm/cib/internal.h>
 #include <crm/msg_xml.h>
 
 #include <crm/common/xml.h>
 #include <crm/common/ipcs.h>
 #include <crm/cluster/internal.h>
 
 #include <cibio.h>
 #include <cibmessages.h>
 #include <callbacks.h>
 
 #define MAX_DIFF_RETRY 5
 
 #ifdef CIBPIPE
 gboolean cib_is_master = TRUE;
 #else
 gboolean cib_is_master = FALSE;
 #endif
 
 xmlNode *the_cib = NULL;
 gboolean syncd_once = FALSE;
 extern const char *cib_our_uname;
 int revision_check(xmlNode * cib_update, xmlNode * cib_copy, int flags);
 int get_revision(xmlNode * xml_obj, int cur_revision);
 
 int updateList(xmlNode * local_cib, xmlNode * update_command, xmlNode * failed,
                int operation, const char *section);
 
 gboolean check_generation(xmlNode * newCib, xmlNode * oldCib);
 
 gboolean update_results(xmlNode * failed, xmlNode * target, const char *operation, int return_code);
 
 int cib_update_counter(xmlNode * xml_obj, const char *field, gboolean reset);
 
 int sync_our_cib(xmlNode * request, gboolean all);
 
 extern xmlNode *cib_msg_copy(const xmlNode * msg, gboolean with_data);
 extern gboolean cib_shutdown_flag;
 
 int
 cib_process_shutdown_req(const char *op, int options, const char *section, xmlNode * req,
                          xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                          xmlNode ** answer)
 {
 #ifdef CIBPIPE
     return -EINVAL;
 #else
     int result = pcmk_ok;
     const char *host = crm_element_value(req, F_ORIG);
 
     *answer = NULL;
 
     if (crm_element_value(req, F_CIB_ISREPLY) == NULL) {
         crm_info("Shutdown REQ from %s", host);
         return pcmk_ok;
 
     } else if (cib_shutdown_flag) {
         crm_info("Shutdown ACK from %s", host);
-        terminate_cib(__FUNCTION__, FALSE);
+        terminate_cib(__FUNCTION__, 0);
         return pcmk_ok;
 
     } else {
         crm_err("Shutdown ACK from %s - not shutting down", host);
         result = -EINVAL;
     }
 
     return result;
 #endif
 }
 
 int
 cib_process_default(const char *op, int options, const char *section, xmlNode * req,
                     xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                     xmlNode ** answer)
 {
     int result = pcmk_ok;
 
     crm_trace("Processing \"%s\" event", op);
     *answer = NULL;
 
     if (op == NULL) {
         result = -EINVAL;
         crm_err("No operation specified");
 
     } else if (strcasecmp(CRM_OP_NOOP, op) == 0) {
         ;
 
     } else {
         result = -EPROTONOSUPPORT;
         crm_err("Action [%s] is not supported by the CIB", op);
     }
     return result;
 }
 
 int
 cib_process_quit(const char *op, int options, const char *section, xmlNode * req, xmlNode * input,
                  xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer)
 {
     int result = pcmk_ok;
 
     crm_trace("Processing \"%s\" event", op);
 
     crm_warn("The CRMd has asked us to exit... complying");
     crm_exit(pcmk_ok);
     return result;
 }
 
 int
 cib_process_readwrite(const char *op, int options, const char *section, xmlNode * req,
                       xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                       xmlNode ** answer)
 {
 #ifdef CIBPIPE
     return -EINVAL;
 #else
     int result = pcmk_ok;
 
     crm_trace("Processing \"%s\" event", op);
 
     if (safe_str_eq(op, CIB_OP_ISMASTER)) {
         if (cib_is_master == TRUE) {
             result = pcmk_ok;
         } else {
             result = -EPERM;
         }
         return result;
     }
 
     if (safe_str_eq(op, CIB_OP_MASTER)) {
         if (cib_is_master == FALSE) {
             crm_info("We are now in R/W mode");
             cib_is_master = TRUE;
             syncd_once = TRUE;
 
         } else {
             crm_debug("We are still in R/W mode");
         }
 
     } else if (cib_is_master) {
         crm_info("We are now in R/O mode");
         cib_is_master = FALSE;
     }
 
     return result;
 #endif
 }
 
 int sync_in_progress = 0;
 void
 send_sync_request(const char *host)
 {
     xmlNode *sync_me = create_xml_node(NULL, "sync-me");
 
     crm_info("Requesting re-sync from peer");
     sync_in_progress++;
 
     crm_xml_add(sync_me, F_TYPE, "cib");
     crm_xml_add(sync_me, F_CIB_OPERATION, CIB_OP_SYNC_ONE);
     crm_xml_add(sync_me, F_CIB_DELEGATED, cib_our_uname);
 
     send_cluster_message(host ? crm_get_peer(0, host) : NULL, crm_msg_cib, sync_me, FALSE);
     free_xml(sync_me);
 }
 
 int
 cib_process_ping(const char *op, int options, const char *section, xmlNode * req, xmlNode * input,
                  xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer)
 {
 #ifdef CIBPIPE
     return -EINVAL;
 #else
     const char *host = crm_element_value(req, F_ORIG);
     const char *seq = crm_element_value(req, F_CIB_PING_ID);
     char *digest = calculate_xml_versioned_digest(the_cib, FALSE, TRUE, CRM_FEATURE_SET);
 
     static struct qb_log_callsite *cs = NULL;
 
     crm_trace("Processing \"%s\" event %s from %s", op, seq, host);
     *answer = create_xml_node(NULL, XML_CRM_TAG_PING);
 
     crm_xml_add(*answer, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
     crm_xml_add(*answer, XML_ATTR_DIGEST, digest);
     crm_xml_add(*answer, F_CIB_PING_ID, seq);
 
     if (cs == NULL) {
         cs = qb_log_callsite_get(__func__, __FILE__, __FUNCTION__, LOG_TRACE, __LINE__, crm_trace_nonlog);
     }
     if (cs && cs->targets) {
         /* Append additional detail so the reciever can log the differences */
         add_message_xml(*answer, F_CIB_CALLDATA, the_cib);
 
     } else {
         /* Always include at least the version details */
         const char *tag = TYPE(the_cib);
         xmlNode *shallow = create_xml_node(NULL, tag);
 
         copy_in_properties(shallow, the_cib);
         add_message_xml(*answer, F_CIB_CALLDATA, shallow);
         free_xml(shallow);
     }
 
     crm_info("Reporting our current digest to %s: %s for %s.%s.%s (%p %d)",
              host, digest,
              crm_element_value(existing_cib, XML_ATTR_GENERATION_ADMIN),
              crm_element_value(existing_cib, XML_ATTR_GENERATION),
              crm_element_value(existing_cib, XML_ATTR_NUMUPDATES),
              existing_cib,
              cs && cs->targets);
 
     free(digest);
 
     return pcmk_ok;
 #endif
 }
 
 int
 cib_process_sync(const char *op, int options, const char *section, xmlNode * req, xmlNode * input,
                  xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer)
 {
 #ifdef CIBPIPE
     return -EINVAL;
 #else
     return sync_our_cib(req, TRUE);
 #endif
 }
 
 int
 cib_process_upgrade_server(const char *op, int options, const char *section, xmlNode * req, xmlNode * input,
                            xmlNode * existing_cib, xmlNode ** result_cib, xmlNode ** answer)
 {
 #ifdef CIBPIPE
     return -EINVAL;
 #else
     int rc = pcmk_ok;
 
     *answer = NULL;
 
     if(crm_element_value(req, F_CIB_SCHEMA_MAX)) {
         return cib_process_upgrade(
             op, options, section, req, input, existing_cib, result_cib, answer);
 
     } else {
         int new_version = 0;
         int current_version = 0;
         xmlNode *scratch = copy_xml(existing_cib);
         const char *host = crm_element_value(req, F_ORIG);
         const char *value = crm_element_value(existing_cib, XML_ATTR_VALIDATION);
 
         crm_trace("Processing \"%s\" event", op);
         if (value != NULL) {
             current_version = get_schema_version(value);
         }
 
         rc = update_validation(&scratch, &new_version, 0, TRUE, TRUE);
         if (new_version > current_version) {
             xmlNode *up = create_xml_node(NULL, __FUNCTION__);
 
             rc = pcmk_ok;
             crm_notice("Upgrade request from %s verified", host);
 
             crm_xml_add(up, F_TYPE, "cib");
             crm_xml_add(up, F_CIB_OPERATION, CIB_OP_UPGRADE);
             crm_xml_add(up, F_CIB_SCHEMA_MAX, get_schema_name(new_version));
             crm_xml_add(up, F_CIB_DELEGATED, host);
             crm_xml_add(up, F_CIB_CLIENTID, crm_element_value(req, F_CIB_CLIENTID));
             crm_xml_add(up, F_CIB_CALLOPTS, crm_element_value(req, F_CIB_CALLOPTS));
             crm_xml_add(up, F_CIB_CALLID, crm_element_value(req, F_CIB_CALLID));
 
             if (cib_legacy_mode() && cib_is_master) {
                 rc = cib_process_upgrade(
                     op, options, section, up, input, existing_cib, result_cib, answer);
 
             } else {
                 send_cluster_message(NULL, crm_msg_cib, up, FALSE);
             }
 
             free_xml(up);
 
         } else if(rc == pcmk_ok) {
             rc = -pcmk_err_schema_unchanged;
         }
 
         free_xml(scratch);
     }
     return rc;
 #endif
 }
 
 int
 cib_process_sync_one(const char *op, int options, const char *section, xmlNode * req,
                      xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                      xmlNode ** answer)
 {
 #ifdef CIBPIPE
     return -EINVAL;
 #else
     return sync_our_cib(req, FALSE);
 #endif
 }
 
 int
 cib_server_process_diff(const char *op, int options, const char *section, xmlNode * req,
                         xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                         xmlNode ** answer)
 {
     int rc = pcmk_ok;
 
     if (cib_is_master) {
         /* the master is never waiting for a resync */
         sync_in_progress = 0;
     }
 
     if (sync_in_progress > MAX_DIFF_RETRY) {
         /* request another full-sync,
          * the last request may have been lost
          */
         sync_in_progress = 0;
     }
 
     if (sync_in_progress) {
         int diff_add_updates = 0;
         int diff_add_epoch = 0;
         int diff_add_admin_epoch = 0;
 
         int diff_del_updates = 0;
         int diff_del_epoch = 0;
         int diff_del_admin_epoch = 0;
 
         cib_diff_version_details(input,
                                  &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates,
                                  &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates);
 
         sync_in_progress++;
         crm_notice("Not applying diff %d.%d.%d -> %d.%d.%d (sync in progress)",
                    diff_del_admin_epoch, diff_del_epoch, diff_del_updates,
                    diff_add_admin_epoch, diff_add_epoch, diff_add_updates);
         return -pcmk_err_diff_resync;
     }
 
     rc = cib_process_diff(op, options, section, req, input, existing_cib, result_cib, answer);
 
     if (rc == -pcmk_err_diff_resync && cib_is_master == FALSE) {
         free_xml(*result_cib);
         *result_cib = NULL;
         send_sync_request(NULL);
 
     } else if (rc == -pcmk_err_diff_resync) {
         rc = -pcmk_err_diff_failed;
         if (options & cib_force_diff) {
             crm_warn("Not requesting full refresh in R/W mode");
         }
     }
 
     return rc;
 }
 
 int
 cib_process_replace_svr(const char *op, int options, const char *section, xmlNode * req,
                         xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                         xmlNode ** answer)
 {
     const char *tag = crm_element_name(input);
     int rc =
         cib_process_replace(op, options, section, req, input, existing_cib, result_cib, answer);
     if (rc == pcmk_ok && safe_str_eq(tag, XML_TAG_CIB)) {
         sync_in_progress = 0;
     }
     return rc;
 }
 
 static int
 delete_cib_object(xmlNode * parent, xmlNode * delete_spec)
 {
     const char *object_name = NULL;
     const char *object_id = NULL;
     xmlNode *equiv_node = NULL;
     int result = pcmk_ok;
 
     if (delete_spec != NULL) {
         object_name = crm_element_name(delete_spec);
     }
     object_id = crm_element_value(delete_spec, XML_ATTR_ID);
 
     crm_trace("Processing: <%s id=%s>", crm_str(object_name), crm_str(object_id));
 
     if (delete_spec == NULL) {
         result = -EINVAL;
 
     } else if (parent == NULL) {
         result = -EINVAL;
 
     } else if (object_id == NULL) {
         /*  placeholder object */
         equiv_node = find_xml_node(parent, object_name, FALSE);
 
     } else {
         equiv_node = find_entity(parent, object_name, object_id);
     }
 
     if (result != pcmk_ok) {
         ;                       /* nothing */
 
     } else if (equiv_node == NULL) {
         result = pcmk_ok;
 
     } else if (xml_has_children(delete_spec) == FALSE) {
         /*  only leaves are deleted */
         crm_debug("Removing leaf: <%s id=%s>", crm_str(object_name), crm_str(object_id));
         free_xml(equiv_node);
         equiv_node = NULL;
 
     } else {
         xmlNode *child = NULL;
 
         for (child = __xml_first_child(delete_spec); child != NULL; child = __xml_next(child)) {
             int tmp_result = delete_cib_object(equiv_node, child);
 
             /*  only the first error is likely to be interesting */
             if (tmp_result != pcmk_ok && result == pcmk_ok) {
                 result = tmp_result;
             }
         }
     }
 
     return result;
 }
 
 int
 cib_process_delete_absolute(const char *op, int options, const char *section, xmlNode * req,
                             xmlNode * input, xmlNode * existing_cib, xmlNode ** result_cib,
                             xmlNode ** answer)
 {
     xmlNode *failed = NULL;
     int result = pcmk_ok;
     xmlNode *update_section = NULL;
 
     crm_trace("Processing \"%s\" event for section=%s", op, crm_str(section));
     if (safe_str_eq(XML_CIB_TAG_SECTION_ALL, section)) {
         section = NULL;
 
     } else if (safe_str_eq(XML_TAG_CIB, section)) {
         section = NULL;
 
     } else if (safe_str_eq(crm_element_name(input), XML_TAG_CIB)) {
         section = NULL;
     }
 
     CRM_CHECK(strcasecmp(CIB_OP_DELETE, op) == 0, return -EINVAL);
 
     if (input == NULL) {
         crm_err("Cannot perform modification with no data");
         return -EINVAL;
     }
 
     failed = create_xml_node(NULL, XML_TAG_FAILED);
 
     update_section = get_object_root(section, *result_cib);
     result = delete_cib_object(update_section, input);
     update_results(failed, input, op, result);
 
     if (xml_has_children(failed)) {
         CRM_CHECK(result != pcmk_ok, result = -EINVAL);
     }
 
     if (result != pcmk_ok) {
         crm_log_xml_err(failed, "CIB Update failures");
         *answer = failed;
 
     } else {
         free_xml(failed);
     }
 
     return result;
 }
 
 gboolean
 check_generation(xmlNode * newCib, xmlNode * oldCib)
 {
     if (cib_compare_generation(newCib, oldCib) >= 0) {
         return TRUE;
     }
 
     crm_warn("Generation from update is older than the existing one");
     return FALSE;
 }
 
 #ifndef CIBPIPE
 int
 sync_our_cib(xmlNode * request, gboolean all)
 {
     int result = pcmk_ok;
     char *digest = NULL;
     const char *host = crm_element_value(request, F_ORIG);
     const char *op = crm_element_value(request, F_CIB_OPERATION);
 
     xmlNode *replace_request = cib_msg_copy(request, FALSE);
 
     CRM_CHECK(the_cib != NULL,;);
     CRM_CHECK(replace_request != NULL,;);
 
     crm_debug("Syncing CIB to %s", all ? "all peers" : host);
     if (all == FALSE && host == NULL) {
         crm_log_xml_err(request, "bad sync");
     }
 
     /* remove the "all == FALSE" condition
      *
      * sync_from was failing, the local client wasnt being notified
      *    because it didnt know it was a reply
      * setting this does not prevent the other nodes from applying it
      *    if all == TRUE
      */
     if (host != NULL) {
         crm_xml_add(replace_request, F_CIB_ISREPLY, host);
     }
     if (all) {
         xml_remove_prop(replace_request, F_CIB_HOST);
     }
 
     crm_xml_add(replace_request, F_CIB_OPERATION, CIB_OP_REPLACE);
     crm_xml_add(replace_request, "original_" F_CIB_OPERATION, op);
     crm_xml_add(replace_request, F_CIB_GLOBAL_UPDATE, XML_BOOLEAN_TRUE);
 
     crm_xml_add(replace_request, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
     digest = calculate_xml_versioned_digest(the_cib, FALSE, TRUE, CRM_FEATURE_SET);
     crm_xml_add(replace_request, XML_ATTR_DIGEST, digest);
 
     add_message_xml(replace_request, F_CIB_CALLDATA, the_cib);
 
     if (send_cluster_message
         (all ? NULL : crm_get_peer(0, host), crm_msg_cib, replace_request, FALSE) == FALSE) {
         result = -ENOTCONN;
     }
     free_xml(replace_request);
     free(digest);
     return result;
 }
 #endif
diff --git a/cts/CTStests.py b/cts/CTStests.py
index f81700448e..4d9bd8aa97 100644
--- a/cts/CTStests.py
+++ b/cts/CTStests.py
@@ -1,3225 +1,3225 @@
 '''CTS: Cluster Testing System: Tests module
 
 There are a few things we want to do here:
 
  '''
 
 __copyright__ = '''
 Copyright (C) 2000, 2001 Alan Robertson <alanr@unix.sh>
 Licensed under the GNU GPL.
 
 Add RecourceRecover testcase Zhao Kai <zhaokai@cn.ibm.com>
 '''
 
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public License
 # as published by the Free Software Foundation; either version 2
 # of the License, or (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, write to the Free Software
 # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA.
 
 #
 #        SPECIAL NOTE:
 #
 #        Tests may NOT implement any cluster-manager-specific code in them.
 #        EXTEND the ClusterManager object to provide the base capabilities
 #        the test needs if you need to do something that the current CM classes
 #        do not.  Otherwise you screw up the whole point of the object structure
 #        in CTS.
 #
 #                Thank you.
 #
 
 import time, os, re, string, tempfile
 from stat import *
 from cts import CTS
 from cts.CTSaudits import *
 from cts.CTSvars   import *
 from cts.patterns  import PatternSelector
 from cts.logging   import LogFactory
 from cts.remote    import RemoteFactory
 from cts.watcher   import LogWatcher
 from cts.environment import EnvFactory
 
 AllTestClasses = [ ]
 
 
 class CTSTest:
     '''
     A Cluster test.
     We implement the basic set of properties and behaviors for a generic
     cluster test.
 
     Cluster tests track their own statistics.
     We keep each of the kinds of counts we track as separate {name,value}
     pairs.
     '''
 
     def __init__(self, cm):
         #self.name="the unnamed test"
         self.Stats = {"calls":0
         ,        "success":0
         ,        "failure":0
         ,        "skipped":0
         ,        "auditfail":0}
 
 #        if not issubclass(cm.__class__, ClusterManager):
 #            raise ValueError("Must be a ClusterManager object")
         self.CM = cm
         self.Env = EnvFactory().getInstance()
         self.rsh = RemoteFactory().getInstance()
         self.logger = LogFactory()
         self.templates = PatternSelector(cm["Name"])
         self.Audits = []
         self.timeout = 120
         self.passed = 1
         self.is_loop = 0
         self.is_unsafe = 0
         self.is_docker_unsafe = 0
         self.is_experimental = 0
         self.is_container = 0
         self.is_valgrind = 0
         self.benchmark = 0  # which tests to benchmark
         self.timer = {}  # timers
 
     def log(self, args):
         self.logger.log(args)
 
     def debug(self, args):
         self.logger.debug(args)
 
     def has_key(self, key):
         return self.Stats.has_key(key)
 
     def __setitem__(self, key, value):
         self.Stats[key] = value
 
     def __getitem__(self, key):
         return self.Stats[key]
 
     def log_mark(self, msg):
         self.debug("MARK: test %s %s %d" % (self.name,msg,time.time()))
         return
 
     def get_timer(self,key = "test"):
         try: return self.timer[key]
         except: return 0
 
     def set_timer(self,key = "test"):
         self.timer[key] = time.time()
         return self.timer[key]
 
     def log_timer(self,key = "test"):
         elapsed = 0
         if key in self.timer:
             elapsed = time.time() - self.timer[key]
             s = key == "test" and self.name or "%s:%s" % (self.name,key)
             self.debug("%s runtime: %.2f" % (s, elapsed))
             del self.timer[key]
         return elapsed
 
     def incr(self, name):
         '''Increment (or initialize) the value associated with the given name'''
         if not self.Stats.has_key(name):
             self.Stats[name] = 0
         self.Stats[name] = self.Stats[name]+1
 
         # Reset the test passed boolean
         if name == "calls":
             self.passed = 1
 
     def failure(self, reason="none"):
         '''Increment the failure count'''
         self.passed = 0
         self.incr("failure")
         self.logger.log(("Test %s" % self.name).ljust(35) + " FAILED: %s" % reason)
         return None
 
     def success(self):
         '''Increment the success count'''
         self.incr("success")
         return 1
 
     def skipped(self):
         '''Increment the skipped count'''
         self.incr("skipped")
         return 1
 
     def __call__(self, node):
         '''Perform the given test'''
         raise ValueError("Abstract Class member (__call__)")
         self.incr("calls")
         return self.failure()
 
     def audit(self):
         passed = 1
         if len(self.Audits) > 0:
             for audit in self.Audits:
                 if not audit():
                     self.logger.log("Internal %s Audit %s FAILED." % (self.name, audit.name()))
                     self.incr("auditfail")
                     passed = 0
         return passed
 
     def setup(self, node):
         '''Setup the given test'''
         return self.success()
 
     def teardown(self, node):
         '''Tear down the given test'''
         return self.success()
 
     def create_watch(self, patterns, timeout, name=None):
         if not name:
             name = self.name
         return LogWatcher(self.Env["LogFileName"], patterns, name, timeout, kind=self.Env["LogWatcher"], hosts=self.Env["nodes"])
 
     def local_badnews(self, prefix, watch, local_ignore=[]):
         errcount = 0
         if not prefix:
             prefix = "LocalBadNews:"
 
         ignorelist = []
         ignorelist.append(" CTS: ")
         ignorelist.append(prefix)
         ignorelist.extend(local_ignore)
 
         while errcount < 100:
             match = watch.look(0)
             if match:
                add_err = 1
                for ignore in ignorelist:
                    if add_err == 1 and re.search(ignore, match):
                        add_err = 0
                if add_err == 1:
                    self.logger.log(prefix + " " + match)
                    errcount = errcount + 1
             else:
               break
         else:
             self.logger.log("Too many errors!")
 
         watch.end()
         return errcount
 
     def is_applicable(self):
         return self.is_applicable_common()
 
     def is_applicable_common(self):
         '''Return TRUE if we are applicable in the current test configuration'''
         #raise ValueError("Abstract Class member (is_applicable)")
 
         if self.is_loop and not self.Env["loop-tests"]:
             return 0
         elif self.is_unsafe and not self.Env["unsafe-tests"]:
             return 0
         elif self.is_valgrind and not self.Env["valgrind-tests"]:
             return 0
         elif self.is_experimental and not self.Env["experimental-tests"]:
             return 0
         elif self.is_docker_unsafe and self.Env["docker"]:
             return 0
         elif self.is_container and not self.Env["container-tests"]:
             return 0
         elif self.Env["benchmark"] and self.benchmark == 0:
             return 0
 
         return 1
 
     def find_ocfs2_resources(self, node):
         self.r_o2cb = None
         self.r_ocfs2 = []
 
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self.CM, line)
                 if r.rtype == "o2cb" and r.parent != "NA":
                     self.debug("Found o2cb: %s" % self.r_o2cb)
                     self.r_o2cb = r.parent
             if re.search("^Constraint", line):
                 c = AuditConstraint(self.CM, line)
                 if c.type == "rsc_colocation" and c.target == self.r_o2cb:
                     self.r_ocfs2.append(c.rsc)
 
         self.debug("Found ocfs2 filesystems: %s" % repr(self.r_ocfs2))
         return len(self.r_ocfs2)
 
     def canrunnow(self, node):
         '''Return TRUE if we can meaningfully run right now'''
         return 1
 
     def errorstoignore(self):
         '''Return list of errors which are 'normal' and should be ignored'''
         return []
 
 
 class StopTest(CTSTest):
     '''Stop (deactivate) the cluster manager on a node'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Stop"
 
     def __call__(self, node):
         '''Perform the 'stop' test. '''
         self.incr("calls")
         if self.CM.ShouldBeStatus[node] != "up":
             return self.skipped()
 
         patterns = []
         # Technically we should always be able to notice ourselves stopping
         patterns.append(self.templates["Pat:We_stopped"] % node)
 
         #if self.Env["use_logd"]:
         #    patterns.append(self.templates["Pat:Logd_stopped"] % node)
 
         # Any active node needs to notice this one left
         # NOTE: This wont work if we have multiple partitions
         for other in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[other] == "up" and other != node:
                 patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
                 #self.debug("Checking %s will notice %s left"%(other, node))
 
         watch = self.create_watch(patterns, self.Env["DeadTime"])
         watch.setwatch()
 
         if node == self.CM.OurNode:
             self.incr("us")
         else:
             if self.CM.upcount() <= 1:
                 self.incr("all")
             else:
                 self.incr("them")
 
         self.CM.StopaCM(node)
         watch_result = watch.lookforall()
 
         failreason = None
         UnmatchedList = "||"
         if watch.unmatched:
             (rc, output) = self.rsh(node, "/bin/ps axf", None)
             for line in output:
                 self.debug(line)
 
             (rc, output) = self.rsh(node, "/usr/sbin/dlm_tool dump", None)
             for line in output:
                 self.debug(line)
 
             for regex in watch.unmatched:
                 self.logger.log ("ERROR: Shutdown pattern not found: %s" % (regex))
                 UnmatchedList +=  regex + "||";
                 failreason = "Missing shutdown pattern"
 
         self.CM.cluster_stable(self.Env["DeadTime"])
 
         if not watch.unmatched or self.CM.upcount() == 0:
             return self.success()
 
         if len(watch.unmatched) >= self.CM.upcount():
             return self.failure("no match against (%s)" % UnmatchedList)
 
         if failreason == None:
             return self.success()
         else:
             return self.failure(failreason)
 #
 # We don't register StopTest because it's better when called by
 # another test...
 #
 
 
 class StartTest(CTSTest):
     '''Start (activate) the cluster manager on a node'''
     def __init__(self, cm, debug=None):
         CTSTest.__init__(self,cm)
         self.name = "start"
         self.debug = debug
 
     def __call__(self, node):
         '''Perform the 'start' test. '''
         self.incr("calls")
 
         if self.CM.upcount() == 0:
             self.incr("us")
         else:
             self.incr("them")
 
         if self.CM.ShouldBeStatus[node] != "down":
             return self.skipped()
         elif self.CM.StartaCM(node):
             return self.success()
         else:
             return self.failure("Startup %s on node %s failed"
                                 % (self.Env["Name"], node))
 
 #
 # We don't register StartTest because it's better when called by
 # another test...
 #
 
 
 class FlipTest(CTSTest):
     '''If it's running, stop it.  If it's stopped start it.
        Overthrow the status quo...
     '''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Flip"
         self.start = StartTest(cm)
         self.stop = StopTest(cm)
 
     def __call__(self, node):
         '''Perform the 'Flip' test. '''
         self.incr("calls")
         if self.CM.ShouldBeStatus[node] == "up":
             self.incr("stopped")
             ret = self.stop(node)
             type = "up->down"
             # Give the cluster time to recognize it's gone...
             time.sleep(self.Env["StableTime"])
         elif self.CM.ShouldBeStatus[node] == "down":
             self.incr("started")
             ret = self.start(node)
             type = "down->up"
         else:
             return self.skipped()
 
         self.incr(type)
         if ret:
             return self.success()
         else:
             return self.failure("%s failure" % type)
 
 #        Register FlipTest as a good test to run
 AllTestClasses.append(FlipTest)
 
 
 class RestartTest(CTSTest):
     '''Stop and restart a node'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Restart"
         self.start = StartTest(cm)
         self.stop = StopTest(cm)
         self.benchmark = 1
 
     def __call__(self, node):
         '''Perform the 'restart' test. '''
         self.incr("calls")
 
         self.incr("node:" + node)
 
         ret1 = 1
         if self.CM.StataCM(node):
             self.incr("WasStopped")
             if not self.start(node):
                 return self.failure("start (setup) failure: "+node)
 
         self.set_timer()
         if not self.stop(node):
             return self.failure("stop failure: "+node)
         if not self.start(node):
             return self.failure("start failure: "+node)
         return self.success()
 
 #        Register RestartTest as a good test to run
 AllTestClasses.append(RestartTest)
 
 
 class StonithdTest(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Stonithd"
         self.startall = SimulStartLite(cm)
         self.benchmark = 1
 
     def __call__(self, node):
         self.incr("calls")
         if len(self.Env["nodes"]) < 2:
             return self.skipped()
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         is_dc = self.CM.is_node_dc(node)
 
         watchpats = []
         watchpats.append(self.templates["Pat:FenceOpOK"] % node)
         watchpats.append(self.templates["Pat:NodeFenced"] % node)
 
         if self.Env["at-boot"] == 0:
             self.debug("Expecting %s to stay down" % node)
             self.CM.ShouldBeStatus[node] = "down"
         else:
             self.debug("Expecting %s to come up again %d" % (node, self.Env["at-boot"]))
             watchpats.append("%s.* S_STARTING -> S_PENDING" % node)
             watchpats.append("%s.* S_PENDING -> S_NOT_DC" % node)
 
         watch = self.create_watch(watchpats, 30 + self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
         watch.setwatch()
 
         origin = self.Env.RandomGen.choice(self.Env["nodes"])
 
         rc = self.rsh(origin, "stonith_admin --reboot %s -VVVVVV" % node)
 
         if rc == 194:
             # 194 - 256 = -62 = Timer expired
             #
             # Look for the patterns, usually this means the required
             # device was running on the node to be fenced - or that
             # the required devices were in the process of being loaded
             # and/or moved
             #
             # Effectively the node committed suicide so there will be
             # no confirmation, but pacemaker should be watching and
             # fence the node again
 
             self.logger.log("Fencing command on %s to fence %s timed out" % (origin, node))
 
         elif origin != node and rc != 0:
             self.debug("Waiting for the cluster to recover")
             self.CM.cluster_stable()
 
             self.debug("Waiting STONITHd node to come back up")
             self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
 
             self.logger.log("Fencing command on %s failed to fence %s (rc=%d)" % (origin, node, rc))
 
         elif origin == node and rc != 255:
             # 255 == broken pipe, ie. the node was fenced as expected
             self.logger.log("Locally originated fencing returned %d" % rc)
 
         self.set_timer("fence")
         matched = watch.lookforall()
         self.log_timer("fence")
         self.set_timer("reform")
         if watch.unmatched:
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         self.debug("Waiting STONITHd node to come back up")
         self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self.CM.cluster_stable(self.Env["StartTime"])
 
         if not matched:
             return self.failure("Didn't find all expected patterns")
         elif not is_stable:
             return self.failure("Cluster did not become stable")
 
         self.log_timer("reform")
         return self.success()
 
     def errorstoignore(self):
         return [
             self.templates["Pat:Fencing_start"] % ".*",
             self.templates["Pat:Fencing_ok"] % ".*",
             r"error.*: Resource .*stonith::.* is active on 2 nodes attempting recovery",
             r"error.*: Operation reboot of .*by .* for stonith_admin.*: Timer expired",
         ]
 
     def is_applicable(self):
         if not self.is_applicable_common():
             return 0
 
         if self.Env.has_key("DoFencing"):
             return self.Env["DoFencing"]
 
         return 1
 
 AllTestClasses.append(StonithdTest)
 
 
 class StartOnebyOne(CTSTest):
     '''Start all the nodes ~ one by one'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "StartOnebyOne"
         self.stopall = SimulStopLite(cm)
         self.start = StartTest(cm)
         self.ns = CTS.NodeStatus(cm.Env)
 
     def __call__(self, dummy):
         '''Perform the 'StartOnebyOne' test. '''
         self.incr("calls")
 
         #        We ignore the "node" parameter...
 
         #        Shut down all the nodes...
         ret = self.stopall(None)
         if not ret:
             return self.failure("Test setup failed")
 
         failed = []
         self.set_timer()
         for node in self.Env["nodes"]:
             if not self.start(node):
                 failed.append(node)
 
         if len(failed) > 0:
             return self.failure("Some node failed to start: " + repr(failed))
 
         return self.success()
 
 #        Register StartOnebyOne as a good test to run
 AllTestClasses.append(StartOnebyOne)
 
 
 class SimulStart(CTSTest):
     '''Start all the nodes ~ simultaneously'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SimulStart"
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
 
     def __call__(self, dummy):
         '''Perform the 'SimulStart' test. '''
         self.incr("calls")
 
         #        We ignore the "node" parameter...
 
         #        Shut down all the nodes...
         ret = self.stopall(None)
         if not ret:
             return self.failure("Setup failed")
 
         self.CM.clear_all_caches()
 
         if not self.startall(None):
             return self.failure("Startall failed")
 
         return self.success()
 
 #        Register SimulStart as a good test to run
 AllTestClasses.append(SimulStart)
 
 
 class SimulStop(CTSTest):
     '''Stop all the nodes ~ simultaneously'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SimulStop"
         self.startall = SimulStartLite(cm)
         self.stopall = SimulStopLite(cm)
 
     def __call__(self, dummy):
         '''Perform the 'SimulStop' test. '''
         self.incr("calls")
 
         #     We ignore the "node" parameter...
 
         #     Start up all the nodes...
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         if not self.stopall(None):
             return self.failure("Stopall failed")
 
         return self.success()
 
 #     Register SimulStop as a good test to run
 AllTestClasses.append(SimulStop)
 
 
 class StopOnebyOne(CTSTest):
     '''Stop all the nodes in order'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "StopOnebyOne"
         self.startall = SimulStartLite(cm)
         self.stop = StopTest(cm)
 
     def __call__(self, dummy):
         '''Perform the 'StopOnebyOne' test. '''
         self.incr("calls")
 
         #     We ignore the "node" parameter...
 
         #     Start up all the nodes...
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         failed = []
         self.set_timer()
         for node in self.Env["nodes"]:
             if not self.stop(node):
                 failed.append(node)
 
         if len(failed) > 0:
             return self.failure("Some node failed to stop: " + repr(failed))
 
         self.CM.clear_all_caches()
         return self.success()
 
 #     Register StopOnebyOne as a good test to run
 AllTestClasses.append(StopOnebyOne)
 
 
 class RestartOnebyOne(CTSTest):
     '''Restart all the nodes in order'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RestartOnebyOne"
         self.startall = SimulStartLite(cm)
 
     def __call__(self, dummy):
         '''Perform the 'RestartOnebyOne' test. '''
         self.incr("calls")
 
         #     We ignore the "node" parameter...
 
         #     Start up all the nodes...
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         did_fail = []
         self.set_timer()
         self.restart = RestartTest(self.CM)
         for node in self.Env["nodes"]:
             if not self.restart(node):
                 did_fail.append(node)
 
         if did_fail:
             return self.failure("Could not restart %d nodes: %s"
                                 % (len(did_fail), repr(did_fail)))
         return self.success()
 
 #     Register StopOnebyOne as a good test to run
 AllTestClasses.append(RestartOnebyOne)
 
 
 class PartialStart(CTSTest):
     '''Start a node - but tell it to stop before it finishes starting up'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "PartialStart"
         self.startall = SimulStartLite(cm)
         self.stopall = SimulStopLite(cm)
         self.stop = StopTest(cm)
         #self.is_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'PartialStart' test. '''
         self.incr("calls")
 
         ret = self.stopall(None)
         if not ret:
             return self.failure("Setup failed")
 
 #   FIXME!  This should use the CM class to get the pattern
 #       then it would be applicable in general
         watchpats = []
         watchpats.append("crmd.*Connecting to cluster infrastructure")
         watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
         watch.setwatch()
 
         self.CM.StartaCMnoBlock(node)
         ret = watch.lookforall()
         if not ret:
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
             return self.failure("Setup of %s failed" % node)
 
         ret = self.stop(node)
         if not ret:
             return self.failure("%s did not stop in time" % node)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
 
         # We might do some fencing in the 2-node case if we make it up far enough
         return [
             """Executing reboot fencing operation""",
         ]
 
 #     Register StopOnebyOne as a good test to run
 AllTestClasses.append(PartialStart)
 
 
 class StandbyTest(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Standby"
         self.benchmark = 1
 
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
 
     # make sure the node is active
     # set the node to standby mode
     # check resources, none resource should be running on the node
     # set the node to active mode
     # check resouces, resources should have been migrated back (SHOULD THEY?)
 
     def __call__(self, node):
 
         self.incr("calls")
         ret = self.startall(None)
         if not ret:
             return self.failure("Start all nodes failed")
 
         self.debug("Make sure node %s is active" % node)
         if self.CM.StandbyStatus(node) != "off":
             if not self.CM.SetStandbyMode(node, "off"):
                 return self.failure("can't set node %s to active mode" % node)
 
         self.CM.cluster_stable()
 
         status = self.CM.StandbyStatus(node)
         if status != "off":
             return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
 
         self.debug("Getting resources running on node %s" % node)
         rsc_on_node = self.CM.active_resources(node)
 
         watchpats = []
         watchpats.append(r"State transition .* -> S_POLICY_ENGINE")
         watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
         watch.setwatch()
 
         self.debug("Setting node %s to standby mode" % node)
         if not self.CM.SetStandbyMode(node, "on"):
             return self.failure("can't set node %s to standby mode" % node)
 
         self.set_timer("on")
 
         ret = watch.lookforall()
         if not ret:
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
             self.CM.SetStandbyMode(node, "off")
             return self.failure("cluster didn't react to standby change on %s" % node)
 
         self.CM.cluster_stable()
 
         status = self.CM.StandbyStatus(node)
         if status != "on":
             return self.failure("standby status of %s is [%s] but we expect [on]" % (node, status))
         self.log_timer("on")
 
         self.debug("Checking resources")
         bad_run = self.CM.active_resources(node)
         if len(bad_run) > 0:
             rc = self.failure("%s set to standby, %s is still running on it" % (node, repr(bad_run)))
             self.debug("Setting node %s to active mode" % node)
             self.CM.SetStandbyMode(node, "off")
             return rc
 
         self.debug("Setting node %s to active mode" % node)
         if not self.CM.SetStandbyMode(node, "off"):
             return self.failure("can't set node %s to active mode" % node)
 
         self.set_timer("off")
         self.CM.cluster_stable()
 
         status = self.CM.StandbyStatus(node)
         if status != "off":
             return self.failure("standby status of %s is [%s] but we expect [off]" % (node, status))
         self.log_timer("off")
 
         return self.success()
 
 AllTestClasses.append(StandbyTest)
 
 
 class ValgrindTest(CTSTest):
     '''Check for memory leaks'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Valgrind"
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
         self.is_valgrind = 1
         self.is_loop = 1
 
     def setup(self, node):
         self.incr("calls")
 
         ret = self.stopall(None)
         if not ret:
             return self.failure("Stop all nodes failed")
 
         # Enable valgrind
         self.logger.logPat = "/tmp/%s-*.valgrind" % self.name
 
         self.Env["valgrind-prefix"] = self.name
 
         self.rsh(node, "rm -f %s" % self.logger.logPat, None)
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Start all nodes failed")
 
         for node in self.Env["nodes"]:
             (rc, output) = self.rsh(node, "ps u --ppid `pidofproc aisexec`", None)
             for line in output:
                 self.debug(line)
 
         return self.success()
 
     def teardown(self, node):
         # Disable valgrind
         self.Env["valgrind-prefix"] = None
 
         # Return all nodes to normal
         ret = self.stopall(None)
         if not ret:
             return self.failure("Stop all nodes failed")
 
         return self.success()
 
     def find_leaks(self):
         # Check for leaks
         leaked = []
         self.stop = StopTest(self.CM)
 
         for node in self.Env["nodes"]:
             (rc, ps_out) = self.rsh(node, "ps u --ppid `pidofproc aisexec`", None)
             rc = self.stop(node)
             if not rc:
                 self.failure("Couldn't shut down %s" % node)
 
             rc = self.rsh(node, "grep -e indirectly.*lost:.*[1-9] -e definitely.*lost:.*[1-9] -e (ERROR|error).*SUMMARY:.*[1-9].*errors %s" % self.logger.logPat, 0)
             if rc != 1:
                 leaked.append(node)
                 self.failure("Valgrind errors detected on %s" % node)
                 for line in ps_out:
                     self.logger.log(line)
                 (rc, output) = self.rsh(node, "grep -e lost: -e SUMMARY: %s" % self.logger.logPat, None)
                 for line in output:
                     self.logger.log(line)
                 (rc, output) = self.rsh(node, "cat %s" % self.logger.logPat, None)
                 for line in output:
                     self.debug(line)
 
         self.rsh(node, "rm -f %s" % self.logger.logPat, None)
         return leaked
 
     def __call__(self, node):
         leaked = self.find_leaks()
         if len(leaked) > 0:
             return self.failure("Nodes %s leaked" % repr(leaked))
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"cib.*: \*\*\*\*\*\*\*\*\*\*\*\*\*",
             r"cib.*: .* avoid confusing Valgrind",
             r"HA_VALGRIND_ENABLED",
         ]
 
 
 class StandbyLoopTest(ValgrindTest):
     '''Check for memory leaks by putting a node in and out of standby for an hour'''
     def __init__(self, cm):
         ValgrindTest.__init__(self,cm)
         self.name = "StandbyLoop"
 
     def __call__(self, node):
 
         lpc = 0
         delay = 2
         failed = 0
         done = time.time() + self.Env["loop-minutes"] * 60
         while time.time() <= done and not failed:
             lpc = lpc + 1
 
             time.sleep(delay)
             if not self.CM.SetStandbyMode(node, "on"):
                 self.failure("can't set node %s to standby mode" % node)
                 failed = lpc
 
             time.sleep(delay)
             if not self.CM.SetStandbyMode(node, "off"):
                 self.failure("can't set node %s to active mode" % node)
                 failed = lpc
 
         leaked = self.find_leaks()
         if failed:
             return self.failure("Iteration %d failed" % failed)
         elif len(leaked) > 0:
             return self.failure("Nodes %s leaked" % repr(leaked))
 
         return self.success()
 
 AllTestClasses.append(StandbyLoopTest)
 
 
 class BandwidthTest(CTSTest):
 #        Tests should not be cluster-manager-specific
 #        If you need to find out cluster manager configuration to do this, then
 #        it should be added to the generic cluster manager API.
     '''Test the bandwidth which heartbeat uses'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "Bandwidth"
         self.start = StartTest(cm)
         self.__setitem__("min",0)
         self.__setitem__("max",0)
         self.__setitem__("totalbandwidth",0)
         self.tempfile = tempfile.mktemp(".cts")
         self.startall = SimulStartLite(cm)
 
     def __call__(self, node):
         '''Perform the Bandwidth test'''
         self.incr("calls")
 
         if self.CM.upcount() < 1:
             return self.skipped()
 
         Path = self.CM.InternalCommConfig()
         if "ip" not in Path["mediatype"]:
              return self.skipped()
 
         port = Path["port"][0]
         port = int(port)
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Test setup failed")
         time.sleep(5)  # We get extra messages right after startup.
 
         fstmpfile = "/var/run/band_estimate"
         dumpcmd = "tcpdump -p -n -c 102 -i any udp port %d > %s 2>&1" \
         %                (port, fstmpfile)
 
         rc = self.rsh(node, dumpcmd)
         if rc == 0:
             farfile = "root@%s:%s" % (node, fstmpfile)
             self.rsh.cp(farfile, self.tempfile)
             Bandwidth = self.countbandwidth(self.tempfile)
             if not Bandwidth:
                 self.logger.log("Could not compute bandwidth.")
                 return self.success()
             intband = int(Bandwidth + 0.5)
             self.logger.log("...bandwidth: %d bits/sec" % intband)
             self.Stats["totalbandwidth"] = self.Stats["totalbandwidth"] + Bandwidth
             if self.Stats["min"] == 0:
                 self.Stats["min"] = Bandwidth
             if Bandwidth > self.Stats["max"]:
                 self.Stats["max"] = Bandwidth
             if Bandwidth < self.Stats["min"]:
                 self.Stats["min"] = Bandwidth
             self.rsh(node, "rm -f %s" % fstmpfile)
             os.unlink(self.tempfile)
             return self.success()
         else:
             return self.failure("no response from tcpdump command [%d]!" % rc)
 
     def countbandwidth(self, file):
         fp = open(file, "r")
         fp.seek(0)
         count = 0
         sum = 0
         while 1:
             line = fp.readline()
             if not line:
                 return None
             if re.search("udp",line) or re.search("UDP,", line):
                 count = count + 1
                 linesplit = string.split(line," ")
                 for j in range(len(linesplit)-1):
                     if linesplit[j] == "udp": break
                     if linesplit[j] == "length:": break
 
                 try:
                     sum = sum + int(linesplit[j+1])
                 except ValueError:
                     self.logger.log("Invalid tcpdump line: %s" % line)
                     return None
                 T1 = linesplit[0]
                 timesplit = string.split(T1,":")
                 time2split = string.split(timesplit[2],".")
                 time1 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001
                 break
 
         while count < 100:
             line = fp.readline()
             if not line:
                 return None
             if re.search("udp",line) or re.search("UDP,", line):
                 count = count+1
                 linessplit = string.split(line," ")
                 for j in range(len(linessplit)-1):
                     if linessplit[j] == "udp": break
                     if linesplit[j] == "length:": break
                 try:
                     sum = int(linessplit[j+1]) + sum
                 except ValueError:
                     self.logger.log("Invalid tcpdump line: %s" % line)
                     return None
 
         T2 = linessplit[0]
         timesplit = string.split(T2,":")
         time2split = string.split(timesplit[2],".")
         time2 = (long(timesplit[0])*60+long(timesplit[1]))*60+long(time2split[0])+long(time2split[1])*0.000001
         time = time2-time1
         if (time <= 0):
             return 0
         return (sum*8)/time
 
     def is_applicable(self):
         '''BandwidthTest never applicable'''
         return 0
 
 AllTestClasses.append(BandwidthTest)
 
 
 ###################################################################
 class MaintenanceMode(CTSTest):
 ###################################################################
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "MaintenanceMode"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.max = 30
         #self.is_unsafe = 1
         self.benchmark = 1
         self.action = "asyncmon"
         self.interval = 0
         self.rid = "maintenanceDummy"
 
     def toggleMaintenanceMode(self, node, action):
         pats = []
         pats.append(self.templates["Pat:DC_IDLE"])
 
         # fail the resource right after turning Maintenance mode on
         # verify it is not recovered until maintenance mode is turned off
         if action == "On":
-            pats.append("pengine.*: warning:.* Processing failed op %s for %s on" % (self.action, self.rid))
+            pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for %s on" % (self.action, self.rid))
         else:
             pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0"))
             pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0"))
 
         watch = self.create_watch(pats, 60)
         watch.setwatch()
 
         self.debug("Turning maintenance mode %s" % action)
         self.rsh(node, self.templates["MaintenanceMode%s" % (action)])
         if (action == "On"):
             self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
 
         self.set_timer("recover%s" % (action))
         watch.lookforall()
         self.log_timer("recover%s" % (action))
         if watch.unmatched:
             self.debug("Failed to find patterns when turning maintenance mode %s" % action)
             return repr(watch.unmatched)
 
         return ""
 
     def insertMaintenanceDummy(self, node):
         pats = []
         pats.append(("%s.*" % node) + (self.templates["Pat:RscOpOK"] % (self.rid, "start_0")))
 
         watch = self.create_watch(pats, 60)
         watch.setwatch()
 
         self.CM.AddDummyRsc(node, self.rid)
 
         self.set_timer("addDummy")
         watch.lookforall()
         self.log_timer("addDummy")
 
         if watch.unmatched:
             self.debug("Failed to find patterns when adding maintenance dummy resource")
             return repr(watch.unmatched)
         return ""
 
     def removeMaintenanceDummy(self, node):
         pats = []
         pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0"))
 
         watch = self.create_watch(pats, 60)
         watch.setwatch()
         self.CM.RemoveDummyRsc(node, self.rid)
 
         self.set_timer("removeDummy")
         watch.lookforall()
         self.log_timer("removeDummy")
 
         if watch.unmatched:
             self.debug("Failed to find patterns when removing maintenance dummy resource")
             return repr(watch.unmatched)
         return ""
 
     def managedRscList(self, node):
         rscList = []
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 tmp = AuditResource(self.CM, line)
                 if tmp.managed():
                     rscList.append(tmp.id)
 
         return rscList
 
     def verifyResources(self, node, rscList, managed):
         managedList = list(rscList)
         managed_str = "managed"
         if not managed:
             managed_str = "unmanaged"
 
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 tmp = AuditResource(self.CM, line)
                 if managed and not tmp.managed():
                     continue
                 elif not managed and tmp.managed():
                     continue
                 elif managedList.count(tmp.id):
                     managedList.remove(tmp.id)
 
         if len(managedList) == 0:
             self.debug("Found all %s resources on %s" % (managed_str, node))
             return True
 
         self.logger.log("Could not find all %s resources on %s. %s" % (managed_str, node, managedList))
         return False
 
     def __call__(self, node):
         '''Perform the 'MaintenanceMode' test. '''
         self.incr("calls")
         verify_managed = False
         verify_unmanaged = False
         failPat = ""
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         # get a list of all the managed resources. We use this list
         # after enabling maintenance mode to verify all managed resources
         # become un-managed.  After maintenance mode is turned off, we use
         # this list to verify all the resources become managed again.
         managedResources = self.managedRscList(node)
         if len(managedResources) == 0:
             self.logger.log("No managed resources on %s" % node)
             return self.skipped()
 
         # insert a fake resource we can fail during maintenance mode
         # so we can verify recovery does not take place until after maintenance
         # mode is disabled.
         failPat = failPat + self.insertMaintenanceDummy(node)
 
         # toggle maintenance mode ON, then fail dummy resource.
         failPat = failPat + self.toggleMaintenanceMode(node, "On")
 
         # verify all the resources are now unmanaged
         if self.verifyResources(node, managedResources, False):
             verify_unmanaged = True
 
         # Toggle maintenance mode  OFF, verify dummy is recovered.
         failPat = failPat + self.toggleMaintenanceMode(node, "Off")
 
         # verify all the resources are now managed again
         if self.verifyResources(node, managedResources, True):
             verify_managed = True
 
         # Remove our maintenance dummy resource.
         failPat = failPat + self.removeMaintenanceDummy(node)
 
         self.CM.cluster_stable()
 
         if failPat != "":
             return self.failure("Unmatched patterns: %s" % (failPat))
         elif verify_unmanaged is False:
             return self.failure("Failed to verify resources became unmanaged during maintenance mode")
         elif verify_managed is False:
             return self.failure("Failed to verify resources switched back to managed after disabling maintenance mode")
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for %s" % self.rid,
             r"pengine.*: Recover %s\s*\(.*\)" % self.rid,
             r"Unknown operation: fail",
             r"(ERROR|error): sending stonithRA op to stonithd failed.",
             self.templates["Pat:RscOpOK"] % (self.rid, ("%s_%d" % (self.action, self.interval))),
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
         ]
 
 AllTestClasses.append(MaintenanceMode)
 
 
 class ResourceRecover(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "ResourceRecover"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.max = 30
         self.rid = None
         self.rid_alt = None
         #self.is_unsafe = 1
         self.benchmark = 1
 
         # these are the values used for the new LRM API call
         self.action = "asyncmon"
         self.interval = 0
 
     def __call__(self, node):
         '''Perform the 'ResourceRecover' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         resourcelist = self.CM.active_resources(node)
         # if there are no resourcelist, return directly
         if len(resourcelist) == 0:
             self.logger.log("No active resources on %s" % node)
             return self.skipped()
 
         self.rid = self.Env.RandomGen.choice(resourcelist)
         self.rid_alt = self.rid
 
         rsc = None
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 tmp = AuditResource(self.CM, line)
                 if tmp.id == self.rid:
                     rsc = tmp
                     # Handle anonymous clones that get renamed
                     self.rid = rsc.clone_id
                     break
 
         if not rsc:
             return self.failure("Could not find %s in the resource list" % self.rid)
 
         self.debug("Shooting %s aka. %s" % (rsc.clone_id, rsc.id))
 
         pats = []
-        pats.append(r"pengine.*: warning:.* Processing failed op %s for (%s|%s) on" % (self.action,
+        pats.append(r"pengine.*:\s+warning:.*Processing failed op %s for (%s|%s) on" % (self.action,
             rsc.id, rsc.clone_id))
 
         if rsc.managed():
             pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "stop_0"))
             if rsc.unique():
                 pats.append(self.templates["Pat:RscOpOK"] % (self.rid, "start_0"))
             else:
                 # Anonymous clones may get restarted with a different clone number
                 pats.append(self.templates["Pat:RscOpOK"] % (".*", "start_0"))
 
         watch = self.create_watch(pats, 60)
         watch.setwatch()
 
         self.rsh(node, "crm_resource -V -F -r %s -H %s &>/dev/null" % (self.rid, node))
 
         self.set_timer("recover")
         watch.lookforall()
         self.log_timer("recover")
 
         self.CM.cluster_stable()
         recovered = self.CM.ResourceLocation(self.rid)
 
         if watch.unmatched:
             return self.failure("Patterns not found: %s" % repr(watch.unmatched))
 
         elif rsc.unique() and len(recovered) > 1:
             return self.failure("%s is now active on more than one node: %s"%(self.rid, repr(recovered)))
 
         elif len(recovered) > 0:
             self.debug("%s is running on: %s" % (self.rid, repr(recovered)))
 
         elif rsc.managed():
             return self.failure("%s was not recovered and is inactive" % self.rid)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for %s" % self.rid,
             r"pengine.*: Recover (%s|%s)\s*\(.*\)" % (self.rid, self.rid_alt),
             r"Unknown operation: fail",
             r"(ERROR|error): sending stonithRA op to stonithd failed.",
             self.templates["Pat:RscOpOK"] % (self.rid, ("%s_%d" % (self.action, self.interval))),
             r"(ERROR|error).*: Action %s_%s_%d .* initiated outside of a transition" % (self.rid, self.action, self.interval),
         ]
 
 AllTestClasses.append(ResourceRecover)
 
 
 class ComponentFail(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "ComponentFail"
         # TODO make this work correctly in docker.
         self.is_docker_unsafe = 1
         self.startall = SimulStartLite(cm)
         self.complist = cm.Components()
         self.patterns = []
         self.okerrpatterns = []
         self.is_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'ComponentFail' test. '''
         self.incr("calls")
         self.patterns = []
         self.okerrpatterns = []
 
         # start all nodes
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         if not self.CM.cluster_stable(self.Env["StableTime"]):
             return self.failure("Setup failed - unstable")
 
         node_is_dc = self.CM.is_node_dc(node, None)
 
         # select a component to kill
         chosen = self.Env.RandomGen.choice(self.complist)
         while chosen.dc_only == 1 and node_is_dc == 0:
             chosen = self.Env.RandomGen.choice(self.complist)
 
         self.debug("...component %s (dc=%d,boot=%d)" % (chosen.name, node_is_dc,chosen.triggersreboot))
         self.incr(chosen.name)
 
         if chosen.name != "aisexec" and chosen.name != "corosync":
             if self.Env["Name"] != "crm-lha" or chosen.name != "pengine":
                 self.patterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
                 self.patterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
 
         self.patterns.extend(chosen.pats)
         if node_is_dc:
           self.patterns.extend(chosen.dc_pats)
 
         # In an ideal world, this next stuff should be in the "chosen" object as a member function
         if self.Env["Name"] == "crm-lha" and chosen.triggersreboot:
             # Make sure the node goes down and then comes back up if it should reboot...
             for other in self.Env["nodes"]:
                 if other != node:
                     self.patterns.append(self.templates["Pat:They_stopped"] %(other, self.CM.key_for_node(node)))
             self.patterns.append(self.templates["Pat:Slave_started"] % node)
             self.patterns.append(self.templates["Pat:Local_started"] % node)
 
             if chosen.dc_only:
                 # Sometimes these will be in the log, and sometimes they won't...
                 self.okerrpatterns.append("%s .*Process %s:.* exited" % (node, chosen.name))
                 self.okerrpatterns.append("%s .*I_ERROR.*crmdManagedChildDied" % node)
                 self.okerrpatterns.append("%s .*The %s subsystem terminated unexpectedly" % (node, chosen.name))
                 self.okerrpatterns.append("(ERROR|error): Client .* exited with return code")
             else:
                 # Sometimes this won't be in the log...
                 self.okerrpatterns.append(self.templates["Pat:ChildKilled"] %(node, chosen.name))
                 self.okerrpatterns.append(self.templates["Pat:ChildRespawn"] %(node, chosen.name))
                 self.okerrpatterns.append(self.templates["Pat:ChildExit"])
 
         if chosen.name == "stonith":
             # Ignore actions for STONITH resources
             (rc, lines) = self.rsh(node, "crm_resource -c", None)
             for line in lines:
                 if re.search("^Resource", line):
                     r = AuditResource(self.CM, line)
                     if r.rclass == "stonith":
                         self.okerrpatterns.append(self.templates["Pat:Fencing_recover"] % r.id)
 
         # supply a copy so self.patterns doesnt end up empty
         tmpPats = []
         tmpPats.extend(self.patterns)
         self.patterns.extend(chosen.badnews_ignore)
 
         # Look for STONITH ops, depending on Env["at-boot"] we might need to change the nodes status
         stonithPats = []
         stonithPats.append(self.templates["Pat:Fencing_ok"] % node)
         stonith = self.create_watch(stonithPats, 0)
         stonith.setwatch()
 
         # set the watch for stable
         watch = self.create_watch(
             tmpPats, self.Env["DeadTime"] + self.Env["StableTime"] + self.Env["StartTime"])
         watch.setwatch()
 
         # kill the component
         chosen.kill(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         self.debug("Waiting for any STONITHd node to come back up")
         self.CM.ns.WaitForAllNodesToComeUp(self.Env["nodes"], 600)
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         self.CM.cluster_stable(self.Env["StartTime"])
 
         self.debug("Checking if %s was shot" % node)
         shot = stonith.look(60)
         if shot:
             self.debug("Found: " + repr(shot))
             self.okerrpatterns.append(self.templates["Pat:Fencing_start"] % node)
 
             if self.Env["at-boot"] == 0:
                 self.CM.ShouldBeStatus[node] = "down"
 
             # If fencing occurred, chances are many (if not all) the expected logs
             # will not be sent - or will be lost when the node reboots
             return self.success()
 
         # check for logs indicating a graceful recovery
         matched = watch.lookforall(allow_multiple_matches=1)
         if watch.unmatched:
             self.logger.log("Patterns not found: " + repr(watch.unmatched))
 
         self.debug("Waiting for the cluster to re-stabilize with all nodes")
         is_stable = self.CM.cluster_stable(self.Env["StartTime"])
 
         if not matched:
             return self.failure("Didn't find all expected %s patterns" % chosen.name)
         elif not is_stable:
             return self.failure("Cluster did not become stable after killing %s" % chosen.name)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
     # Note that okerrpatterns refers to the last time we ran this test
     # The good news is that this works fine for us...
         self.okerrpatterns.extend(self.patterns)
         return self.okerrpatterns
 
 AllTestClasses.append(ComponentFail)
 
 
 class SplitBrainTest(CTSTest):
     '''It is used to test split-brain. when the path between the two nodes break
        check the two nodes both take over the resource'''
     def __init__(self,cm):
         CTSTest.__init__(self,cm)
         self.name = "SplitBrain"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.is_experimental = 1
 
     def isolate_partition(self, partition):
         other_nodes = []
         other_nodes.extend(self.Env["nodes"])
 
         for node in partition:
             try:
                 other_nodes.remove(node)
             except ValueError:
                 self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]) + " from " +repr(partition))
 
         if len(other_nodes) == 0:
             return 1
 
         self.debug("Creating partition: " + repr(partition))
         self.debug("Everyone else: " + repr(other_nodes))
 
         for node in partition:
             if not self.CM.isolate_node(node, other_nodes):
                 self.logger.log("Could not isolate %s" % node)
                 return 0
 
         return 1
 
     def heal_partition(self, partition):
         other_nodes = []
         other_nodes.extend(self.Env["nodes"])
 
         for node in partition:
             try:
                 other_nodes.remove(node)
             except ValueError:
                 self.logger.log("Node "+node+" not in " + repr(self.Env["nodes"]))
 
         if len(other_nodes) == 0:
             return 1
 
         self.debug("Healing partition: " + repr(partition))
         self.debug("Everyone else: " + repr(other_nodes))
 
         for node in partition:
             self.CM.unisolate_node(node, other_nodes)
 
     def __call__(self, node):
         '''Perform split-brain test'''
         self.incr("calls")
         self.passed = 1
         partitions = {}
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed")
 
         while 1:
             # Retry until we get multiple partitions
             partitions = {}
             p_max = len(self.Env["nodes"])
             for node in self.Env["nodes"]:
                 p = self.Env.RandomGen.randint(1, p_max)
                 if not partitions.has_key(p):
                     partitions[p] = []
                 partitions[p].append(node)
             p_max = len(partitions.keys())
             if p_max > 1:
                 break
             # else, try again
 
         self.debug("Created %d partitions" % p_max)
         for key in partitions.keys():
             self.debug("Partition["+str(key)+"]:\t"+repr(partitions[key]))
 
         # Disabling STONITH to reduce test complexity for now
         self.rsh(node, "crm_attribute -V -n stonith-enabled -v false")
 
         for key in partitions.keys():
             self.isolate_partition(partitions[key])
 
         count = 30
         while count > 0:
             if len(self.CM.find_partitions()) != p_max:
                 time.sleep(10)
             else:
                 break
         else:
             self.failure("Expected partitions were not created")
 
         # Target number of partitions formed - wait for stability
         if not self.CM.cluster_stable():
             self.failure("Partitioned cluster not stable")
 
         # Now audit the cluster state
         self.CM.partitions_expected = p_max
         if not self.audit():
             self.failure("Audits failed")
         self.CM.partitions_expected = 1
 
         # And heal them again
         for key in partitions.keys():
             self.heal_partition(partitions[key])
 
         # Wait for a single partition to form
         count = 30
         while count > 0:
             if len(self.CM.find_partitions()) != 1:
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Cluster did not reform")
 
         # Wait for it to have the right number of members
         count = 30
         while count > 0:
             members = []
 
             partitions = self.CM.find_partitions()
             if len(partitions) > 0:
                 members = partitions[0].split()
 
             if len(members) != len(self.Env["nodes"]):
                 time.sleep(10)
                 count -= 1
             else:
                 break
         else:
             self.failure("Cluster did not completely reform")
 
         # Wait up to 20 minutes - the delay is more preferable than
         # trying to continue with in a messed up state
         if not self.CM.cluster_stable(1200):
             self.failure("Reformed cluster not stable")
             answer = raw_input('Continue? [nY]')
             if answer and answer == "n":
                 raise ValueError("Reformed cluster not stable")
 
         # Turn fencing back on
         if self.Env["DoFencing"]:
             self.rsh(node, "crm_attribute -V -D -n stonith-enabled")
 
         self.CM.cluster_stable()
 
         if self.passed:
             return self.success()
         return self.failure("See previous errors")
 
     def errorstoignore(self):
         '''Return list of errors which are 'normal' and should be ignored'''
         return [
             r"Another DC detected:",
             r"(ERROR|error).*: .*Application of an update diff failed",
             r"crmd.*:.*not in our membership list",
             r"CRIT:.*node.*returning after partition",
         ]
 
     def is_applicable(self):
         if not self.is_applicable_common():
             return 0
         return len(self.Env["nodes"]) > 2
 
 AllTestClasses.append(SplitBrainTest)
 
 
 class Reattach(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "Reattach"
         self.startall = SimulStartLite(cm)
         self.restart1 = RestartTest(cm)
         self.stopall = SimulStopLite(cm)
         self.is_unsafe = 0 # Handled by canrunnow()
 
     def setup(self, node):
         attempt = 0
         if not self.startall(None):
             return None
 
         # Make sure we are really _really_ stable and that all
         # resources, including those that depend on transient node
         # attributes, are started
         while not self.CM.cluster_stable(double_check=True):
             if attempt < 5:
                 attempt += 1
                 self.debug("Not stable yet, re-testing")
             else:
                 self.logger.log("Cluster is not stable")
                 return None
 
         return 1
 
     def teardown(self, node):
 
         # Make sure 'node' is up
         start = StartTest(self.CM)
         start(node)
 
         is_managed = self.rsh(node, "crm_attribute -Q -G -t crm_config -n is-managed-default -d true", 1)
         is_managed = is_managed[:-1] # Strip off the newline
         if is_managed != "true":
             self.logger.log("Attempting to re-enable resource management on %s (%s)" % (node, is_managed))
             managed = self.create_watch(["is-managed-default"], 60)
             managed.setwatch()
 
             self.rsh(node, "crm_attribute -V -D -n is-managed-default")
 
             if not managed.lookforall():
                 self.logger.log("Patterns not found: " + repr(managed.unmatched))
                 self.logger.log("Could not re-enable resource management")
                 return 0
 
         return 1
 
     def canrunnow(self, node):
         '''Return TRUE if we can meaningfully run right now'''
         if self.find_ocfs2_resources(node):
             self.logger.log("Detach/Reattach scenarios are not possible with OCFS2 services present")
             return 0
         return 1
 
     def __call__(self, node):
         self.incr("calls")
 
         pats = []
         managed = self.create_watch(["is-managed-default"], 60)
         managed.setwatch()
 
         self.debug("Disable resource management")
         self.rsh(node, "crm_attribute -V -n is-managed-default -v false")
 
         if not managed.lookforall():
             self.logger.log("Patterns not found: " + repr(managed.unmatched))
             return self.failure("Resource management not disabled")
 
         pats = []
         pats.append(self.templates["Pat:RscOpOK"] % (".*", "start"))
         pats.append(self.templates["Pat:RscOpOK"] % (".*", "stop"))
         pats.append(self.templates["Pat:RscOpOK"] % (".*", "promote"))
         pats.append(self.templates["Pat:RscOpOK"] % (".*", "demote"))
         pats.append(self.templates["Pat:RscOpOK"] % (".*", "migrate"))
 
         watch = self.create_watch(pats, 60, "ShutdownActivity")
         watch.setwatch()
 
         self.debug("Shutting down the cluster")
         ret = self.stopall(None)
         if not ret:
             self.debug("Re-enable resource management")
             self.rsh(node, "crm_attribute -V -D -n is-managed-default")
             return self.failure("Couldn't shut down the cluster")
 
         self.debug("Bringing the cluster back up")
         ret = self.startall(None)
         time.sleep(5) # allow ping to update the CIB
         if not ret:
             self.debug("Re-enable resource management")
             self.rsh(node, "crm_attribute -V -D -n is-managed-default")
             return self.failure("Couldn't restart the cluster")
 
         if self.local_badnews("ResourceActivity:", watch):
             self.debug("Re-enable resource management")
             self.rsh(node, "crm_attribute -V -D -n is-managed-default")
             return self.failure("Resources stopped or started during cluster restart")
 
         watch = self.create_watch(pats, 60, "StartupActivity")
         watch.setwatch()
 
         managed = self.create_watch(["is-managed-default"], 60)
         managed.setwatch()
 
         self.debug("Re-enable resource management")
         self.rsh(node, "crm_attribute -V -D -n is-managed-default")
 
         if not managed.lookforall():
             self.logger.log("Patterns not found: " + repr(managed.unmatched))
             return self.failure("Resource management not enabled")
 
         self.CM.cluster_stable()
 
         # Ignore actions for STONITH resources
         ignore = []
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self.CM, line)
                 if r.rclass == "stonith":
 
                     self.debug("Ignoring start actions for %s" % r.id)
                     ignore.append(self.templates["Pat:RscOpOK"] % (r.id, "start_0"))
 
         if self.local_badnews("ResourceActivity:", watch, ignore):
             return self.failure("Resources stopped or started after resource management was re-enabled")
 
         return ret
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"resources were active at shutdown",
         ]
 
     def is_applicable(self):
         if self.Env["Name"] == "crm-lha":
             return None
         return 1
 
 AllTestClasses.append(Reattach)
 
 
 class SpecialTest1(CTSTest):
     '''Set up a custom test to cause quorum failure issues for Andrew'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SpecialTest1"
         self.startall = SimulStartLite(cm)
         self.restart1 = RestartTest(cm)
         self.stopall = SimulStopLite(cm)
 
     def __call__(self, node):
         '''Perform the 'SpecialTest1' test for Andrew. '''
         self.incr("calls")
 
         #        Shut down all the nodes...
         ret = self.stopall(None)
         if not ret:
             return self.failure("Could not stop all nodes")
 
         # Test config recovery when the other nodes come up
         self.rsh(node, "rm -f "+CTSvars.CRM_CONFIG_DIR+"/cib*")
 
         #        Start the selected node
         ret = self.restart1(node)
         if not ret:
             return self.failure("Could not start "+node)
 
         #        Start all remaining nodes
         ret = self.startall(None)
         if not ret:
             return self.failure("Could not start the remaining nodes")
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         # Errors that occur as a result of the CIB being wiped
         return [
             r"error.*: v1 patchset error, patch failed to apply: Application of an update diff failed",
             r"error.*: Resource start-up disabled since no STONITH resources have been defined",
             r"error.*: Either configure some or disable STONITH with the stonith-enabled option",
             r"error.*: NOTE: Clusters with shared data need STONITH to ensure data integrity",
         ]
 
 AllTestClasses.append(SpecialTest1)
 
 
 class HAETest(CTSTest):
     '''Set up a custom test to cause quorum failure issues for Andrew'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "HAETest"
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
         self.is_loop = 1
 
     def setup(self, node):
         #  Start all remaining nodes
         ret = self.startall(None)
         if not ret:
             return self.failure("Couldn't start all nodes")
         return self.success()
 
     def teardown(self, node):
         # Stop everything
         ret = self.stopall(None)
         if not ret:
             return self.failure("Couldn't stop all nodes")
         return self.success()
 
     def wait_on_state(self, node, resource, expected_clones, attempts=240):
         while attempts > 0:
             active = 0
             (rc, lines) = self.rsh(node, "crm_resource -r %s -W -Q" % resource, stdout=None)
 
             # Hack until crm_resource does the right thing
             if rc == 0 and lines:
                 active = len(lines)
 
             if len(lines) == expected_clones:
                 return 1
 
             elif rc == 1:
                 self.debug("Resource %s is still inactive" % resource)
 
             elif rc == 234:
                 self.logger.log("Unknown resource %s" % resource)
                 return 0
 
             elif rc == 246:
                 self.logger.log("Cluster is inactive")
                 return 0
 
             elif rc != 0:
                 self.logger.log("Call to crm_resource failed, rc=%d" % rc)
                 return 0
 
             else:
                 self.debug("Resource %s is active on %d times instead of %d" % (resource, active, expected_clones))
 
             attempts -= 1
             time.sleep(1)
 
         return 0
 
     def find_dlm(self, node):
         self.r_dlm = None
 
         (rc, lines) = self.rsh(node, "crm_resource -c", None)
         for line in lines:
             if re.search("^Resource", line):
                 r = AuditResource(self.CM, line)
                 if r.rtype == "controld" and r.parent != "NA":
                     self.debug("Found dlm: %s" % self.r_dlm)
                     self.r_dlm = r.parent
                     return 1
         return 0
 
     def find_hae_resources(self, node):
         self.r_dlm = None
         self.r_o2cb = None
         self.r_ocfs2 = []
 
         if self.find_dlm(node):
             self.find_ocfs2_resources(node)
 
     def is_applicable(self):
         if not self.is_applicable_common():
             return 0
         if self.Env["Schema"] == "hae":
             return 1
         return None
 
 
 class HAERoleTest(HAETest):
     def __init__(self, cm):
         '''Lars' mount/unmount test for the HA extension. '''
         HAETest.__init__(self,cm)
         self.name = "HAERoleTest"
 
     def change_state(self, node, resource, target):
         rc = self.rsh(node, "crm_resource -V -r %s -p target-role -v %s  --meta" % (resource, target))
         return rc
 
     def __call__(self, node):
         self.incr("calls")
         lpc = 0
         failed = 0
         delay = 2
         done = time.time() + self.Env["loop-minutes"]*60
         self.find_hae_resources(node)
 
         clone_max = len(self.Env["nodes"])
         while time.time() <= done and not failed:
             lpc = lpc + 1
 
             self.change_state(node, self.r_dlm, "Stopped")
             if not self.wait_on_state(node, self.r_dlm, 0):
                 self.failure("%s did not go down correctly" % self.r_dlm)
                 failed = lpc
 
             self.change_state(node, self.r_dlm, "Started")
             if not self.wait_on_state(node, self.r_dlm, clone_max):
                 self.failure("%s did not come up correctly" % self.r_dlm)
                 failed = lpc
 
             if not self.wait_on_state(node, self.r_o2cb, clone_max):
                 self.failure("%s did not come up correctly" % self.r_o2cb)
                 failed = lpc
 
             for fs in self.r_ocfs2:
                 if not self.wait_on_state(node, fs, clone_max):
                     self.failure("%s did not come up correctly" % fs)
                     failed = lpc
 
         if failed:
             return self.failure("iteration %d failed" % failed)
         return self.success()
 
 AllTestClasses.append(HAERoleTest)
 
 
 class HAEStandbyTest(HAETest):
     '''Set up a custom test to cause quorum failure issues for Andrew'''
     def __init__(self, cm):
         HAETest.__init__(self,cm)
         self.name = "HAEStandbyTest"
 
     def change_state(self, node, resource, target):
         rc = self.rsh(node, "crm_standby -V -l reboot -v %s" % (target))
         return rc
 
     def __call__(self, node):
         self.incr("calls")
 
         lpc = 0
         failed = 0
         done = time.time() + self.Env["loop-minutes"]*60
         self.find_hae_resources(node)
 
         clone_max = len(self.Env["nodes"])
         while time.time() <= done and not failed:
             lpc = lpc + 1
 
             self.change_state(node, self.r_dlm, "true")
             if not self.wait_on_state(node, self.r_dlm, clone_max-1):
                 self.failure("%s did not go down correctly" % self.r_dlm)
                 failed = lpc
 
             self.change_state(node, self.r_dlm, "false")
             if not self.wait_on_state(node, self.r_dlm, clone_max):
                 self.failure("%s did not come up correctly" % self.r_dlm)
                 failed = lpc
 
             if not self.wait_on_state(node, self.r_o2cb, clone_max):
                 self.failure("%s did not come up correctly" % self.r_o2cb)
                 failed = lpc
 
             for fs in self.r_ocfs2:
                 if not self.wait_on_state(node, fs, clone_max):
                     self.failure("%s did not come up correctly" % fs)
                     failed = lpc
 
         if failed:
             return self.failure("iteration %d failed" % failed)
         return self.success()
 
 AllTestClasses.append(HAEStandbyTest)
 
 
 class NearQuorumPointTest(CTSTest):
     '''
     This test brings larger clusters near the quorum point (50%).
     In addition, it will test doing starts and stops at the same time.
 
     Here is how I think it should work:
     - loop over the nodes and decide randomly which will be up and which
       will be down  Use a 50% probability for each of up/down.
     - figure out what to do to get into that state from the current state
     - in parallel, bring up those going up  and bring those going down.
     '''
 
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "NearQuorumPoint"
 
     def __call__(self, dummy):
         '''Perform the 'NearQuorumPoint' test. '''
         self.incr("calls")
         startset = []
         stopset = []
 
         stonith = self.CM.prepare_fencing_watcher("NearQuorumPoint")
         #decide what to do with each node
         for node in self.Env["nodes"]:
             action = self.Env.RandomGen.choice(["start","stop"])
             #action = self.Env.RandomGen.choice(["start","stop","no change"])
             if action == "start" :
                 startset.append(node)
             elif action == "stop" :
                 stopset.append(node)
 
         self.debug("start nodes:" + repr(startset))
         self.debug("stop nodes:" + repr(stopset))
 
         #add search patterns
         watchpats = [ ]
         for node in stopset:
             if self.CM.ShouldBeStatus[node] == "up":
                 watchpats.append(self.templates["Pat:We_stopped"] % node)
 
         for node in startset:
             if self.CM.ShouldBeStatus[node] == "down":
                 #watchpats.append(self.templates["Pat:Slave_started"] % node)
                 watchpats.append(self.templates["Pat:Local_started"] % node)
             else:
                 for stopping in stopset:
                     if self.CM.ShouldBeStatus[stopping] == "up":
                         watchpats.append(self.templates["Pat:They_stopped"] % (node, self.CM.key_for_node(stopping)))
 
         if len(watchpats) == 0:
             return self.skipped()
 
         if len(startset) != 0:
             watchpats.append(self.templates["Pat:DC_IDLE"])
 
         watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
 
         watch.setwatch()
 
         #begin actions
         for node in stopset:
             if self.CM.ShouldBeStatus[node] == "up":
                 self.CM.StopaCMnoBlock(node)
 
         for node in startset:
             if self.CM.ShouldBeStatus[node] == "down":
                 self.CM.StartaCMnoBlock(node)
 
         #get the result
         if watch.lookforall():
             self.CM.cluster_stable()
             self.CM.fencing_cleanup("NearQuorumPoint", stonith)
             return self.success()
 
         self.logger.log("Warn: Patterns not found: " + repr(watch.unmatched))
 
         #get the "bad" nodes
         upnodes = []
         for node in stopset:
             if self.CM.StataCM(node) == 1:
                 upnodes.append(node)
 
         downnodes = []
         for node in startset:
             if self.CM.StataCM(node) == 0:
                 downnodes.append(node)
 
         self.CM.fencing_cleanup("NearQuorumPoint", stonith)
         if upnodes == [] and downnodes == []:
             self.CM.cluster_stable()
 
             # Make sure they're completely down with no residule
             for node in stopset:
                 self.rsh(node, self.templates["StopCmd"])
 
             return self.success()
 
         if len(upnodes) > 0:
             self.logger.log("Warn: Unstoppable nodes: " + repr(upnodes))
 
         if len(downnodes) > 0:
             self.logger.log("Warn: Unstartable nodes: " + repr(downnodes))
 
         return self.failure()
 
     def is_applicable(self):
         if self.Env["Name"] == "crm-cman":
             return None
         return 1
 
 AllTestClasses.append(NearQuorumPointTest)
 
 
 class RollingUpgradeTest(CTSTest):
     '''Perform a rolling upgrade of the cluster'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RollingUpgrade"
         self.start = StartTest(cm)
         self.stop = StopTest(cm)
         self.stopall = SimulStopLite(cm)
         self.startall = SimulStartLite(cm)
 
     def setup(self, node):
         #  Start all remaining nodes
         ret = self.stopall(None)
         if not ret:
             return self.failure("Couldn't stop all nodes")
 
         for node in self.Env["nodes"]:
             if not self.downgrade(node, None):
                 return self.failure("Couldn't downgrade %s" % node)
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Couldn't start all nodes")
         return self.success()
 
     def teardown(self, node):
         # Stop everything
         ret = self.stopall(None)
         if not ret:
             return self.failure("Couldn't stop all nodes")
 
         for node in self.Env["nodes"]:
             if not self.upgrade(node, None):
                 return self.failure("Couldn't upgrade %s" % node)
 
         return self.success()
 
     def install(self, node, version, start=1, flags="--force"):
 
         target_dir = "/tmp/rpm-%s" % version
         src_dir = "%s/%s" % (self.Env["rpm-dir"], version)
 
         self.logger.log("Installing %s on %s with %s" % (version, node, flags))
         if not self.stop(node):
             return self.failure("stop failure: "+node)
 
         rc = self.rsh(node, "mkdir -p %s" % target_dir)
         rc = self.rsh(node, "rm -f %s/*.rpm" % target_dir)
         (rc, lines) = self.rsh(node, "ls -1 %s/*.rpm" % src_dir, None)
         for line in lines:
             line = line[:-1]
             rc = self.rsh.cp("%s" % (line), "%s:%s/" % (node, target_dir))
         rc = self.rsh(node, "rpm -Uvh %s %s/*.rpm" % (flags, target_dir))
 
         if start and not self.start(node):
             return self.failure("start failure: "+node)
 
         return self.success()
 
     def upgrade(self, node, start=1):
         return self.install(node, self.Env["current-version"], start)
 
     def downgrade(self, node, start=1):
         return self.install(node, self.Env["previous-version"], start, "--force --nodeps")
 
     def __call__(self, node):
         '''Perform the 'Rolling Upgrade' test. '''
         self.incr("calls")
 
         for node in self.Env["nodes"]:
             if self.upgrade(node):
                 return self.failure("Couldn't upgrade %s" % node)
 
             self.CM.cluster_stable()
 
         return self.success()
 
     def is_applicable(self):
         if not self.is_applicable_common():
             return None
 
         if not self.Env.has_key("rpm-dir"):
             return None
         if not self.Env.has_key("current-version"):
             return None
         if not self.Env.has_key("previous-version"):
             return None
 
         return 1
 
 #        Register RestartTest as a good test to run
 AllTestClasses.append(RollingUpgradeTest)
 
 
 class BSC_AddResource(CTSTest):
     '''Add a resource to the cluster'''
     def __init__(self, cm):
         CTSTest.__init__(self, cm)
         self.name = "AddResource"
         self.resource_offset = 0
         self.cib_cmd = """cibadmin -C -o %s -X '%s' """
 
     def __call__(self, node):
         self.incr("calls")
         self.resource_offset =         self.resource_offset  + 1
 
         r_id = "bsc-rsc-%s-%d" % (node, self.resource_offset)
         start_pat = "crmd.*%s_start_0.*confirmed.*ok"
 
         patterns = []
         patterns.append(start_pat % r_id)
 
         watch = self.create_watch(patterns, self.Env["DeadTime"])
         watch.setwatch()
 
         ip = self.NextIP()
         if not self.make_ip_resource(node, r_id, "ocf", "IPaddr", ip):
             return self.failure("Make resource %s failed" % r_id)
 
         failed = 0
         watch_result = watch.lookforall()
         if watch.unmatched:
             for regex in watch.unmatched:
                 self.logger.log ("Warn: Pattern not found: %s" % (regex))
                 failed = 1
 
         if failed:
             return self.failure("Resource pattern(s) not found")
 
         if not self.CM.cluster_stable(self.Env["DeadTime"]):
             return self.failure("Unstable cluster")
 
         return self.success()
 
     def NextIP(self):
         ip = self.Env["IPBase"]
         if ":" in ip:
             fields = ip.rpartition(":")
             fields[2] = str(hex(int(fields[2], 16)+1))
             print str(hex(int(f[2], 16)+1))
         else:
             fields = ip.rpartition('.')
             fields[2] = str(int(fields[2])+1)
 
         ip = fields[0] + fields[1] + fields[3];
         self.Env["IPBase"] = ip
         return ip.strip()
 
     def make_ip_resource(self, node, id, rclass, type, ip):
         self.logger.log("Creating %s::%s:%s (%s) on %s" % (rclass,type,id,ip,node))
         rsc_xml="""
 <primitive id="%s" class="%s" type="%s"  provider="heartbeat">
     <instance_attributes id="%s"><attributes>
         <nvpair id="%s" name="ip" value="%s"/>
     </attributes></instance_attributes>
 </primitive>""" % (id, rclass, type, id, id, ip)
 
         node_constraint = """
       <rsc_location id="run_%s" rsc="%s">
         <rule id="pref_run_%s" score="100">
           <expression id="%s_loc_expr" attribute="#uname" operation="eq" value="%s"/>
         </rule>
       </rsc_location>""" % (id, id, id, id, node)
 
         rc = 0
         (rc, lines) = self.rsh(node, self.cib_cmd % ("constraints", node_constraint), None)
         if rc != 0:
             self.logger.log("Constraint creation failed: %d" % rc)
             return None
 
         (rc, lines) = self.rsh(node, self.cib_cmd % ("resources", rsc_xml), None)
         if rc != 0:
             self.logger.log("Resource creation failed: %d" % rc)
             return None
 
         return 1
 
     def is_applicable(self):
         if self.Env["DoBSC"]:
             return 1
         return None
 
 AllTestClasses.append(BSC_AddResource)
 
 
 class SimulStopLite(CTSTest):
     '''Stop any active nodes ~ simultaneously'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SimulStopLite"
 
     def __call__(self, dummy):
         '''Perform the 'SimulStopLite' setup work. '''
         self.incr("calls")
 
         self.debug("Setup: " + self.name)
 
         #     We ignore the "node" parameter...
         watchpats = [ ]
 
         for node in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[node] == "up":
                 self.incr("WasStarted")
                 watchpats.append(self.templates["Pat:We_stopped"] % node)
                 #if self.Env["use_logd"]:
                 #    watchpats.append(self.templates["Pat:Logd_stopped"] % node)
 
         if len(watchpats) == 0:
             self.CM.clear_all_caches()
             return self.success()
 
         #     Stop all the nodes - at about the same time...
         watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
 
         watch.setwatch()
         self.set_timer()
         for node in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[node] == "up":
                 self.CM.StopaCMnoBlock(node)
         if watch.lookforall():
             self.CM.clear_all_caches()
 
             # Make sure they're completely down with no residule
             for node in self.Env["nodes"]:
                 self.rsh(node, self.templates["StopCmd"])
 
             return self.success()
 
         did_fail = 0
         up_nodes = []
         for node in self.Env["nodes"]:
             if self.CM.StataCM(node) == 1:
                 did_fail = 1
                 up_nodes.append(node)
 
         if did_fail:
             return self.failure("Active nodes exist: " + repr(up_nodes))
 
         self.logger.log("Warn: All nodes stopped but CTS didnt detect: "
                     + repr(watch.unmatched))
 
         self.CM.clear_all_caches()
         return self.failure("Missing log message: "+repr(watch.unmatched))
 
     def is_applicable(self):
         '''SimulStopLite is a setup test and never applicable'''
         return 0
 
 
 class SimulStartLite(CTSTest):
     '''Start any stopped nodes ~ simultaneously'''
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "SimulStartLite"
 
     def __call__(self, dummy):
         '''Perform the 'SimulStartList' setup work. '''
         self.incr("calls")
         self.debug("Setup: " + self.name)
 
         #        We ignore the "node" parameter...
         node_list = []
         for node in self.Env["nodes"]:
             if self.CM.ShouldBeStatus[node] == "down":
                 self.incr("WasStopped")
                 node_list.append(node)
 
         self.set_timer()
         while len(node_list) > 0:
             # Repeat until all nodes come up
             watchpats = [ ]
 
             uppat = self.templates["Pat:Slave_started"]
             if self.CM.upcount() == 0:
                 uppat = self.templates["Pat:Local_started"]
 
             watchpats.append(self.templates["Pat:DC_IDLE"])
             for node in node_list:
                 watchpats.append(uppat % node)
                 watchpats.append(self.templates["Pat:InfraUp"] % node)
                 watchpats.append(self.templates["Pat:PacemakerUp"] % node)
 
             #   Start all the nodes - at about the same time...
             watch = self.create_watch(watchpats, self.Env["DeadTime"]+10)
             watch.setwatch()
 
             stonith = self.CM.prepare_fencing_watcher(self.name)
 
             for node in node_list:
                 self.CM.StartaCMnoBlock(node)
 
             watch.lookforall()
 
             node_list = self.CM.fencing_cleanup(self.name, stonith)
 
             # Remove node_list messages from watch.unmatched
             for node in node_list:
                 self.logger.debug("Dealing with stonith operations for %s" % repr(node_list))
                 if watch.unmatched:
                     try:
                         watch.unmatched.remove(uppat % node)
                     except:
                         self.debug("Already matched: %s" % (uppat % node))
                     try:                        
                         watch.unmatched.remove(self.templates["Pat:InfraUp"] % node)
                     except:
                         self.debug("Already matched: %s" % (self.templates["Pat:InfraUp"] % node))
                     try:
                         watch.unmatched.remove(self.templates["Pat:PacemakerUp"] % node)
                     except:
                         self.debug("Already matched: %s" % (self.templates["Pat:PacemakerUp"] % node))
 
             if watch.unmatched:
                 for regex in watch.unmatched:
                     self.logger.log ("Warn: Startup pattern not found: %s" %(regex))
 
             if not self.CM.cluster_stable():
                 return self.failure("Cluster did not stabilize")
 
         did_fail = 0
         unstable = []
         for node in self.Env["nodes"]:
             if self.CM.StataCM(node) == 0:
                 did_fail = 1
                 unstable.append(node)
 
         if did_fail:
             return self.failure("Unstarted nodes exist: " + repr(unstable))
 
         unstable = []
         for node in self.Env["nodes"]:
             if not self.CM.node_stable(node):
                 did_fail = 1
                 unstable.append(node)
 
         if did_fail:
             return self.failure("Unstable cluster nodes exist: " + repr(unstable))
 
         return self.success()
 
     def is_applicable(self):
         '''SimulStartLite is a setup test and never applicable'''
         return 0
 
 
 def TestList(cm, audits):
     result = []
     for testclass in AllTestClasses:
         bound_test = testclass(cm)
         if bound_test.is_applicable():
             bound_test.Audits = audits
             result.append(bound_test)
     return result
 
 
 class RemoteLXC(CTSTest):
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RemoteLXC"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.num_containers = 2
         self.is_container = 1
         self.is_docker_unsafe = 1
         self.failed = 0
         self.fail_string = ""
 
     def start_lxc_simple(self, node):
 
         # restore any artifacts laying around from a previous test.
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -R &>/dev/null")
 
         # generate the containers, put them in the config, add some resources to them
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.setwatch()
         pats.append(self.templates["Pat:RscOpOK"] % ("lxc1", "start_0"))
         pats.append(self.templates["Pat:RscOpOK"] % ("lxc2", "start_0"))
         pats.append(self.templates["Pat:RscOpOK"] % ("lxc-ms", "start_0"))
         pats.append(self.templates["Pat:RscOpOK"] % ("lxc-ms", "promote_0"))
 
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -g -a -m -s -c %d &>/dev/null" % self.num_containers)
         self.set_timer("remoteSimpleInit")
         watch.lookforall()
         self.log_timer("remoteSimpleInit")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
     def cleanup_lxc_simple(self, node):
 
         pats = [ ]
         # if the test failed, attempt to clean up the cib and libvirt environment
         # as best as possible 
         if self.failed == 1:
             # restore libvirt and cib
             self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -R &>/dev/null")
             self.rsh(node, "crm_resource -C -r container1 &>/dev/null")
             self.rsh(node, "crm_resource -C -r container2 &>/dev/null")
             self.rsh(node, "crm_resource -C -r lxc1 &>/dev/null")
             self.rsh(node, "crm_resource -C -r lxc2 &>/dev/null")
             self.rsh(node, "crm_resource -C -r lxc-ms &>/dev/null")
             time.sleep(20)
             return
 
         watch = self.create_watch(pats, 120)
         watch.setwatch()
 
         pats.append(self.templates["Pat:RscOpOK"] % ("container1", "stop_0"))
         pats.append(self.templates["Pat:RscOpOK"] % ("container2", "stop_0"))
 
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -p &>/dev/null")
         self.set_timer("remoteSimpleCleanup")
         watch.lookforall()
         self.log_timer("remoteSimpleCleanup")
 
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
         # cleanup libvirt
         self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -R &>/dev/null")
 
     def __call__(self, node):
         '''Perform the 'RemoteLXC' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed, start all nodes failed.")
 
         rc = self.rsh(node, "/usr/share/pacemaker/tests/cts/lxc_autogen.sh -v &>/dev/null")
         if rc == 1:
             self.log("Environment test for lxc support failed.")
             return self.skipped()
 
         self.start_lxc_simple(node)
         self.cleanup_lxc_simple(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
 
         if self.failed == 1:
             return self.failure(self.fail_string)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [
             r"Updating failcount for ping",
             r"pengine.*: Recover (ping|lxc-ms|container)\s*\(.*\)",
             # The orphaned lxc-ms resource causes an expected transition error
             # that is a result of the pengine not having knowledge that the 
             # ms resource used to be a clone.  As a result it looks like that 
             # resource is running in multiple locations when it shouldn't... But in
             # this instance we know why this error is occurring and that it is expected.
             r"Calculated Transition .* /var/lib/pacemaker/pengine/pe-error",
             r"Resource lxc-ms .* is active on 2 nodes attempting recovery",
             r"Unknown operation: fail",
             r"(ERROR|error): sending stonithRA op to stonithd failed.",
         ]
 
 AllTestClasses.append(RemoteLXC)
 
 
 ###################################################################
 class RemoteDriver(CTSTest):
 ###################################################################
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RemoteDriver"
         self.is_docker_unsafe = 1
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.stop = StopTest(cm)
         self.pcmk_started = 0
         self.failed = 0
         self.fail_string = ""
         self.remote_node_added = 0
         self.remote_rsc_added = 0
         self.remote_rsc = "remote-rsc"
         self.remote_use_reconnect_interval = self.Env.RandomGen.choice(["true","false"])
         self.cib_cmd = """cibadmin -C -o %s -X '%s' """
 
     def get_othernode(self, node):
         for othernode in self.Env["nodes"]:
             if othernode == node:
                 # we don't want to try and use the cib that we just shutdown.
                 # find a cluster node that is not our soon to be remote-node.
                 continue
             else:
                 return othernode
 
     def del_rsc(self, node, rsc):
         othernode = self.get_othernode(node)
         rc = self.rsh(othernode, "crm_resource -D -r %s -t primitive" % (rsc))
         if rc != 0:
             self.fail_string = ("Removal of resource '%s' failed" % (rsc))
             self.failed = 1
 
     def add_rsc(self, node, rsc_xml):
         othernode = self.get_othernode(node)
         rc = self.rsh(othernode, self.cib_cmd % ("resources", rsc_xml))
         if rc != 0:
             self.fail_string = "resource creation failed"
             self.failed = 1
 
     def add_primitive_rsc(self, node):
         rsc_xml = """
 <primitive class="ocf" id="%s" provider="heartbeat" type="Dummy">
     <operations>
       <op id="remote-rsc-monitor-interval-10s" interval="10s" name="monitor"/>
     </operations>
     <meta_attributes id="remote-meta_attributes"/>
 </primitive>""" % (self.remote_rsc)
         self.add_rsc(node, rsc_xml)
         if self.failed == 0:
             self.remote_rsc_added = 1
 
     def add_connection_rsc(self, node):
         if self.remote_use_reconnect_interval == "true":
             # use reconnect interval and make sure to set cluster-recheck-interval as well.
             rsc_xml = """
 <primitive class="ocf" id="%s" provider="pacemaker" type="remote">
     <instance_attributes id="remote-instance_attributes"/>
         <instance_attributes id="remote-instance_attributes">
           <nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
           <nvpair id="remote-instance_attributes-reconnect_interval" name="reconnect_interval" value="60s"/>
         </instance_attributes>
     <operations>
       <op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
       <op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="60"/>
     </operations>
 </primitive>""" % (self.remote_node, node)
             self.rsh(self.get_othernode(node), self.templates["SetCheckInterval"] % ("45s"))
         else:
             # not using reconnect interval
             rsc_xml = """
 <primitive class="ocf" id="%s" provider="pacemaker" type="remote">
     <instance_attributes id="remote-instance_attributes"/>
         <instance_attributes id="remote-instance_attributes">
           <nvpair id="remote-instance_attributes-server" name="server" value="%s"/>
         </instance_attributes>
     <operations>
       <op id="remote-monitor-interval-60s" interval="60s" name="monitor"/>
       <op id="remote-name-start-interval-0-timeout-120" interval="0" name="start" timeout="120"/>
     </operations>
 </primitive>""" % (self.remote_node, node)
 
         self.add_rsc(node, rsc_xml)
         if self.failed == 0:
             self.remote_node_added = 1
 
     def stop_pcmk_remote(self, node):
         # disable pcmk remote
         for i in range(10):
             rc = self.rsh(node, "service pacemaker_remote stop")
             if rc != 0:
                 time.sleep(6)
             else:
                 break
 
     def start_pcmk_remote(self, node):
         for i in range(10):
             rc = self.rsh(node, "service pacemaker_remote start")
             if rc != 0:
                 time.sleep(6)
             else:
                 self.pcmk_started = 1
                 break
 
     def start_metal(self, node):
         pcmk_started = 0
 
         # make sure the resource doesn't already exist for some reason
         self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_rsc))
         self.rsh(node, "crm_resource -D -r %s -t primitive" % (self.remote_node))
 
         if not self.stop(node):
             self.failed = 1
             self.fail_string = "Failed to shutdown cluster node %s" % (node)
             return
 
         self.start_pcmk_remote(node)
 
         if self.pcmk_started == 0:
             self.failed = 1
             self.fail_string = "Failed to start pacemaker_remote on node %s" % (node)
             return
 
         # convert node to baremetal node now that it has shutdow the cluster stack
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.setwatch()
         pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "start"))
         pats.append(self.templates["Pat:DC_IDLE"])
 
         self.add_connection_rsc(node)
 
         self.set_timer("remoteMetalInit")
         watch.lookforall()
         self.log_timer("remoteMetalInit")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
     def migrate_connection(self, node):
         if self.failed == 1:
             return
 
         pats = [ ]
         pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "migrate_to"))
         pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "migrate_from"))
         pats.append(self.templates["Pat:DC_IDLE"])
         watch = self.create_watch(pats, 120)
         watch.setwatch()
 
         (rc, lines) = self.rsh(node, "crm_resource -M -r %s" % (self.remote_node), None)
         if rc != 0:
             self.fail_string = "failed to move remote node connection resource"
             self.logger.log(self.fail_string)
             self.failed = 1
             return
 
         self.set_timer("remoteMetalMigrate")
         watch.lookforall()
         self.log_timer("remoteMetalMigrate")
 
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.logger.log(self.fail_string)
             self.failed = 1
             return
 
     def fail_rsc(self, node):
         if self.failed == 1:
             return
 
         watchpats = [ ]
         watchpats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "stop", self.remote_node))
         watchpats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "start", self.remote_node))
         watchpats.append(self.templates["Pat:DC_IDLE"])
 
         watch = self.create_watch(watchpats, 120)
         watch.setwatch()
 
         self.debug("causing dummy rsc to fail.")
 
         rc = self.rsh(node, "rm -f /var/run/resource-agents/Dummy*")
 
         self.set_timer("remoteRscFail")
         watch.lookforall()
         self.log_timer("remoteRscFail")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns during rsc fail: %s" % (repr(watch.unmatched))
             self.logger.log(self.fail_string)
             self.failed = 1
 
     def fail_connection(self, node):
         if self.failed == 1:
             return
 
         watchpats = [ ]
         watchpats.append(self.templates["Pat:FenceOpOK"] % self.remote_node)
         watchpats.append(self.templates["Pat:NodeFenced"] % self.remote_node)
 
         watch = self.create_watch(watchpats, 120)
         watch.setwatch()
 
         # force stop the pcmk remote daemon. this will result in fencing
         self.debug("Force stopped active remote node")
         self.stop_pcmk_remote(node)
 
         self.debug("Waiting for remote node to be fenced.")
         self.set_timer("remoteMetalFence")
         watch.lookforall()
         self.log_timer("remoteMetalFence")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.logger.log(self.fail_string)
             self.failed = 1
             return
 
         self.debug("Waiting for the remote node to come back up")
         self.CM.ns.WaitForNodeToComeUp(node, 120);
 
         pats = [ ]
         watch = self.create_watch(pats, 200)
         watch.setwatch()
         pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "start"))
         if self.remote_rsc_added == 1:
             pats.append(self.templates["Pat:RscOpOK"] % (self.remote_rsc, "monitor"))
 
         # start the remote node again watch it integrate back into cluster.
         self.start_pcmk_remote(node)
         if self.pcmk_started == 0:
             self.failed = 1
             self.fail_string = "Failed to start pacemaker_remote on node %s" % (node)
             self.logger.log(self.fail_string)
             return
 
         self.debug("Waiting for remote node to rejoin cluster after being fenced.")
         self.set_timer("remoteMetalRestart")
         watch.lookforall()
         self.log_timer("remoteMetalRestart")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
             self.logger.log(self.fail_string)
             return
 
     def add_dummy_rsc(self, node):
         if self.failed == 1:
             return
 
         # verify we can put a resource on the remote node
         pats = [ ]
         watch = self.create_watch(pats, 120)
         watch.setwatch()
         pats.append(self.templates["Pat:RscRemoteOpOK"] % (self.remote_rsc, "start", self.remote_node))
         pats.append(self.templates["Pat:DC_IDLE"])
 
         # Add a resource that must live on remote-node
         self.add_primitive_rsc(node)
 
         # force that rsc to prefer the remote node. 
         (rc, line) = self.CM.rsh(node, "crm_resource -M -r %s -N %s -f" % (self.remote_rsc, self.remote_node), None)
         if rc != 0:
             self.fail_string = "Failed to place remote resource on remote node."
             self.failed = 1
             return
 
         self.set_timer("remoteMetalRsc")
         watch.lookforall()
         self.log_timer("remoteMetalRsc")
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
     def test_attributes(self, node):
         if self.failed == 1:
             return
 
         # This verifies permanent attributes can be set on a remote-node. It also
         # verifies the remote-node can edit it's own cib node section remotely.
         (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -v testval -N %s" % (self.remote_node), None)
         if rc != 0:
             self.fail_string = "Failed to set remote-node attribute. rc:%s output:%s" % (rc, line)
             self.failed = 1
             return
 
         (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -Q -N %s" % (self.remote_node), None)
         if rc != 0:
             self.fail_string = "Failed to get remote-node attribute"
             self.failed = 1
             return
 
         (rc, line) = self.CM.rsh(node, "crm_attribute -l forever -n testattr -D -N %s" % (self.remote_node), None)
         if rc != 0:
             self.fail_string = "Failed to delete remote-node attribute"
             self.failed = 1
             return
 
     def cleanup_metal(self, node):
         if self.pcmk_started == 0:
             return
 
         pats = [ ]
 
         watch = self.create_watch(pats, 120)
         watch.setwatch()
 
         if self.remote_rsc_added == 1:
             pats.append(self.templates["Pat:RscOpOK"] % (self.remote_rsc, "stop"))
         if self.remote_node_added == 1:
             pats.append(self.templates["Pat:RscOpOK"] % (self.remote_node, "stop"))
 
         self.set_timer("remoteMetalCleanup")
 
         if self.remote_use_reconnect_interval == "true":
             self.debug("Cleaning up re-check interval")
             self.rsh(self.get_othernode(node), self.templates["ClearCheckInterval"])
         if self.remote_rsc_added == 1:
             self.debug("Cleaning up dummy rsc put on remote node")
             self.rsh(node, "crm_resource -U -r %s -N %s" % (self.remote_rsc, self.remote_node))
             self.del_rsc(node, self.remote_rsc)
         if self.remote_node_added == 1:
             self.debug("Cleaning up remote node connection resource")
             self.rsh(node, "crm_resource -U -r %s" % (self.remote_node))
             self.del_rsc(node, self.remote_node)
 
         watch.lookforall()
         self.log_timer("remoteMetalCleanup")
 
         if watch.unmatched:
             self.fail_string = "Unmatched patterns: %s" % (repr(watch.unmatched))
             self.failed = 1
 
         self.stop_pcmk_remote(node)
 
     def setup_env(self, node):
 
         self.remote_node = "remote_%s" % (node)
         sync_key = 0
 
         # we are assuming if all nodes have a key, that it is
         # the right key... If any node doesn't have a remote
         # key, we regenerate it everywhere.
         for node in self.Env["nodes"]:
             rc = self.rsh(node, "ls /etc/pacemaker/authkey")
             if rc != 0:
                 sync_key = 1
                 break
 
         if sync_key == 0:
             return
 
         # create key locally
         os.system("/usr/share/pacemaker/tests/cts/lxc_autogen.sh -k &> /dev/null")
 
         # sync key throughout the cluster
         for node in self.Env["nodes"]:
             rc = self.rsh(node, "mkdir /etc/pacemaker")
             self.rsh.cp("/etc/pacemaker/authkey", "%s:/etc/pacemaker/authkey" % (node))
 
     def is_applicable(self):
         if not self.is_applicable_common():
             return False
 
         for node in self.Env["nodes"]:
             rc = self.rsh(node, "type pacemaker_remoted >/dev/null 2>&1")
             if rc != 0:
                 return False
         return True
 
     def __call__(self, node):
         '''Perform the 'RemoteBaremetal' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed, start all nodes failed.")
 
         self.setup_env(node)
         self.start_metal(node)
         self.add_dummy_rsc(node)
         self.test_attributes(node)
         self.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.failed == 1:
             return self.failure(self.fail_string)
 
         return self.success()
 
     def errorstoignore(self):
         '''Return list of errors which should be ignored'''
         return [ """is running on remote.*which isn't allowed""",
                  """Connection terminated""",
                  """Failed to send remote""",
                 ]
 
 # Remote driver is called by other tests.
 
 ###################################################################
 class RemoteBasic(CTSTest):
 ###################################################################
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RemoteBasic"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.driver = RemoteDriver(cm)
         self.is_docker_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'RemoteBaremetal' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed, start all nodes failed.")
 
         self.driver.setup_env(node)
         self.driver.start_metal(node)
         self.driver.add_dummy_rsc(node)
         self.driver.test_attributes(node)
         self.driver.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.driver.failed == 1:
             return self.failure(self.driver.fail_string)
 
         return self.success()
 
     def is_applicable(self):
         return self.driver.is_applicable()
 
     def errorstoignore(self):
         return self.driver.errorstoignore()
 
 AllTestClasses.append(RemoteBasic)
 
 ###################################################################
 class RemoteStonithd(CTSTest):
 ###################################################################
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RemoteStonithd"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.driver = RemoteDriver(cm)
         self.is_docker_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'RemoteStonithd' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed, start all nodes failed.")
 
         self.driver.setup_env(node)
         self.driver.start_metal(node)
         self.driver.add_dummy_rsc(node)
 
         self.driver.fail_connection(node)
         self.driver.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.driver.failed == 1:
             return self.failure(self.driver.fail_string)
 
         return self.success()
 
     def is_applicable(self):
         if not self.driver.is_applicable():
             return False
 
         if self.Env.has_key("DoFencing"):
             return self.Env["DoFencing"]
 
         return True
 
     def errorstoignore(self):
         ignore_pats = [
             r"Unexpected disconnect on remote-node",
             r"crmd.*: error.*: Operation remote_.*_monitor",
             r"pengine.*: Recover remote_.*\s*\(.*\)",
             r"Calculated Transition .* /var/lib/pacemaker/pengine/pe-error",
             r"error.*: Resource .*ocf::.* is active on 2 nodes attempting recovery",
         ]
 
         ignore_pats.extend(self.driver.errorstoignore())
         return ignore_pats
 
 AllTestClasses.append(RemoteStonithd)
 
 ###################################################################
 class RemoteMigrate(CTSTest):
 ###################################################################
     def __init__(self, cm):
         CTSTest.__init__(self,cm)
         self.name = "RemoteMigrate"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.driver = RemoteDriver(cm)
         self.is_docker_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'RemoteMigrate' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed, start all nodes failed.")
 
         self.driver.setup_env(node)
         self.driver.start_metal(node)
         self.driver.add_dummy_rsc(node)
         self.driver.migrate_connection(node)
         self.driver.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.driver.failed == 1:
             return self.failure(self.driver.fail_string)
 
         return self.success()
 
     def is_applicable(self):
         return self.driver.is_applicable()
 
     def errorstoignore(self):
         return self.driver.errorstoignore()
 
 AllTestClasses.append(RemoteMigrate)
 
 
 ###################################################################
 class RemoteRscFailure(CTSTest):
 ###################################################################
     def __init__(self, cm):
 
         # fail a rsc on a remote node, verify recovery.
         CTSTest.__init__(self,cm)
         self.name = "RemoteRscFailure"
         self.start = StartTest(cm)
         self.startall = SimulStartLite(cm)
         self.driver = RemoteDriver(cm)
         self.is_docker_unsafe = 1
 
     def __call__(self, node):
         '''Perform the 'RemoteRscFailure' test. '''
         self.incr("calls")
 
         ret = self.startall(None)
         if not ret:
             return self.failure("Setup failed, start all nodes failed.")
 
         self.driver.setup_env(node)
         self.driver.start_metal(node)
         self.driver.add_dummy_rsc(node)
 
         # This is an important step. We are migrating the connection
         # before failing the resource. This verifies that the migration
         # has properly maintained control over the remote-node.
         self.driver.migrate_connection(node)
 
         self.driver.fail_rsc(node)
         self.driver.cleanup_metal(node)
 
         self.debug("Waiting for the cluster to recover")
         self.CM.cluster_stable()
         if self.driver.failed == 1:
             return self.failure(self.driver.fail_string)
 
         return self.success()
 
     def is_applicable(self):
         return self.driver.is_applicable()
 
     def errorstoignore(self):
         ignore_pats = [
             r"pengine.*: Recover remote-rsc\s*\(.*\)",
         ]
 
         ignore_pats.extend(self.driver.errorstoignore())
         return ignore_pats
 
 AllTestClasses.append(RemoteRscFailure)
 
 # vim:ts=4:sw=4:et:
diff --git a/fencing/commands.c b/fencing/commands.c
index 0d2d614137..068300b4b5 100644
--- a/fencing/commands.c
+++ b/fencing/commands.c
@@ -1,2480 +1,2479 @@
 /*
  * Copyright (C) 2009 Andrew Beekhof <andrew@beekhof.net>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  *
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #include <crm_internal.h>
 
 #include <sys/param.h>
 #include <stdio.h>
 #include <sys/types.h>
 #include <sys/wait.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include <sys/utsname.h>
 
 #include <stdlib.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <ctype.h>
 
 #include <crm/crm.h>
 #include <crm/msg_xml.h>
 #include <crm/common/ipc.h>
 #include <crm/common/ipcs.h>
 #include <crm/cluster/internal.h>
 #include <crm/common/mainloop.h>
 
 #include <crm/stonith-ng.h>
 #include <crm/fencing/internal.h>
 #include <crm/common/xml.h>
 
 #if SUPPORT_CIBSECRETS
 #  include <crm/common/cib_secrets.h>
 #endif
 
 #include <internal.h>
 
 GHashTable *device_list = NULL;
 GHashTable *topology = NULL;
 GList *cmd_list = NULL;
 
 struct device_search_s {
     /* target of fence action */
     char *host;
     /* requested fence action */
     char *action;
     /* timeout to use if a device is queried dynamically for possible targets */
     int per_device_timeout;
     /* number of registered fencing devices at time of request */
     int replies_needed;
     /* number of device replies received so far */
     int replies_received;
     /* whether the target is eligible to perform requested action (or off) */
     bool allow_suicide;
 
     /* private data to pass to search callback function */
     void *user_data;
     /* function to call when all replies have been received */
     void (*callback) (GList * devices, void *user_data);
     /* devices capable of performing requested action (or off if remapping) */
     GListPtr capable;
 };
 
 static gboolean stonith_device_dispatch(gpointer user_data);
 static void st_child_done(GPid pid, int rc, const char *output, gpointer user_data);
 static void stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer,
                                const char *client_id);
 
 static void search_devices_record_result(struct device_search_s *search, const char *device,
                                          gboolean can_fence);
 
 typedef struct async_command_s {
 
     int id;
     int pid;
     int fd_stdout;
     int options;
     int default_timeout; /* seconds */
     int timeout; /* seconds */
 
     int start_delay; /* milliseconds */
     int delay_id;
 
     char *op;
     char *origin;
     char *client;
     char *client_name;
     char *remote_op_id;
 
     char *victim;
     uint32_t victim_nodeid;
     char *action;
     char *device;
     char *mode;
 
     GListPtr device_list;
     GListPtr device_next;
 
     void *internal_user_data;
     void (*done_cb) (GPid pid, int rc, const char *output, gpointer user_data);
     guint timer_sigterm;
     guint timer_sigkill;
     /*! If the operation timed out, this is the last signal
      *  we sent to the process to get it to terminate */
     int last_timeout_signo;
 } async_command_t;
 
 static xmlNode *stonith_construct_async_reply(async_command_t * cmd, const char *output,
                                               xmlNode * data, int rc);
 
 static gboolean
 is_action_required(const char *action, stonith_device_t *device)
 {
     if(device == NULL) {
         return FALSE;
 
     } else if (device->required_actions == NULL) {
         return FALSE;
 
     } else if (strstr(device->required_actions, action)) {
         return TRUE;
     }
 
     return FALSE;
 }
 
 static int
 get_action_delay_max(stonith_device_t * device, const char * action)
 {
     const char *value = NULL;
     int delay_max_ms = 0;
 
     if (safe_str_neq(action, "off") && safe_str_neq(action, "reboot")) {
         return 0;
     }
 
     value = g_hash_table_lookup(device->params, STONITH_ATTR_DELAY_MAX);
     if (value) {
        delay_max_ms = crm_get_msec(value);
     }
 
     return delay_max_ms;
 }
 
 /*!
  * \internal
  * \brief Override STONITH timeout with pcmk_*_timeout if available
  *
  * \param[in] device           STONITH device to use
  * \param[in] action           STONITH action name
  * \param[in] default_timeout  Timeout to use if device does not have
  *                             a pcmk_*_timeout parameter for action
  *
  * \return Value of pcmk_(action)_timeout if available, otherwise default_timeout
  * \note For consistency, it would be nice if reboot/off/on timeouts could be
  *       set the same way as start/stop/monitor timeouts, i.e. with an
  *       <operation> entry in the fencing resource configuration. However that
  *       is insufficient because fencing devices may be registered directly via
  *       the STONITH register_device() API instead of going through the CIB
  *       (e.g. stonith_admin uses it for its -R option, and the LRMD uses it to
  *       ensure a device is registered when a command is issued). As device
  *       properties, pcmk_*_timeout parameters can be grabbed by stonithd when
  *       the device is registered, whether by CIB change or API call.
  */
 static int
 get_action_timeout(stonith_device_t * device, const char *action, int default_timeout)
 {
     if (action && device && device->params) {
         char buffer[64] = { 0, };
         const char *value = NULL;
 
         /* If "reboot" was requested but the device does not support it,
          * we will remap to "off", so check timeout for "off" instead
          */
         if (safe_str_eq(action, "reboot")
             && is_not_set(device->flags, st_device_supports_reboot)) {
             crm_trace("%s doesn't support reboot, using timeout for off instead",
                       device->id);
             action = "off";
         }
 
         /* If the device config specified an action-specific timeout, use it */
         snprintf(buffer, sizeof(buffer) - 1, "pcmk_%s_timeout", action);
         value = g_hash_table_lookup(device->params, buffer);
         if (value) {
             return atoi(value);
         }
     }
     return default_timeout;
 }
 
 static void
 free_async_command(async_command_t * cmd)
 {
     if (!cmd) {
         return;
     }
 
     if (cmd->delay_id) {
         g_source_remove(cmd->delay_id);
     }
 
     cmd_list = g_list_remove(cmd_list, cmd);
 
     g_list_free_full(cmd->device_list, free);
     free(cmd->device);
     free(cmd->action);
     free(cmd->victim);
     free(cmd->remote_op_id);
     free(cmd->client);
     free(cmd->client_name);
     free(cmd->origin);
     free(cmd->mode);
     free(cmd->op);
     free(cmd);
 }
 
 static async_command_t *
 create_async_command(xmlNode * msg)
 {
     async_command_t *cmd = NULL;
     xmlNode *op = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_ERR);
     const char *action = crm_element_value(op, F_STONITH_ACTION);
 
     CRM_CHECK(action != NULL, crm_log_xml_warn(msg, "NoAction"); return NULL);
 
     crm_log_xml_trace(msg, "Command");
     cmd = calloc(1, sizeof(async_command_t));
     crm_element_value_int(msg, F_STONITH_CALLID, &(cmd->id));
     crm_element_value_int(msg, F_STONITH_CALLOPTS, &(cmd->options));
     crm_element_value_int(msg, F_STONITH_TIMEOUT, &(cmd->default_timeout));
     cmd->timeout = cmd->default_timeout;
 
     cmd->origin = crm_element_value_copy(msg, F_ORIG);
     cmd->remote_op_id = crm_element_value_copy(msg, F_STONITH_REMOTE_OP_ID);
     cmd->client = crm_element_value_copy(msg, F_STONITH_CLIENTID);
     cmd->client_name = crm_element_value_copy(msg, F_STONITH_CLIENTNAME);
     cmd->op = crm_element_value_copy(msg, F_STONITH_OPERATION);
     cmd->action = strdup(action);
     cmd->victim = crm_element_value_copy(op, F_STONITH_TARGET);
     cmd->mode = crm_element_value_copy(op, F_STONITH_MODE);
     cmd->device = crm_element_value_copy(op, F_STONITH_DEVICE);
 
     CRM_CHECK(cmd->op != NULL, crm_log_xml_warn(msg, "NoOp"); free_async_command(cmd); return NULL);
     CRM_CHECK(cmd->client != NULL, crm_log_xml_warn(msg, "NoClient"));
 
     cmd->done_cb = st_child_done;
     cmd_list = g_list_append(cmd_list, cmd);
     return cmd;
 }
 
 static gboolean
 stonith_device_execute(stonith_device_t * device)
 {
     int exec_rc = 0;
     const char *action_str = NULL;
     async_command_t *cmd = NULL;
     stonith_action_t *action = NULL;
 
     CRM_CHECK(device != NULL, return FALSE);
 
     if (device->active_pid) {
         crm_trace("%s is still active with pid %u", device->id, device->active_pid);
         return TRUE;
     }
 
     if (device->pending_ops) {
         GList *first = device->pending_ops;
 
         cmd = first->data;
         if (cmd && cmd->delay_id) {
             crm_trace
                 ("Operation %s%s%s on %s was asked to run too early, waiting for start_delay timeout of %dms",
                  cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "",
                  device->id, cmd->start_delay);
             return TRUE;
         }
 
         device->pending_ops = g_list_remove_link(device->pending_ops, first);
         g_list_free_1(first);
     }
 
     if (cmd == NULL) {
         crm_trace("Nothing further to do for %s", device->id);
         return TRUE;
     }
 
     if(safe_str_eq(device->agent, STONITH_WATCHDOG_AGENT)) {
         if(safe_str_eq(cmd->action, "reboot")) {
             pcmk_panic(__FUNCTION__);
             return TRUE;
 
         } else if(safe_str_eq(cmd->action, "off")) {
             pcmk_panic(__FUNCTION__);
             return TRUE;
 
         } else {
             crm_info("Faking success for %s watchdog operation", cmd->action);
             cmd->done_cb(0, 0, NULL, cmd);
             return TRUE;
         }
     }
 
 #if SUPPORT_CIBSECRETS
     if (replace_secret_params(device->id, device->params) < 0) {
         /* replacing secrets failed! */
         if (safe_str_eq(cmd->action,"stop")) {
             /* don't fail on stop! */
             crm_info("proceeding with the stop operation for %s", device->id);
 
         } else {
             crm_err("failed to get secrets for %s, "
                     "considering resource not configured", device->id);
             exec_rc = PCMK_OCF_NOT_CONFIGURED;
             cmd->done_cb(0, exec_rc, NULL, cmd);
             return TRUE;
         }
     }
 #endif
 
     action_str = cmd->action;
     if (safe_str_eq(cmd->action, "reboot") && is_not_set(device->flags, st_device_supports_reboot)) {
         crm_warn("Agent '%s' does not advertise support for 'reboot', performing 'off' action instead", device->agent);
         action_str = "off";
     }
 
     action = stonith_action_create(device->agent,
                                    action_str,
                                    cmd->victim,
                                    cmd->victim_nodeid,
                                    cmd->timeout, device->params, device->aliases);
 
     /* for async exec, exec_rc is pid if positive and error code if negative/zero */
     exec_rc = stonith_action_execute_async(action, (void *)cmd, cmd->done_cb);
 
     if (exec_rc > 0) {
         crm_debug("Operation %s%s%s on %s now running with pid=%d, timeout=%ds",
                   cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "",
                   device->id, exec_rc, cmd->timeout);
         device->active_pid = exec_rc;
 
     } else {
         crm_warn("Operation %s%s%s on %s failed: %s (%d)",
                  cmd->action, cmd->victim ? " for node " : "", cmd->victim ? cmd->victim : "",
                  device->id, pcmk_strerror(exec_rc), exec_rc);
         cmd->done_cb(0, exec_rc, NULL, cmd);
     }
     return TRUE;
 }
 
 static gboolean
 stonith_device_dispatch(gpointer user_data)
 {
     return stonith_device_execute(user_data);
 }
 
 static gboolean
 start_delay_helper(gpointer data)
 {
     async_command_t *cmd = data;
     stonith_device_t *device = NULL;
 
     cmd->delay_id = 0;
     device = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL;
 
     if (device) {
         mainloop_set_trigger(device->work);
     }
 
     return FALSE;
 }
 
 static void
 schedule_stonith_command(async_command_t * cmd, stonith_device_t * device)
 {
     int delay_max = 0;
 
     CRM_CHECK(cmd != NULL, return);
     CRM_CHECK(device != NULL, return);
 
     if (cmd->device) {
         free(cmd->device);
     }
 
     if (device->include_nodeid && cmd->victim) {
         crm_node_t *node = crm_get_peer(0, cmd->victim);
 
         cmd->victim_nodeid = node->id;
     }
 
     cmd->device = strdup(device->id);
     cmd->timeout = get_action_timeout(device, cmd->action, cmd->default_timeout);
 
     if (cmd->remote_op_id) {
         crm_debug("Scheduling %s on %s for remote peer %s with op id (%s) (timeout=%ds)",
                   cmd->action, device->id, cmd->origin, cmd->remote_op_id, cmd->timeout);
     } else {
         crm_debug("Scheduling %s on %s for %s (timeout=%ds)",
                   cmd->action, device->id, cmd->client, cmd->timeout);
     }
 
     device->pending_ops = g_list_append(device->pending_ops, cmd);
     mainloop_set_trigger(device->work);
 
     delay_max = get_action_delay_max(device, cmd->action);
     if (delay_max > 0) {
         cmd->start_delay = rand() % delay_max;
         crm_notice("Delaying %s on %s for %lldms (timeout=%ds)",
                     cmd->action, device->id, cmd->start_delay, cmd->timeout);
         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
     }
 }
 
 void
 free_device(gpointer data)
 {
     GListPtr gIter = NULL;
     stonith_device_t *device = data;
 
     g_hash_table_destroy(device->params);
     g_hash_table_destroy(device->aliases);
 
     for (gIter = device->pending_ops; gIter != NULL; gIter = gIter->next) {
         async_command_t *cmd = gIter->data;
 
         crm_warn("Removal of device '%s' purged operation %s", device->id, cmd->action);
         cmd->done_cb(0, -ENODEV, NULL, cmd);
         free_async_command(cmd);
     }
     g_list_free(device->pending_ops);
 
     g_list_free_full(device->targets, free);
 
     mainloop_destroy_trigger(device->work);
 
     free_xml(device->agent_metadata);
     free(device->namespace);
     free(device->on_target_actions);
     free(device->required_actions);
     free(device->agent);
     free(device->id);
     free(device);
 }
 
 static GHashTable *
 build_port_aliases(const char *hostmap, GListPtr * targets)
 {
     char *name = NULL;
     int last = 0, lpc = 0, max = 0, added = 0;
     GHashTable *aliases =
         g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, g_hash_destroy_str, g_hash_destroy_str);
 
     if (hostmap == NULL) {
         return aliases;
     }
 
     max = strlen(hostmap);
     for (; lpc <= max; lpc++) {
         switch (hostmap[lpc]) {
                 /* Assignment chars */
             case '=':
             case ':':
                 if (lpc > last) {
                     free(name);
                     name = calloc(1, 1 + lpc - last);
                     memcpy(name, hostmap + last, lpc - last);
                 }
                 last = lpc + 1;
                 break;
 
                 /* Delimeter chars */
                 /* case ',': Potentially used to specify multiple ports */
             case 0:
             case ';':
             case ' ':
             case '\t':
                 if (name) {
                     char *value = NULL;
 
                     value = calloc(1, 1 + lpc - last);
                     memcpy(value, hostmap + last, lpc - last);
 
                     crm_debug("Adding alias '%s'='%s'", name, value);
                     g_hash_table_replace(aliases, name, value);
                     if (targets) {
                         *targets = g_list_append(*targets, strdup(value));
                     }
                     value = NULL;
                     name = NULL;
                     added++;
 
                 } else if (lpc > last) {
                     crm_debug("Parse error at offset %d near '%s'", lpc - last, hostmap + last);
                 }
 
                 last = lpc + 1;
                 break;
         }
 
         if (hostmap[lpc] == 0) {
             break;
         }
     }
 
     if (added == 0) {
         crm_info("No host mappings detected in '%s'", hostmap);
     }
 
     free(name);
     return aliases;
 }
 
 static void
 parse_host_line(const char *line, int max, GListPtr * output)
 {
     int lpc = 0;
     int last = 0;
 
     if (max <= 0) {
         return;
     }
 
     /* Check for any complaints about additional parameters that the device doesn't understand */
     if (strstr(line, "invalid") || strstr(line, "variable")) {
         crm_debug("Skipping: %s", line);
         return;
     }
 
     crm_trace("Processing %d bytes: [%s]", max, line);
     /* Skip initial whitespace */
     for (lpc = 0; lpc <= max && isspace(line[lpc]); lpc++) {
         last = lpc + 1;
     }
 
     /* Now the actual content */
     for (lpc = 0; lpc <= max; lpc++) {
         gboolean a_space = isspace(line[lpc]);
 
         if (a_space && lpc < max && isspace(line[lpc + 1])) {
             /* fast-forward to the end of the spaces */
 
         } else if (a_space || line[lpc] == ',' || line[lpc] == ';' || line[lpc] == 0) {
             int rc = 1;
             char *entry = NULL;
 
             if (lpc != last) {
                 entry = calloc(1, 1 + lpc - last);
                 rc = sscanf(line + last, "%[a-zA-Z0-9_-.]", entry);
             }
 
             if (entry == NULL) {
                 /* Skip */
             } else if (rc != 1) {
                 crm_warn("Could not parse (%d %d): %s", last, lpc, line + last);
             } else if (safe_str_neq(entry, "on") && safe_str_neq(entry, "off")) {
                 crm_trace("Adding '%s'", entry);
                 *output = g_list_append(*output, entry);
                 entry = NULL;
             }
 
             free(entry);
             last = lpc + 1;
         }
     }
 }
 
 static GListPtr
 parse_host_list(const char *hosts)
 {
     int lpc = 0;
     int max = 0;
     int last = 0;
     GListPtr output = NULL;
 
     if (hosts == NULL) {
         return output;
     }
 
     max = strlen(hosts);
     for (lpc = 0; lpc <= max; lpc++) {
         if (hosts[lpc] == '\n' || hosts[lpc] == 0) {
             char *line = NULL;
             int len = lpc - last;
 
             if(len > 1) {
                 line = malloc(1 + len);
             }
 
             if(line) {
                 snprintf(line, 1 + len, "%s", hosts + last);
                 line[len] = 0; /* Because it might be '\n' */
                 parse_host_line(line, len, &output);
                 free(line);
             }
 
             last = lpc + 1;
         }
     }
 
     crm_trace("Parsed %d entries from '%s'", g_list_length(output), hosts);
     return output;
 }
 
 GHashTable *metadata_cache = NULL;
 
 static xmlNode *
 get_agent_metadata(const char *agent)
 {
     xmlNode *xml = NULL;
     char *buffer = NULL;
 
     if(metadata_cache == NULL) {
         metadata_cache = g_hash_table_new_full(
             crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str);
     }
 
     buffer = g_hash_table_lookup(metadata_cache, agent);
     if(safe_str_eq(agent, STONITH_WATCHDOG_AGENT)) {
         return NULL;
 
     } else if(buffer == NULL) {
         stonith_t *st = stonith_api_new();
         int rc = st->cmds->metadata(st, st_opt_sync_call, agent, NULL, &buffer, 10);
 
         stonith_api_delete(st);
         if (rc || !buffer) {
             crm_err("Could not retrieve metadata for fencing agent %s", agent);
             return NULL;
         }
         g_hash_table_replace(metadata_cache, strdup(agent), buffer);
     }
 
     xml = string2xml(buffer);
 
     return xml;
 }
 
 static gboolean
 is_nodeid_required(xmlNode * xml)
 {
     xmlXPathObjectPtr xpath = NULL;
 
     if (stand_alone) {
         return FALSE;
     }
 
     if (!xml) {
         return FALSE;
     }
 
     xpath = xpath_search(xml, "//parameter[@name='nodeid']");
     if (numXpathResults(xpath)  <= 0) {
         freeXpathObject(xpath);
         return FALSE;
     }
 
     freeXpathObject(xpath);
     return TRUE;
 }
 
 static char *
 add_action(char *actions, const char *action)
 {
     static size_t len = 256;
     int offset = 0;
 
     if (actions == NULL) {
         actions = calloc(1, len);
     } else {
         offset = strlen(actions);
     }
 
     if (offset > 0) {
         offset += snprintf(actions+offset, len-offset, " ");
     }
     offset += snprintf(actions+offset, len-offset, "%s", action);
 
     return actions;
 }
 
 static void
 read_action_metadata(stonith_device_t *device)
 {
     xmlXPathObjectPtr xpath = NULL;
     int max = 0;
     int lpc = 0;
 
     if (device->agent_metadata == NULL) {
         return;
     }
 
     xpath = xpath_search(device->agent_metadata, "//action");
     max = numXpathResults(xpath);
 
     if (max <= 0) {
         freeXpathObject(xpath);
         return;
     }
 
     for (lpc = 0; lpc < max; lpc++) {
         const char *on_target = NULL;
         const char *action = NULL;
         const char *automatic = NULL;
         const char *required = NULL;
         xmlNode *match = getXpathResult(xpath, lpc);
 
         CRM_LOG_ASSERT(match != NULL);
         if(match == NULL) { continue; };
 
         on_target = crm_element_value(match, "on_target");
         action = crm_element_value(match, "name");
         automatic = crm_element_value(match, "automatic");
         required = crm_element_value(match, "required");
 
         if(safe_str_eq(action, "list")) {
             set_bit(device->flags, st_device_supports_list);
         } else if(safe_str_eq(action, "status")) {
             set_bit(device->flags, st_device_supports_status);
         } else if(safe_str_eq(action, "reboot")) {
             set_bit(device->flags, st_device_supports_reboot);
         } else if(safe_str_eq(action, "on") && (crm_is_true(automatic))) {
             /* this setting implies required=true for unfencing */
             required = "true";
         }
 
         if (action && crm_is_true(on_target)) {
             device->on_target_actions = add_action(device->on_target_actions, action);
         }
         if (action && crm_is_true(required)) {
             device->required_actions = add_action(device->required_actions, action);
         }
     }
 
     freeXpathObject(xpath);
 }
 
 static stonith_device_t *
 build_device_from_xml(xmlNode * msg)
 {
     const char *value = NULL;
     xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR);
     stonith_device_t *device = NULL;
 
     device = calloc(1, sizeof(stonith_device_t));
     device->id = crm_element_value_copy(dev, XML_ATTR_ID);
     device->agent = crm_element_value_copy(dev, "agent");
     device->namespace = crm_element_value_copy(dev, "namespace");
     device->params = xml2list(dev);
 
     value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTLIST);
     if (value) {
         device->targets = parse_host_list(value);
     }
 
     value = g_hash_table_lookup(device->params, STONITH_ATTR_HOSTMAP);
     device->aliases = build_port_aliases(value, &(device->targets));
 
     device->agent_metadata = get_agent_metadata(device->agent);
     read_action_metadata(device);
 
     value = g_hash_table_lookup(device->params, "nodeid");
     if (!value) {
         device->include_nodeid = is_nodeid_required(device->agent_metadata);
     }
 
     value = crm_element_value(dev, "rsc_provides");
     if (safe_str_eq(value, "unfencing")) {
         /* if this agent requires unfencing, 'on' is considered a required action */
         device->required_actions = add_action(device->required_actions, "on");
     }
 
     if (is_action_required("on", device)) {
         crm_info("The fencing device '%s' requires unfencing", device->id);
     }
 
     if (device->on_target_actions) {
         crm_info("The fencing device '%s' requires actions (%s) to be executed on the target node",
                  device->id, device->on_target_actions);
     }
 
     device->work = mainloop_add_trigger(G_PRIORITY_HIGH, stonith_device_dispatch, device);
     /* TODO: Hook up priority */
 
     return device;
 }
 
 static const char *
 target_list_type(stonith_device_t * dev)
 {
     const char *check_type = NULL;
 
     check_type = g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTCHECK);
 
     if (check_type == NULL) {
 
         if (g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTLIST)) {
             check_type = "static-list";
         } else if (g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTMAP)) {
             check_type = "static-list";
         } else if(is_set(dev->flags, st_device_supports_list)){
             check_type = "dynamic-list";
         } else if(is_set(dev->flags, st_device_supports_status)){
             check_type = "status";
         } else {
             check_type = "none";
         }
     }
 
     return check_type;
 }
 
 void
 schedule_internal_command(const char *origin,
                           stonith_device_t * device,
                           const char *action,
                           const char *victim,
                           int timeout,
                           void *internal_user_data,
                           void (*done_cb) (GPid pid, int rc, const char *output,
                                            gpointer user_data))
 {
     async_command_t *cmd = NULL;
 
     cmd = calloc(1, sizeof(async_command_t));
 
     cmd->id = -1;
     cmd->default_timeout = timeout ? timeout : 60;
     cmd->timeout = cmd->default_timeout;
     cmd->action = strdup(action);
     cmd->victim = victim ? strdup(victim) : NULL;
     cmd->device = strdup(device->id);
     cmd->origin = strdup(origin);
     cmd->client = strdup(crm_system_name);
     cmd->client_name = strdup(crm_system_name);
 
     cmd->internal_user_data = internal_user_data;
     cmd->done_cb = done_cb; /* cmd, not internal_user_data, is passed to 'done_cb' as the userdata */
 
     schedule_stonith_command(cmd, device);
 }
 
 gboolean
 string_in_list(GListPtr list, const char *item)
 {
     int lpc = 0;
     int max = g_list_length(list);
 
     for (lpc = 0; lpc < max; lpc++) {
         const char *value = g_list_nth_data(list, lpc);
 
         if (safe_str_eq(item, value)) {
             return TRUE;
         } else {
             crm_trace("%d: '%s' != '%s'", lpc, item, value);
         }
     }
     return FALSE;
 }
 
 static void
 status_search_cb(GPid pid, int rc, const char *output, gpointer user_data)
 {
     async_command_t *cmd = user_data;
     struct device_search_s *search = cmd->internal_user_data;
     stonith_device_t *dev = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL;
     gboolean can = FALSE;
 
     free_async_command(cmd);
 
     if (!dev) {
         search_devices_record_result(search, NULL, FALSE);
         return;
     }
 
     dev->active_pid = 0;
     mainloop_set_trigger(dev->work);
 
     if (rc == 1 /* unknown */ ) {
         crm_trace("Host %s is not known by %s", search->host, dev->id);
 
     } else if (rc == 0 /* active */  || rc == 2 /* inactive */ ) {
         crm_trace("Host %s is known by %s", search->host, dev->id);
         can = TRUE;
 
     } else {
         crm_notice("Unknown result when testing if %s can fence %s: rc=%d", dev->id, search->host,
                    rc);
     }
     search_devices_record_result(search, dev->id, can);
 }
 
 static void
 dynamic_list_search_cb(GPid pid, int rc, const char *output, gpointer user_data)
 {
     async_command_t *cmd = user_data;
     struct device_search_s *search = cmd->internal_user_data;
     stonith_device_t *dev = cmd->device ? g_hash_table_lookup(device_list, cmd->device) : NULL;
     gboolean can_fence = FALSE;
 
     free_async_command(cmd);
 
     /* Host/alias must be in the list output to be eligible to be fenced
      *
      * Will cause problems if down'd nodes aren't listed or (for virtual nodes)
      *  if the guest is still listed despite being moved to another machine
      */
     if (!dev) {
         search_devices_record_result(search, NULL, FALSE);
         return;
     }
 
     dev->active_pid = 0;
     mainloop_set_trigger(dev->work);
 
     /* If we successfully got the targets earlier, don't disable. */
     if (rc != 0 && !dev->targets) {
         crm_notice("Disabling port list queries for %s (%d): %s", dev->id, rc, output);
         /* Fall back to status */
         g_hash_table_replace(dev->params, strdup(STONITH_ATTR_HOSTCHECK), strdup("status"));
 
         g_list_free_full(dev->targets, free);
         dev->targets = NULL;
     } else if (!rc) {
         crm_info("Refreshing port list for %s", dev->id);
         g_list_free_full(dev->targets, free);
         dev->targets = parse_host_list(output);
         dev->targets_age = time(NULL);
     }
 
     if (dev->targets) {
         const char *alias = g_hash_table_lookup(dev->aliases, search->host);
 
         if (!alias) {
             alias = search->host;
         }
         if (string_in_list(dev->targets, alias)) {
             can_fence = TRUE;
         }
     }
     search_devices_record_result(search, dev->id, can_fence);
 }
 
 /*!
  * \internal
  * \brief Checks to see if an identical device already exists in the device_list
  */
 static stonith_device_t *
 device_has_duplicate(stonith_device_t * device)
 {
     char *key = NULL;
     char *value = NULL;
     GHashTableIter gIter;
     stonith_device_t *dup = g_hash_table_lookup(device_list, device->id);
 
     if (!dup) {
         crm_trace("No match for %s", device->id);
         return NULL;
 
     } else if (safe_str_neq(dup->agent, device->agent)) {
         crm_trace("Different agent: %s != %s", dup->agent, device->agent);
         return NULL;
     }
 
     /* Use calculate_operation_digest() here? */
     g_hash_table_iter_init(&gIter, device->params);
     while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&value)) {
 
         if(strstr(key, "CRM_meta") == key) {
             continue;
         } else if(strcmp(key, "crm_feature_set") == 0) {
             continue;
         } else {
             char *other_value = g_hash_table_lookup(dup->params, key);
 
             if (!other_value || safe_str_neq(other_value, value)) {
                 crm_trace("Different value for %s: %s != %s", key, other_value, value);
                 return NULL;
             }
         }
     }
 
     crm_trace("Match");
     return dup;
 }
 
 int
 stonith_device_register(xmlNode * msg, const char **desc, gboolean from_cib)
 {
     stonith_device_t *dup = NULL;
     stonith_device_t *device = build_device_from_xml(msg);
 
     dup = device_has_duplicate(device);
     if (dup) {
         crm_debug("Device '%s' already existed in device list (%d active devices)", device->id,
                    g_hash_table_size(device_list));
         free_device(device);
         device = dup;
 
     } else {
         stonith_device_t *old = g_hash_table_lookup(device_list, device->id);
 
         if (from_cib && old && old->api_registered) {
             /* If the cib is writing over an entry that is shared with a stonith client,
              * copy any pending ops that currently exist on the old entry to the new one.
              * Otherwise the pending ops will be reported as failures
              */
             crm_info("Overwriting an existing entry for %s from the cib", device->id);
             device->pending_ops = old->pending_ops;
             device->api_registered = TRUE;
             old->pending_ops = NULL;
             if (device->pending_ops) {
                 mainloop_set_trigger(device->work);
             }
         }
         g_hash_table_replace(device_list, device->id, device);
 
         crm_notice("Added '%s' to the device list (%d active devices)", device->id,
                    g_hash_table_size(device_list));
     }
     if (desc) {
         *desc = device->id;
     }
 
     if (from_cib) {
         device->cib_registered = TRUE;
     } else {
         device->api_registered = TRUE;
     }
 
     return pcmk_ok;
 }
 
 int
 stonith_device_remove(const char *id, gboolean from_cib)
 {
     stonith_device_t *device = g_hash_table_lookup(device_list, id);
 
     if (!device) {
         crm_info("Device '%s' not found (%d active devices)", id, g_hash_table_size(device_list));
         return pcmk_ok;
     }
 
     if (from_cib) {
         device->cib_registered = FALSE;
     } else {
         device->verified = FALSE;
         device->api_registered = FALSE;
     }
 
     if (!device->cib_registered && !device->api_registered) {
         g_hash_table_remove(device_list, id);
         crm_info("Removed '%s' from the device list (%d active devices)",
                  id, g_hash_table_size(device_list));
     }
     return pcmk_ok;
 }
 
 /*!
  * \internal
  * \brief Return the number of stonith levels registered for a node
  *
  * \param[in] tp  Node's topology table entry
  *
  * \return Number of non-NULL levels in topology entry
  * \note This function is used only for log messages.
  */
 static int
 count_active_levels(stonith_topology_t * tp)
 {
     int lpc = 0;
     int count = 0;
 
     for (lpc = 0; lpc < ST_LEVEL_MAX; lpc++) {
         if (tp->levels[lpc] != NULL) {
             count++;
         }
     }
     return count;
 }
 
 void
 free_topology_entry(gpointer data)
 {
     stonith_topology_t *tp = data;
 
     int lpc = 0;
 
     for (lpc = 0; lpc < ST_LEVEL_MAX; lpc++) {
         if (tp->levels[lpc] != NULL) {
             g_list_free_full(tp->levels[lpc], free);
         }
     }
     free(tp->node);
     free(tp);
 }
 
 /*!
  * \internal
  * \brief Register a STONITH level for a node
  *
  * Given an XML request specifying the node name, level index, and device IDs
  * for the level, this will create an entry for the node in the global topology
  * table if one does not already exist, then append the specified device IDs to
  * the entry's device list for the specified level.
  *
  * \param[in]  msg   XML request for STONITH level registration
  * \param[out] desc  If not NULL, will be set to string representation ("NODE[LEVEL]")
  *
  * \return pcmk_ok on success, -EINVAL if XML does not specify valid level index
  */
 int
 stonith_level_register(xmlNode * msg, char **desc)
 {
     int id = 0;
     int rc = pcmk_ok;
     xmlNode *child = NULL;
 
     xmlNode *level = get_xpath_object("//" F_STONITH_LEVEL, msg, LOG_ERR);
     const char *node = crm_element_value(level, F_STONITH_TARGET);
     stonith_topology_t *tp = g_hash_table_lookup(topology, node);
 
     CRM_LOG_ASSERT(node != NULL);
 
     crm_element_value_int(level, XML_ATTR_ID, &id);
     if (desc) {
         *desc = crm_strdup_printf("%s[%d]", node, id);
     }
     if (id <= 0 || id >= ST_LEVEL_MAX) {
         return -EINVAL;
     }
 
     if (tp == NULL) {
         tp = calloc(1, sizeof(stonith_topology_t));
         tp->node = strdup(node);
         g_hash_table_replace(topology, tp->node, tp);
         crm_trace("Added %s to the topology (%d active entries)", node,
                   g_hash_table_size(topology));
     }
 
     if (tp->levels[id] != NULL) {
         crm_info("Adding to the existing %s[%d] topology entry (%d active entries)", node, id,
                  count_active_levels(tp));
     }
 
     for (child = __xml_first_child(level); child != NULL; child = __xml_next(child)) {
         const char *device = ID(child);
 
         crm_trace("Adding device '%s' for %s (%d)", device, node, id);
         tp->levels[id] = g_list_append(tp->levels[id], strdup(device));
     }
 
     crm_info("Node %s has %d active fencing levels", node, count_active_levels(tp));
     return rc;
 }
 
 int
 stonith_level_remove(xmlNode * msg, char **desc)
 {
     int id = 0;
     xmlNode *level = get_xpath_object("//" F_STONITH_LEVEL, msg, LOG_ERR);
     const char *node = crm_element_value(level, F_STONITH_TARGET);
     stonith_topology_t *tp = g_hash_table_lookup(topology, node);
 
     CRM_LOG_ASSERT(node != NULL);
 
     if (desc) {
         *desc = crm_strdup_printf("%s[%d]", node, id);
     }
     crm_element_value_int(level, XML_ATTR_ID, &id);
 
     if (tp == NULL) {
         crm_info("Node %s not found (%d active entries)", node, g_hash_table_size(topology));
         return pcmk_ok;
 
     } else if (id < 0 || id >= ST_LEVEL_MAX) {
         return -EINVAL;
     }
 
     if (id == 0 && g_hash_table_remove(topology, node)) {
         crm_info("Removed all %s related entries from the topology (%d active entries)",
                  node, g_hash_table_size(topology));
 
     } else if (id > 0 && tp->levels[id] != NULL) {
         g_list_free_full(tp->levels[id], free);
         tp->levels[id] = NULL;
 
         crm_info("Removed entry '%d' from %s's topology (%d active entries remaining)",
                  id, node, count_active_levels(tp));
     }
     return pcmk_ok;
 }
 
 static int
 stonith_device_action(xmlNode * msg, char **output)
 {
     int rc = pcmk_ok;
     xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, msg, LOG_ERR);
     const char *id = crm_element_value(dev, F_STONITH_DEVICE);
 
     async_command_t *cmd = NULL;
     stonith_device_t *device = NULL;
 
     if (id) {
         crm_trace("Looking for '%s'", id);
         device = g_hash_table_lookup(device_list, id);
     }
 
     if (device && device->api_registered == FALSE) {
         rc = -ENODEV;
 
     } else if (device) {
         cmd = create_async_command(msg);
         if (cmd == NULL) {
-            free_device(device);
             return -EPROTO;
         }
 
         schedule_stonith_command(cmd, device);
         rc = -EINPROGRESS;
 
     } else {
         crm_info("Device %s not found", id ? id : "<none>");
         rc = -ENODEV;
     }
     return rc;
 }
 
 static void
 search_devices_record_result(struct device_search_s *search, const char *device, gboolean can_fence)
 {
     search->replies_received++;
 
     if (can_fence && device) {
         search->capable = g_list_append(search->capable, strdup(device));
     }
 
     if (search->replies_needed == search->replies_received) {
 
         crm_debug("Finished Search. %d devices can perform action (%s) on node %s",
                   g_list_length(search->capable),
                   search->action ? search->action : "<unknown>",
                   search->host ? search->host : "<anyone>");
 
         search->callback(search->capable, search->user_data);
         free(search->host);
         free(search->action);
         free(search);
     }
 }
 
 /*
  * \internal
  * \brief Check whether the local host is allowed to execute a fencing action
  *
  * \param[in] device         Fence device to check
  * \param[in] action         Fence action to check
  * \param[in] target         Hostname of fence target
  * \param[in] allow_suicide  Whether self-fencing is allowed for this operation
  *
  * \return TRUE if local host is allowed to execute action, FALSE otherwise
  */
 static gboolean
 localhost_is_eligible(const stonith_device_t *device, const char *action,
                       const char *target, gboolean allow_suicide)
 {
     gboolean localhost_is_target = safe_str_eq(target, stonith_our_uname);
 
     if (device && action && device->on_target_actions
         && strstr(device->on_target_actions, action)) {
         if (!localhost_is_target) {
             crm_trace("%s operation with %s can only be executed for localhost not %s",
                       action, device->id, target);
             return FALSE;
         }
 
     } else if (localhost_is_target && !allow_suicide) {
         crm_trace("%s operation does not support self-fencing", action);
         return FALSE;
     }
     return TRUE;
 }
 
 static void
 can_fence_host_with_device(stonith_device_t * dev, struct device_search_s *search)
 {
     gboolean can = FALSE;
     const char *check_type = NULL;
     const char *host = search->host;
     const char *alias = NULL;
 
     CRM_LOG_ASSERT(dev != NULL);
 
     if (dev == NULL) {
         goto search_report_results;
     } else if (host == NULL) {
         can = TRUE;
         goto search_report_results;
     }
 
     /* Short-circuit query if this host is not allowed to perform the action */
     if (safe_str_eq(search->action, "reboot")) {
         /* A "reboot" *might* get remapped to "off" then "on", so short-circuit
          * only if all three are disallowed. If only one or two are disallowed,
          * we'll report that with the results. We never allow suicide for
          * remapped "on" operations because the host is off at that point.
          */
         if (!localhost_is_eligible(dev, "reboot", host, search->allow_suicide)
             && !localhost_is_eligible(dev, "off", host, search->allow_suicide)
             && !localhost_is_eligible(dev, "on", host, FALSE)) {
             goto search_report_results;
         }
     } else if (!localhost_is_eligible(dev, search->action, host,
                                       search->allow_suicide)) {
         goto search_report_results;
     }
 
     alias = g_hash_table_lookup(dev->aliases, host);
     if (alias == NULL) {
         alias = host;
     }
 
     check_type = target_list_type(dev);
 
     if (safe_str_eq(check_type, "none")) {
         can = TRUE;
 
     } else if (safe_str_eq(check_type, "static-list")) {
 
         /* Presence in the hostmap is sufficient
          * Only use if all hosts on which the device can be active can always fence all listed hosts
          */
 
         if (string_in_list(dev->targets, host)) {
             can = TRUE;
         } else if (g_hash_table_lookup(dev->params, STONITH_ATTR_HOSTMAP)
                    && g_hash_table_lookup(dev->aliases, host)) {
             can = TRUE;
         }
 
     } else if (safe_str_eq(check_type, "dynamic-list")) {
         time_t now = time(NULL);
 
         if (dev->targets == NULL || dev->targets_age + 60 < now) {
             crm_trace("Running %s command to see if %s can fence %s (%s)",
                       check_type, dev?dev->id:"N/A", search->host, search->action);
 
             schedule_internal_command(__FUNCTION__, dev, "list", NULL,
                                       search->per_device_timeout, search, dynamic_list_search_cb);
 
             /* we'll respond to this search request async in the cb */
             return;
         }
 
         if (string_in_list(dev->targets, alias)) {
             can = TRUE;
         }
 
     } else if (safe_str_eq(check_type, "status")) {
         crm_trace("Running %s command to see if %s can fence %s (%s)",
                   check_type, dev?dev->id:"N/A", search->host, search->action);
         schedule_internal_command(__FUNCTION__, dev, "status", search->host,
                                   search->per_device_timeout, search, status_search_cb);
         /* we'll respond to this search request async in the cb */
         return;
     } else {
         crm_err("Unknown check type: %s", check_type);
     }
 
     if (safe_str_eq(host, alias)) {
         crm_notice("%s can%s fence (%s) %s: %s", dev->id, can ? "" : " not", search->action, host, check_type);
     } else {
         crm_notice("%s can%s fence (%s) %s (aka. '%s'): %s", dev->id, can ? "" : " not", search->action, host, alias,
                    check_type);
     }
 
   search_report_results:
     search_devices_record_result(search, dev ? dev->id : NULL, can);
 }
 
 static void
 search_devices(gpointer key, gpointer value, gpointer user_data)
 {
     stonith_device_t *dev = value;
     struct device_search_s *search = user_data;
 
     can_fence_host_with_device(dev, search);
 }
 
 #define DEFAULT_QUERY_TIMEOUT 20
 static void
 get_capable_devices(const char *host, const char *action, int timeout, bool suicide, void *user_data,
                     void (*callback) (GList * devices, void *user_data))
 {
     struct device_search_s *search;
     int per_device_timeout = DEFAULT_QUERY_TIMEOUT;
     int devices_needing_async_query = 0;
     char *key = NULL;
     const char *check_type = NULL;
     GHashTableIter gIter;
     stonith_device_t *device = NULL;
 
     if (!g_hash_table_size(device_list)) {
         callback(NULL, user_data);
         return;
     }
 
     search = calloc(1, sizeof(struct device_search_s));
     if (!search) {
         callback(NULL, user_data);
         return;
     }
 
     g_hash_table_iter_init(&gIter, device_list);
     while (g_hash_table_iter_next(&gIter, (void **)&key, (void **)&device)) {
         check_type = target_list_type(device);
         if (safe_str_eq(check_type, "status") || safe_str_eq(check_type, "dynamic-list")) {
             devices_needing_async_query++;
         }
     }
 
     /* If we have devices that require an async event in order to know what
      * nodes they can fence, we have to give the events a timeout. The total
      * query timeout is divided among those events. */
     if (devices_needing_async_query) {
         per_device_timeout = timeout / devices_needing_async_query;
         if (!per_device_timeout) {
             crm_err("STONITH timeout %ds is too low; using %ds, but consider raising to at least %ds",
                     timeout, DEFAULT_QUERY_TIMEOUT,
                     DEFAULT_QUERY_TIMEOUT * devices_needing_async_query);
             per_device_timeout = DEFAULT_QUERY_TIMEOUT;
         } else if (per_device_timeout < DEFAULT_QUERY_TIMEOUT) {
             crm_notice("STONITH timeout %ds is low for the current configuration;"
                        " consider raising to at least %ds",
                        timeout, DEFAULT_QUERY_TIMEOUT * devices_needing_async_query);
         }
     }
 
     search->host = host ? strdup(host) : NULL;
     search->action = action ? strdup(action) : NULL;
     search->per_device_timeout = per_device_timeout;
     /* We are guaranteed this many replies. Even if a device gets
      * unregistered some how during the async search, we will get
      * the correct number of replies. */
     search->replies_needed = g_hash_table_size(device_list);
     search->allow_suicide = suicide;
     search->callback = callback;
     search->user_data = user_data;
     /* kick off the search */
 
     crm_debug("Searching through %d devices to see what is capable of action (%s) for target %s",
               search->replies_needed,
               search->action ? search->action : "<unknown>",
               search->host ? search->host : "<anyone>");
     g_hash_table_foreach(device_list, search_devices, search);
 }
 
 struct st_query_data {
     xmlNode *reply;
     char *remote_peer;
     char *client_id;
     char *target;
     char *action;
     int call_options;
 };
 
 /*
  * \internal
  * \brief Add action-specific attributes to query reply XML
  *
  * \param[in,out] xml     XML to add attributes to
  * \param[in]     action  Fence action
  * \param[in]     device  Fence device
  */
 static void
 add_action_specific_attributes(xmlNode *xml, const char *action,
                                stonith_device_t *device)
 {
     int action_specific_timeout;
     int delay_max;
 
     CRM_CHECK(xml && action && device, return);
 
     if (is_action_required(action, device)) {
         crm_trace("Action %s is required on %s", action, device->id);
         crm_xml_add_int(xml, F_STONITH_DEVICE_REQUIRED, 1);
     }
 
     action_specific_timeout = get_action_timeout(device, action, 0);
     if (action_specific_timeout) {
         crm_trace("Action %s has timeout %dms on %s",
                   action, action_specific_timeout, device->id);
         crm_xml_add_int(xml, F_STONITH_ACTION_TIMEOUT, action_specific_timeout);
     }
 
     delay_max = get_action_delay_max(device, action);
     if (delay_max > 0) {
         crm_trace("Action %s has maximum random delay %dms on %s",
                   action, delay_max, device->id);
         crm_xml_add_int(xml, F_STONITH_DELAY_MAX, delay_max / 1000);
     }
 }
 
 /*
  * \internal
  * \brief Add "disallowed" attribute to query reply XML if appropriate
  *
  * \param[in,out] xml            XML to add attribute to
  * \param[in]     action         Fence action
  * \param[in]     device         Fence device
  * \param[in]     target         Fence target
  * \param[in]     allow_suicide  Whether self-fencing is allowed
  */
 static void
 add_disallowed(xmlNode *xml, const char *action, stonith_device_t *device,
                const char *target, gboolean allow_suicide)
 {
     if (!localhost_is_eligible(device, action, target, allow_suicide)) {
         crm_trace("Action %s on %s is disallowed for local host",
                   action, device->id);
         crm_xml_add(xml, F_STONITH_ACTION_DISALLOWED, XML_BOOLEAN_TRUE);
     }
 }
 
 /*
  * \internal
  * \brief Add child element with action-specific values to query reply XML
  *
  * \param[in,out] xml            XML to add attribute to
  * \param[in]     action         Fence action
  * \param[in]     device         Fence device
  * \param[in]     target         Fence target
  * \param[in]     allow_suicide  Whether self-fencing is allowed
  */
 static void
 add_action_reply(xmlNode *xml, const char *action, stonith_device_t *device,
                const char *target, gboolean allow_suicide)
 {
     xmlNode *child = create_xml_node(xml, F_STONITH_ACTION);
 
     crm_xml_add(child, XML_ATTR_ID, action);
     add_action_specific_attributes(child, action, device);
     add_disallowed(child, action, device, target, allow_suicide);
 }
 
 static void
 stonith_query_capable_device_cb(GList * devices, void *user_data)
 {
     struct st_query_data *query = user_data;
     int available_devices = 0;
     xmlNode *dev = NULL;
     xmlNode *list = NULL;
     GListPtr lpc = NULL;
 
     /* Pack the results into XML */
     list = create_xml_node(NULL, __FUNCTION__);
     crm_xml_add(list, F_STONITH_TARGET, query->target);
     for (lpc = devices; lpc != NULL; lpc = lpc->next) {
         stonith_device_t *device = g_hash_table_lookup(device_list, lpc->data);
         const char *action = query->action;
 
         if (!device) {
             /* It is possible the device got unregistered while
              * determining who can fence the target */
             continue;
         }
 
         available_devices++;
 
         dev = create_xml_node(list, F_STONITH_DEVICE);
         crm_xml_add(dev, XML_ATTR_ID, device->id);
         crm_xml_add(dev, "namespace", device->namespace);
         crm_xml_add(dev, "agent", device->agent);
         crm_xml_add_int(dev, F_STONITH_DEVICE_VERIFIED, device->verified);
 
         /* If the originating stonithd wants to reboot the node, and we have a
          * capable device that doesn't support "reboot", remap to "off" instead.
          */
         if (is_not_set(device->flags, st_device_supports_reboot)
             && safe_str_eq(query->action, "reboot")) {
             crm_trace("%s doesn't support reboot, using values for off instead",
                       device->id);
             action = "off";
         }
 
         /* Add action-specific values if available */
         add_action_specific_attributes(dev, action, device);
         if (safe_str_eq(query->action, "reboot")) {
             /* A "reboot" *might* get remapped to "off" then "on", so after
              * sending the "reboot"-specific values in the main element, we add
              * sub-elements for "off" and "on" values.
              *
              * We short-circuited earlier if "reboot", "off" and "on" are all
              * disallowed for the local host. However if only one or two are
              * disallowed, we send back the results and mark which ones are
              * disallowed. If "reboot" is disallowed, this might cause problems
              * with older stonithd versions, which won't check for it. Older
              * versions will ignore "off" and "on", so they are not a problem.
              */
             add_disallowed(dev, action, device, query->target,
                            is_set(query->call_options, st_opt_allow_suicide));
             add_action_reply(dev, "off", device, query->target,
                              is_set(query->call_options, st_opt_allow_suicide));
             add_action_reply(dev, "on", device, query->target, FALSE);
         }
 
         /* A query without a target wants device parameters */
         if (query->target == NULL) {
             xmlNode *attrs = create_xml_node(dev, XML_TAG_ATTRS);
 
             g_hash_table_foreach(device->params, hash2field, attrs);
         }
     }
 
     crm_xml_add_int(list, F_STONITH_AVAILABLE_DEVICES, available_devices);
     if (query->target) {
         crm_debug("Found %d matching devices for '%s'", available_devices, query->target);
     } else {
         crm_debug("%d devices installed", available_devices);
     }
 
     if (list != NULL) {
         crm_log_xml_trace(list, "Add query results");
         add_message_xml(query->reply, F_STONITH_CALLDATA, list);
     }
     stonith_send_reply(query->reply, query->call_options, query->remote_peer, query->client_id);
 
     free_xml(query->reply);
     free(query->remote_peer);
     free(query->client_id);
     free(query->target);
     free(query->action);
     free(query);
     free_xml(list);
     g_list_free_full(devices, free);
 }
 
 static void
 stonith_query(xmlNode * msg, const char *remote_peer, const char *client_id, int call_options)
 {
     struct st_query_data *query = NULL;
     const char *action = NULL;
     const char *target = NULL;
     int timeout = 0;
     xmlNode *dev = get_xpath_object("//@" F_STONITH_ACTION, msg, LOG_DEBUG_3);
 
     crm_element_value_int(msg, F_STONITH_TIMEOUT, &timeout);
     if (dev) {
         const char *device = crm_element_value(dev, F_STONITH_DEVICE);
 
         target = crm_element_value(dev, F_STONITH_TARGET);
         action = crm_element_value(dev, F_STONITH_ACTION);
         if (device && safe_str_eq(device, "manual_ack")) {
             /* No query or reply necessary */
             return;
         }
     }
 
     crm_log_xml_debug(msg, "Query");
     query = calloc(1, sizeof(struct st_query_data));
 
     query->reply = stonith_construct_reply(msg, NULL, NULL, pcmk_ok);
     query->remote_peer = remote_peer ? strdup(remote_peer) : NULL;
     query->client_id = client_id ? strdup(client_id) : NULL;
     query->target = target ? strdup(target) : NULL;
     query->action = action ? strdup(action) : NULL;
     query->call_options = call_options;
 
     get_capable_devices(target, action, timeout,
                         is_set(call_options, st_opt_allow_suicide),
                         query, stonith_query_capable_device_cb);
 }
 
 #define ST_LOG_OUTPUT_MAX 512
 static void
 log_operation(async_command_t * cmd, int rc, int pid, const char *next, const char *output)
 {
     if (rc == 0) {
         next = NULL;
     }
 
     if (cmd->victim != NULL) {
         do_crm_log(rc == 0 ? LOG_NOTICE : LOG_ERR,
                    "Operation '%s' [%d] (call %d from %s) for host '%s' with device '%s' returned: %d (%s)%s%s",
                    cmd->action, pid, cmd->id, cmd->client_name, cmd->victim, cmd->device, rc,
                    pcmk_strerror(rc), next ? ". Trying: " : "", next ? next : "");
     } else {
         do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE,
                             "Operation '%s' [%d] for device '%s' returned: %d (%s)%s%s",
                             cmd->action, pid, cmd->device, rc, pcmk_strerror(rc),
                             next ? ". Trying: " : "", next ? next : "");
     }
 
     if (output) {
         /* Logging the whole string confuses syslog when the string is xml */
         char *prefix = crm_strdup_printf("%s:%d", cmd->device, pid);
 
         crm_log_output(rc == 0 ? LOG_DEBUG : LOG_WARNING, prefix, output);
         free(prefix);
     }
 }
 
 static void
 stonith_send_async_reply(async_command_t * cmd, const char *output, int rc, GPid pid)
 {
     xmlNode *reply = NULL;
     gboolean bcast = FALSE;
 
     reply = stonith_construct_async_reply(cmd, output, NULL, rc);
 
     if (safe_str_eq(cmd->action, "metadata")) {
         /* Too verbose to log */
         crm_trace("Metadata query for %s", cmd->device);
         output = NULL;
 
     } else if (crm_str_eq(cmd->action, "monitor", TRUE) ||
                crm_str_eq(cmd->action, "list", TRUE) || crm_str_eq(cmd->action, "status", TRUE)) {
         crm_trace("Never broadcast %s replies", cmd->action);
 
     } else if (!stand_alone && safe_str_eq(cmd->origin, cmd->victim) && safe_str_neq(cmd->action, "on")) {
         crm_trace("Broadcast %s reply for %s", cmd->action, cmd->victim);
         crm_xml_add(reply, F_SUBTYPE, "broadcast");
         bcast = TRUE;
     }
 
     log_operation(cmd, rc, pid, NULL, output);
     crm_log_xml_trace(reply, "Reply");
 
     if (bcast) {
         crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY);
         send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE);
 
     } else if (cmd->origin) {
         crm_trace("Directed reply to %s", cmd->origin);
         send_cluster_message(crm_get_peer(0, cmd->origin), crm_msg_stonith_ng, reply, FALSE);
 
     } else {
         crm_trace("Directed local %ssync reply to %s",
                   (cmd->options & st_opt_sync_call) ? "" : "a-", cmd->client_name);
         do_local_reply(reply, cmd->client, cmd->options & st_opt_sync_call, FALSE);
     }
 
     if (stand_alone) {
         /* Do notification with a clean data object */
         xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
 
         crm_xml_add_int(notify_data, F_STONITH_RC, rc);
         crm_xml_add(notify_data, F_STONITH_TARGET, cmd->victim);
         crm_xml_add(notify_data, F_STONITH_OPERATION, cmd->op);
         crm_xml_add(notify_data, F_STONITH_DELEGATE, "localhost");
         crm_xml_add(notify_data, F_STONITH_DEVICE, cmd->device);
         crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id);
         crm_xml_add(notify_data, F_STONITH_ORIGIN, cmd->client);
 
         do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data);
     }
 
     free_xml(reply);
 }
 
 void
 unfence_cb(GPid pid, int rc, const char *output, gpointer user_data)
 {
     async_command_t * cmd = user_data;
     stonith_device_t *dev = g_hash_table_lookup(device_list, cmd->device);
 
     log_operation(cmd, rc, pid, NULL, output);
 
     if(dev) {
         dev->active_pid = 0;
         mainloop_set_trigger(dev->work);
     } else {
         crm_trace("Device %s does not exist", cmd->device);
     }
 
     if(rc != 0) {
         crm_exit(DAEMON_RESPAWN_STOP);
     }
 }
 
 static void
 cancel_stonith_command(async_command_t * cmd)
 {
     stonith_device_t *device;
 
     CRM_CHECK(cmd != NULL, return);
 
     if (!cmd->device) {
         return;
     }
 
     device = g_hash_table_lookup(device_list, cmd->device);
 
     if (device) {
         crm_trace("Cancel scheduled %s on %s", cmd->action, device->id);
         device->pending_ops = g_list_remove(device->pending_ops, cmd);
     }
 }
 
 static void
 st_child_done(GPid pid, int rc, const char *output, gpointer user_data)
 {
     stonith_device_t *device = NULL;
     stonith_device_t *next_device = NULL;
     async_command_t *cmd = user_data;
 
     GListPtr gIter = NULL;
     GListPtr gIterNext = NULL;
 
     CRM_CHECK(cmd != NULL, return);
 
     /* The device is ready to do something else now */
     device = g_hash_table_lookup(device_list, cmd->device);
     if (device) {
         device->active_pid = 0;
         if (rc == pcmk_ok &&
             (safe_str_eq(cmd->action, "list") ||
              safe_str_eq(cmd->action, "monitor") || safe_str_eq(cmd->action, "status"))) {
 
             device->verified = TRUE;
         }
 
         mainloop_set_trigger(device->work);
     }
 
     crm_debug("Operation '%s' on '%s' completed with rc=%d (%d remaining)",
               cmd->action, cmd->device, rc, g_list_length(cmd->device_next));
 
     if (rc == 0) {
         GListPtr iter;
         /* see if there are any required devices left to execute for this op */
         for (iter = cmd->device_next; iter != NULL; iter = iter->next) {
             next_device = g_hash_table_lookup(device_list, iter->data);
 
             if (next_device != NULL && is_action_required(cmd->action, next_device)) {
                 cmd->device_next = iter->next;
                 break;
             }
             next_device = NULL;
         }
 
     } else if (rc != 0 && cmd->device_next && (is_action_required(cmd->action, device) == FALSE)) {
         /* if this device didn't work out, see if there are any others we can try.
          * if the failed device was 'required', we can't pick another device. */
         next_device = g_hash_table_lookup(device_list, cmd->device_next->data);
         cmd->device_next = cmd->device_next->next;
     }
 
     /* this operation requires more fencing, hooray! */
     if (next_device) {
         log_operation(cmd, rc, pid, cmd->device, output);
 
         schedule_stonith_command(cmd, next_device);
         /* Prevent cmd from being freed */
         cmd = NULL;
         goto done;
     }
 
     if (rc > 0) {
         /* Try to provide _something_ useful */
         if(output == NULL) {
             rc = -ENODATA;
 
         } else if(strstr(output, "imed out")) {
             rc = -ETIMEDOUT;
 
         } else if(strstr(output, "Unrecognised action")) {
             rc = -EOPNOTSUPP;
 
         } else {
             rc = -pcmk_err_generic;
         }
     }
 
     stonith_send_async_reply(cmd, output, rc, pid);
 
     if (rc != 0) {
         goto done;
     }
 
     /* Check to see if any operations are scheduled to do the exact
      * same thing that just completed.  If so, rather than
      * performing the same fencing operation twice, return the result
      * of this operation for all pending commands it matches. */
     for (gIter = cmd_list; gIter != NULL; gIter = gIterNext) {
         async_command_t *cmd_other = gIter->data;
 
         gIterNext = gIter->next;
 
         if (cmd == cmd_other) {
             continue;
         }
 
         /* A pending scheduled command matches the command that just finished if.
          * 1. The client connections are different.
          * 2. The node victim is the same.
          * 3. The fencing action is the same.
          * 4. The device scheduled to execute the action is the same.
          */
         if (safe_str_eq(cmd->client, cmd_other->client) ||
             safe_str_neq(cmd->victim, cmd_other->victim) ||
             safe_str_neq(cmd->action, cmd_other->action) ||
             safe_str_neq(cmd->device, cmd_other->device)) {
 
             continue;
         }
 
         /* Duplicate merging will do the right thing for either type of remapped
          * reboot. If the executing stonithd remapped an unsupported reboot to
          * off, then cmd->action will be reboot and will be merged with any
          * other reboot requests. If the originating stonithd remapped a
          * topology reboot to off then on, we will get here once with
          * cmd->action "off" and once with "on", and they will be merged
          * separately with similar requests.
          */
         crm_notice
             ("Merging stonith action %s for node %s originating from client %s with identical stonith request from client %s",
              cmd_other->action, cmd_other->victim, cmd_other->client_name, cmd->client_name);
 
         cmd_list = g_list_remove_link(cmd_list, gIter);
 
         stonith_send_async_reply(cmd_other, output, rc, pid);
         cancel_stonith_command(cmd_other);
 
         free_async_command(cmd_other);
         g_list_free_1(gIter);
     }
 
   done:
     free_async_command(cmd);
 }
 
 static gint
 sort_device_priority(gconstpointer a, gconstpointer b)
 {
     const stonith_device_t *dev_a = a;
     const stonith_device_t *dev_b = b;
 
     if (dev_a->priority > dev_b->priority) {
         return -1;
     } else if (dev_a->priority < dev_b->priority) {
         return 1;
     }
     return 0;
 }
 
 static void
 stonith_fence_get_devices_cb(GList * devices, void *user_data)
 {
     async_command_t *cmd = user_data;
     stonith_device_t *device = NULL;
 
     crm_info("Found %d matching devices for '%s'", g_list_length(devices), cmd->victim);
 
     if (g_list_length(devices) > 0) {
         /* Order based on priority */
         devices = g_list_sort(devices, sort_device_priority);
         device = g_hash_table_lookup(device_list, devices->data);
 
         if (device) {
             cmd->device_list = devices;
             cmd->device_next = devices->next;
             devices = NULL;     /* list owned by cmd now */
         }
     }
 
     /* we have a device, schedule it for fencing. */
     if (device) {
         schedule_stonith_command(cmd, device);
         /* in progress */
         return;
     }
 
     /* no device found! */
     stonith_send_async_reply(cmd, NULL, -ENODEV, 0);
 
     free_async_command(cmd);
     g_list_free_full(devices, free);
 }
 
 static int
 stonith_fence(xmlNode * msg)
 {
     const char *device_id = NULL;
     stonith_device_t *device = NULL;
     async_command_t *cmd = create_async_command(msg);
     xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
 
     if (cmd == NULL) {
         return -EPROTO;
     }
 
     device_id = crm_element_value(dev, F_STONITH_DEVICE);
     if (device_id) {
         device = g_hash_table_lookup(device_list, device_id);
         if (device == NULL) {
             crm_err("Requested device '%s' is not available", device_id);
             return -ENODEV;
         }
         schedule_stonith_command(cmd, device);
 
     } else {
         const char *host = crm_element_value(dev, F_STONITH_TARGET);
 
         if (cmd->options & st_opt_cs_nodeid) {
             int nodeid = crm_atoi(host, NULL);
             crm_node_t *node = crm_get_peer(nodeid, NULL);
 
             if (node) {
                 host = node->uname;
             }
         }
 
         /* If we get to here, then self-fencing is implicitly allowed */
         get_capable_devices(host, cmd->action, cmd->default_timeout,
                             TRUE, cmd, stonith_fence_get_devices_cb);
     }
 
     return -EINPROGRESS;
 }
 
 xmlNode *
 stonith_construct_reply(xmlNode * request, const char *output, xmlNode * data, int rc)
 {
     int lpc = 0;
     xmlNode *reply = NULL;
 
     const char *name = NULL;
     const char *value = NULL;
 
     const char *names[] = {
         F_STONITH_OPERATION,
         F_STONITH_CALLID,
         F_STONITH_CLIENTID,
         F_STONITH_CLIENTNAME,
         F_STONITH_REMOTE_OP_ID,
         F_STONITH_CALLOPTS
     };
 
     crm_trace("Creating a basic reply");
     reply = create_xml_node(NULL, T_STONITH_REPLY);
 
     crm_xml_add(reply, "st_origin", __FUNCTION__);
     crm_xml_add(reply, F_TYPE, T_STONITH_NG);
     crm_xml_add(reply, "st_output", output);
     crm_xml_add_int(reply, F_STONITH_RC, rc);
 
     CRM_CHECK(request != NULL, crm_warn("Can't create a sane reply"); return reply);
     for (lpc = 0; lpc < DIMOF(names); lpc++) {
         name = names[lpc];
         value = crm_element_value(request, name);
         crm_xml_add(reply, name, value);
     }
 
     if (data != NULL) {
         crm_trace("Attaching reply output");
         add_message_xml(reply, F_STONITH_CALLDATA, data);
     }
     return reply;
 }
 
 static xmlNode *
 stonith_construct_async_reply(async_command_t * cmd, const char *output, xmlNode * data, int rc)
 {
     xmlNode *reply = NULL;
 
     crm_trace("Creating a basic reply");
     reply = create_xml_node(NULL, T_STONITH_REPLY);
 
     crm_xml_add(reply, "st_origin", __FUNCTION__);
     crm_xml_add(reply, F_TYPE, T_STONITH_NG);
 
     crm_xml_add(reply, F_STONITH_OPERATION, cmd->op);
     crm_xml_add(reply, F_STONITH_DEVICE, cmd->device);
     crm_xml_add(reply, F_STONITH_REMOTE_OP_ID, cmd->remote_op_id);
     crm_xml_add(reply, F_STONITH_CLIENTID, cmd->client);
     crm_xml_add(reply, F_STONITH_CLIENTNAME, cmd->client_name);
     crm_xml_add(reply, F_STONITH_TARGET, cmd->victim);
     crm_xml_add(reply, F_STONITH_ACTION, cmd->op);
     crm_xml_add(reply, F_STONITH_ORIGIN, cmd->origin);
     crm_xml_add_int(reply, F_STONITH_CALLID, cmd->id);
     crm_xml_add_int(reply, F_STONITH_CALLOPTS, cmd->options);
 
     crm_xml_add_int(reply, F_STONITH_RC, rc);
 
     crm_xml_add(reply, "st_output", output);
 
     if (data != NULL) {
         crm_info("Attaching reply output");
         add_message_xml(reply, F_STONITH_CALLDATA, data);
     }
     return reply;
 }
 
 bool fencing_peer_active(crm_node_t *peer)
 {
     if (peer == NULL) {
         return FALSE;
     } else if (peer->uname == NULL) {
         return FALSE;
     } else if (is_set(peer->processes, crm_get_cluster_proc())) {
         return TRUE;
     }
     return FALSE;
 }
 
 /*!
  * \internal
  * \brief Determine if we need to use an alternate node to
  * fence the target. If so return that node's uname
  *
  * \retval NULL, no alternate host
  * \retval uname, uname of alternate host to use
  */
 static const char *
 check_alternate_host(const char *target)
 {
     const char *alternate_host = NULL;
 
     if (find_topology_for_host(target) && safe_str_eq(target, stonith_our_uname)) {
         GHashTableIter gIter;
         crm_node_t *entry = NULL;
 
         g_hash_table_iter_init(&gIter, crm_peer_cache);
         while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
             crm_trace("Checking for %s.%d != %s", entry->uname, entry->id, target);
             if (fencing_peer_active(entry)
                 && safe_str_neq(entry->uname, target)) {
                 alternate_host = entry->uname;
                 break;
             }
         }
         if (alternate_host == NULL) {
             crm_err("No alternate host available to handle complex self fencing request");
             g_hash_table_iter_init(&gIter, crm_peer_cache);
             while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
                 crm_notice("Peer[%d] %s", entry->id, entry->uname);
             }
         }
     }
 
     return alternate_host;
 }
 
 static void
 stonith_send_reply(xmlNode * reply, int call_options, const char *remote_peer,
                    const char *client_id)
 {
     if (remote_peer) {
         send_cluster_message(crm_get_peer(0, remote_peer), crm_msg_stonith_ng, reply, FALSE);
     } else {
         do_local_reply(reply, client_id, is_set(call_options, st_opt_sync_call), remote_peer != NULL);
     }
 }
 
 static int
 handle_request(crm_client_t * client, uint32_t id, uint32_t flags, xmlNode * request,
                const char *remote_peer)
 {
     int call_options = 0;
     int rc = -EOPNOTSUPP;
 
     xmlNode *data = NULL;
     xmlNode *reply = NULL;
 
     char *output = NULL;
     const char *op = crm_element_value(request, F_STONITH_OPERATION);
     const char *client_id = crm_element_value(request, F_STONITH_CLIENTID);
 
     crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
 
     if (is_set(call_options, st_opt_sync_call)) {
         CRM_ASSERT(client == NULL || client->request_id == id);
     }
 
     if (crm_str_eq(op, CRM_OP_REGISTER, TRUE)) {
         xmlNode *reply = create_xml_node(NULL, "reply");
 
         CRM_ASSERT(client);
         crm_xml_add(reply, F_STONITH_OPERATION, CRM_OP_REGISTER);
         crm_xml_add(reply, F_STONITH_CLIENTID, client->id);
         crm_ipcs_send(client, id, reply, flags);
         client->request_id = 0;
         free_xml(reply);
         return 0;
 
     } else if (crm_str_eq(op, STONITH_OP_EXEC, TRUE)) {
         rc = stonith_device_action(request, &output);
 
     } else if (crm_str_eq(op, STONITH_OP_TIMEOUT_UPDATE, TRUE)) {
         const char *call_id = crm_element_value(request, F_STONITH_CALLID);
         const char *client_id = crm_element_value(request, F_STONITH_CLIENTID);
         int op_timeout = 0;
 
         crm_element_value_int(request, F_STONITH_TIMEOUT, &op_timeout);
         do_stonith_async_timeout_update(client_id, call_id, op_timeout);
         return 0;
 
     } else if (crm_str_eq(op, STONITH_OP_QUERY, TRUE)) {
         if (remote_peer) {
             create_remote_stonith_op(client_id, request, TRUE); /* Record it for the future notification */
         }
         stonith_query(request, remote_peer, client_id, call_options);
         return 0;
 
     } else if (crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) {
         const char *flag_name = NULL;
 
         CRM_ASSERT(client);
         flag_name = crm_element_value(request, F_STONITH_NOTIFY_ACTIVATE);
         if (flag_name) {
             crm_debug("Setting %s callbacks for %s (%s): ON", flag_name, client->name, client->id);
             client->options |= get_stonith_flag(flag_name);
         }
 
         flag_name = crm_element_value(request, F_STONITH_NOTIFY_DEACTIVATE);
         if (flag_name) {
             crm_debug("Setting %s callbacks for %s (%s): off", flag_name, client->name, client->id);
             client->options |= get_stonith_flag(flag_name);
         }
 
         if (flags & crm_ipc_client_response) {
             crm_ipcs_send_ack(client, id, flags, "ack", __FUNCTION__, __LINE__);
         }
         return 0;
 
     } else if (crm_str_eq(op, STONITH_OP_RELAY, TRUE)) {
         xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE);
 
         crm_notice("Peer %s has received a forwarded fencing request from %s to fence (%s) peer %s",
                    stonith_our_uname,
                    client ? client->name : remote_peer,
                    crm_element_value(dev, F_STONITH_ACTION),
                    crm_element_value(dev, F_STONITH_TARGET));
 
         if (initiate_remote_stonith_op(NULL, request, FALSE) != NULL) {
             rc = -EINPROGRESS;
         }
 
     } else if (crm_str_eq(op, STONITH_OP_FENCE, TRUE)) {
 
         if (remote_peer || stand_alone) {
             rc = stonith_fence(request);
 
         } else if (call_options & st_opt_manual_ack) {
             remote_fencing_op_t *rop = NULL;
             xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE);
             const char *target = crm_element_value(dev, F_STONITH_TARGET);
 
             crm_notice("Received manual confirmation that %s is fenced", target);
             rop = initiate_remote_stonith_op(client, request, TRUE);
             rc = stonith_manual_ack(request, rop);
 
         } else {
             const char *alternate_host = NULL;
             xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE);
             const char *target = crm_element_value(dev, F_STONITH_TARGET);
             const char *action = crm_element_value(dev, F_STONITH_ACTION);
             const char *device = crm_element_value(dev, F_STONITH_DEVICE);
 
             if (client) {
                 int tolerance = 0;
 
                 crm_notice("Client %s.%.8s wants to fence (%s) '%s' with device '%s'",
                            client->name, client->id, action, target, device ? device : "(any)");
 
                 crm_element_value_int(dev, F_STONITH_TOLERANCE, &tolerance);
 
                 if (stonith_check_fence_tolerance(tolerance, target, action)) {
                     rc = 0;
                     goto done;
                 }
 
             } else {
                 crm_notice("Peer %s wants to fence (%s) '%s' with device '%s'",
                            remote_peer, action, target, device ? device : "(any)");
             }
 
             alternate_host = check_alternate_host(target);
 
             if (alternate_host && client) {
                 const char *client_id = NULL;
 
                 crm_notice("Forwarding complex self fencing request to peer %s", alternate_host);
 
                 if (client) {
                     client_id = client->id;
                 } else {
                     client_id = crm_element_value(request, F_STONITH_CLIENTID);
                 }
                 /* Create a record of it, otherwise call_id will be 0 if we need to notify of failures */
                 create_remote_stonith_op(client_id, request, FALSE);
 
                 crm_xml_add(request, F_STONITH_OPERATION, STONITH_OP_RELAY);
                 crm_xml_add(request, F_STONITH_CLIENTID, client->id);
                 send_cluster_message(crm_get_peer(0, alternate_host), crm_msg_stonith_ng, request,
                                      FALSE);
                 rc = -EINPROGRESS;
 
             } else if (initiate_remote_stonith_op(client, request, FALSE) != NULL) {
                 rc = -EINPROGRESS;
             }
         }
 
     } else if (crm_str_eq(op, STONITH_OP_FENCE_HISTORY, TRUE)) {
         rc = stonith_fence_history(request, &data);
 
     } else if (crm_str_eq(op, STONITH_OP_DEVICE_ADD, TRUE)) {
         const char *id = NULL;
         xmlNode *notify_data = create_xml_node(NULL, op);
 
         rc = stonith_device_register(request, &id, FALSE);
 
         crm_xml_add(notify_data, F_STONITH_DEVICE, id);
         crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list));
 
         do_stonith_notify(call_options, op, rc, notify_data);
         free_xml(notify_data);
 
     } else if (crm_str_eq(op, STONITH_OP_DEVICE_DEL, TRUE)) {
         xmlNode *dev = get_xpath_object("//" F_STONITH_DEVICE, request, LOG_ERR);
         const char *id = crm_element_value(dev, XML_ATTR_ID);
         xmlNode *notify_data = create_xml_node(NULL, op);
 
         rc = stonith_device_remove(id, FALSE);
 
         crm_xml_add(notify_data, F_STONITH_DEVICE, id);
         crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(device_list));
 
         do_stonith_notify(call_options, op, rc, notify_data);
         free_xml(notify_data);
 
     } else if (crm_str_eq(op, STONITH_OP_LEVEL_ADD, TRUE)) {
         char *id = NULL;
         xmlNode *notify_data = create_xml_node(NULL, op);
 
         rc = stonith_level_register(request, &id);
 
         crm_xml_add(notify_data, F_STONITH_DEVICE, id);
         crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology));
 
         do_stonith_notify(call_options, op, rc, notify_data);
         free_xml(notify_data);
         free(id);
 
     } else if (crm_str_eq(op, STONITH_OP_LEVEL_DEL, TRUE)) {
         char *id = NULL;
         xmlNode *notify_data = create_xml_node(NULL, op);
 
         rc = stonith_level_remove(request, &id);
 
         crm_xml_add(notify_data, F_STONITH_DEVICE, id);
         crm_xml_add_int(notify_data, F_STONITH_ACTIVE, g_hash_table_size(topology));
 
         do_stonith_notify(call_options, op, rc, notify_data);
         free_xml(notify_data);
 
     } else if (crm_str_eq(op, STONITH_OP_CONFIRM, TRUE)) {
         async_command_t *cmd = create_async_command(request);
         xmlNode *reply = stonith_construct_async_reply(cmd, NULL, NULL, 0);
 
         crm_xml_add(reply, F_STONITH_OPERATION, T_STONITH_NOTIFY);
         crm_notice("Broadcasting manual fencing confirmation for node %s", cmd->victim);
         send_cluster_message(NULL, crm_msg_stonith_ng, reply, FALSE);
 
         free_async_command(cmd);
         free_xml(reply);
 
     } else if(safe_str_eq(op, CRM_OP_RM_NODE_CACHE)) {
         int id = 0;
         const char *name = NULL;
 
         crm_element_value_int(request, XML_ATTR_ID, &id);
         name = crm_element_value(request, XML_ATTR_UNAME);
         reap_crm_member(id, name);
 
         return pcmk_ok;
 
     } else {
         crm_err("Unknown %s from %s", op, client ? client->name : remote_peer);
         crm_log_xml_warn(request, "UnknownOp");
     }
 
   done:
 
     /* Always reply unless the request is in process still.
      * If in progress, a reply will happen async after the request
      * processing is finished */
     if (rc != -EINPROGRESS) {
         crm_trace("Reply handling: %p %u %u %d %d %s", client, client?client->request_id:0,
                   id, is_set(call_options, st_opt_sync_call), call_options,
                   crm_element_value(request, F_STONITH_CALLOPTS));
 
         if (is_set(call_options, st_opt_sync_call)) {
             CRM_ASSERT(client == NULL || client->request_id == id);
         }
         reply = stonith_construct_reply(request, output, data, rc);
         stonith_send_reply(reply, call_options, remote_peer, client_id);
     }
 
     free(output);
     free_xml(data);
     free_xml(reply);
 
     return rc;
 }
 
 static void
 handle_reply(crm_client_t * client, xmlNode * request, const char *remote_peer)
 {
     const char *op = crm_element_value(request, F_STONITH_OPERATION);
 
     if (crm_str_eq(op, STONITH_OP_QUERY, TRUE)) {
         process_remote_stonith_query(request);
     } else if (crm_str_eq(op, T_STONITH_NOTIFY, TRUE)) {
         process_remote_stonith_exec(request);
     } else if (crm_str_eq(op, STONITH_OP_FENCE, TRUE)) {
         /* Reply to a complex fencing op */
         process_remote_stonith_exec(request);
     } else {
         crm_err("Unknown %s reply from %s", op, client ? client->name : remote_peer);
         crm_log_xml_warn(request, "UnknownOp");
     }
 }
 
 void
 stonith_command(crm_client_t * client, uint32_t id, uint32_t flags, xmlNode * request,
                 const char *remote_peer)
 {
     int call_options = 0;
     int rc = 0;
     gboolean is_reply = FALSE;
 
     /* Copy op for reporting. The original might get freed by handle_reply()
      * before we use it in crm_debug():
      *     handle_reply()
      *     |- process_remote_stonith_exec()
      *     |-- remote_op_done()
      *     |--- handle_local_reply_and_notify()
      *     |---- crm_xml_add(...F_STONITH_OPERATION...)
      *     |--- free_xml(op->request)
      */
     char *op = crm_element_value_copy(request, F_STONITH_OPERATION);
 
     if (get_xpath_object("//" T_STONITH_REPLY, request, LOG_DEBUG_3)) {
         is_reply = TRUE;
     }
 
     crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
     crm_debug("Processing %s%s %u from %s (%16x)", op, is_reply ? " reply" : "",
               id, client ? client->name : remote_peer, call_options);
 
     if (is_set(call_options, st_opt_sync_call)) {
         CRM_ASSERT(client == NULL || client->request_id == id);
     }
 
     if (is_reply) {
         handle_reply(client, request, remote_peer);
     } else {
         rc = handle_request(client, id, flags, request, remote_peer);
     }
 
     crm_debug("Processed %s%s from %s: %s (%d)", op,
               is_reply ? " reply" : "", client ? client->name : remote_peer,
               rc > 0 ? "" : pcmk_strerror(rc), rc);
 
     free(op);
 }
diff --git a/lib/services/upstart.c b/lib/services/upstart.c
index 365af6a117..e0047e23f6 100644
--- a/lib/services/upstart.c
+++ b/lib/services/upstart.c
@@ -1,573 +1,575 @@
 /*
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  *
  * File: upstart-dbus.c
  * Copyright (C) 2010 Senko Rasic <senko.rasic@dobarkod.hr>
  * Copyright (c) 2010 Ante Karamatic <ivoks@init.hr>
  *
  *
  * Each exported function is standalone, and creates a new connection to
  * the upstart daemon. This is because lrmd plugins fork off for exec,
  * and if we try and share the connection, the whole thing blocks
  * indefinitely.
  */
 
 #include <crm_internal.h>
 
 #include <stdio.h>
 
 #include <crm/crm.h>
 #include <crm/services.h>
 #include <crm/common/mainloop.h>
 
 #include <services_private.h>
 #include <upstart.h>
 #include <dbus/dbus.h>
 #include <pcmk-dbus.h>
 
 #include <glib.h>
 #include <gio/gio.h>
 
 #define BUS_NAME "com.ubuntu.Upstart"
 #define BUS_PATH "/com/ubuntu/Upstart"
 
 #define UPSTART_06_API     BUS_NAME"0_6"
 #define UPSTART_JOB_IFACE  UPSTART_06_API".Job"
 #define BUS_PROPERTY_IFACE "org.freedesktop.DBus.Properties"
 
 /*
   http://upstart.ubuntu.com/wiki/DBusInterface
 */
 static DBusConnection *upstart_proxy = NULL;
 
 static gboolean
 upstart_init(void)
 {
     static int need_init = 1;
 
     if (need_init) {
         need_init = 0;
         upstart_proxy = pcmk_dbus_connect();
     }
     if (upstart_proxy == NULL) {
         return FALSE;
     }
     return TRUE;
 }
 
 void
 upstart_cleanup(void)
 {
     if (upstart_proxy) {
         pcmk_dbus_disconnect(upstart_proxy);
         upstart_proxy = NULL;
     }
 }
 
 static gboolean
 upstart_job_by_name(const gchar * arg_name, gchar ** out_unit, int timeout)
 {
 /*
   com.ubuntu.Upstart0_6.GetJobByName (in String name, out ObjectPath job)
 */
     DBusError error;
     DBusMessage *msg;
     DBusMessage *reply = NULL;
     const char *method = "GetJobByName";
 
     if(upstart_init() == FALSE) {
         return FALSE;
     }
     msg = dbus_message_new_method_call(BUS_NAME, // target for the method call
                                        BUS_PATH, // object to call on
                                        UPSTART_06_API, // interface to call on
                                        method); // method name
 
     dbus_error_init(&error);
     CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_STRING, &arg_name, DBUS_TYPE_INVALID));
     reply = pcmk_dbus_send_recv(msg, upstart_proxy, &error, timeout);
     dbus_message_unref(msg);
 
     if(error.name) {
         /* ignore "already started" or "not running" errors */
         crm_err("Could not issue %s for %s: %s", method, arg_name, error.name);
 
     } else if(!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
         crm_err("Invalid return type for %s", method);
 
     } else {
         if(out_unit) {
             char *path = NULL;
 
             dbus_message_get_args (reply, NULL,
                                    DBUS_TYPE_OBJECT_PATH, &path,
                                    DBUS_TYPE_INVALID);
 
             *out_unit = strdup(path);
         }
         dbus_message_unref(reply);
         return TRUE;
     }
 
     if(reply) {
         dbus_message_unref(reply);
     }
     return FALSE;
 }
 
 static void
 fix(char *input, const char *search, char replace)
 {
     char *match = NULL;
     int shuffle = strlen(search) - 1;
 
     while (TRUE) {
         int len, lpc;
 
         match = strstr(input, search);
         if (match == NULL) {
             break;
         }
         crm_trace("Found: %s", match);
         match[0] = replace;
         len = strlen(match) - shuffle;
         for (lpc = 1; lpc <= len; lpc++) {
             match[lpc] = match[lpc + shuffle];
         }
     }
 }
 
 static char *
 fix_upstart_name(const char *input)
 {
     char *output = strdup(input);
 
     fix(output, "_2b", '+');
     fix(output, "_2c", ',');
     fix(output, "_2d", '-');
     fix(output, "_2e", '.');
     fix(output, "_40", '@');
     fix(output, "_5f", '_');
     return output;
 }
 
 GList *
 upstart_job_listall(void)
 {
     GList *units = NULL;
     DBusMessageIter args;
     DBusMessageIter unit;
     DBusMessage *msg = NULL;
     DBusMessage *reply = NULL;
     const char *method = "GetAllJobs";
     DBusError error;
     int lpc = 0;
 
     if (upstart_init() == FALSE) {
         return NULL;
     }
 
 /*
   com.ubuntu.Upstart0_6.GetAllJobs (out <Array of ObjectPath> jobs)
 */
 
     dbus_error_init(&error);
     msg = dbus_message_new_method_call(BUS_NAME, // target for the method call
                                        BUS_PATH, // object to call on
                                        UPSTART_06_API, // interface to call on
                                        method); // method name
     CRM_ASSERT(msg != NULL);
 
     reply = pcmk_dbus_send_recv(msg, upstart_proxy, &error, DBUS_TIMEOUT_USE_DEFAULT);
     dbus_message_unref(msg);
 
     if(error.name) {
         crm_err("Call to %s failed: %s", method, error.name);
         return NULL;
 
     } else if (!dbus_message_iter_init(reply, &args)) {
         crm_err("Call to %s failed: Message has no arguments", method);
         dbus_message_unref(reply);
         return NULL;
     }
 
     if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __FUNCTION__, __LINE__)) {
         crm_err("Call to %s failed: Message has invalid arguments", method);
         dbus_message_unref(reply);
         return NULL;
     }
 
     dbus_message_iter_recurse(&args, &unit);
     while (dbus_message_iter_get_arg_type (&unit) != DBUS_TYPE_INVALID) {
         DBusBasicValue value;
         const char *job = NULL;
         char *path = NULL;
 
         if(!pcmk_dbus_type_check(reply, &unit, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
             continue;
         }
 
         dbus_message_iter_get_basic(&unit, &value);
 
         if(value.str) {
             int llpc = 0;
             path = value.str;
             job = value.str;
             while (path[llpc] != 0) {
                 if (path[llpc] == '/') {
                     job = path + llpc + 1;
                 }
                 llpc++;
             }
             lpc++;
             crm_trace("%s -> %s\n", path, job);
             units = g_list_append(units, fix_upstart_name(job));
         }
         dbus_message_iter_next (&unit);
     }
 
     dbus_message_unref(reply);
     crm_trace("Found %d upstart jobs", lpc);
     return units;
 }
 
 gboolean
 upstart_job_exists(const char *name)
 {
     return upstart_job_by_name(name, NULL, DBUS_TIMEOUT_USE_DEFAULT);
 }
 
 static char *
 get_first_instance(const gchar * job, int timeout)
 {
     char *instance = NULL;
     const char *method = "GetAllInstances";
     DBusError error;
     DBusMessage *msg;
     DBusMessage *reply;
     DBusMessageIter args;
     DBusMessageIter unit;
 
     dbus_error_init(&error);
     msg = dbus_message_new_method_call(BUS_NAME, // target for the method call
                                        job, // object to call on
                                        UPSTART_JOB_IFACE, // interface to call on
                                        method); // method name
     CRM_ASSERT(msg != NULL);
 
     dbus_message_append_args(msg, DBUS_TYPE_INVALID);
     reply = pcmk_dbus_send_recv(msg, upstart_proxy, &error, timeout);
     dbus_message_unref(msg);
 
     if(error.name) {
         crm_err("Call to %s failed: %s", method, error.name);
         goto done;
 
     } else if(reply == NULL) {
         crm_err("Call to %s failed: no reply", method);
         goto done;
 
     } else if (!dbus_message_iter_init(reply, &args)) {
         crm_err("Call to %s failed: Message has no arguments", method);
         goto done;
     }
 
     if(!pcmk_dbus_type_check(reply, &args, DBUS_TYPE_ARRAY, __FUNCTION__, __LINE__)) {
         crm_err("Call to %s failed: Message has invalid arguments", method);
         goto done;
     }
 
     dbus_message_iter_recurse(&args, &unit);
     if(pcmk_dbus_type_check(reply, &unit, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
         DBusBasicValue value;
 
         dbus_message_iter_get_basic(&unit, &value);
 
         if(value.str) {
             instance = strdup(value.str);
             crm_trace("Result: %s", instance);
         }
     }
 
   done:
     if(reply) {
         dbus_message_unref(reply);
     }
     return instance;
 }
 
 static void
 upstart_job_check(const char *name, const char *state, void *userdata)
 {
     svc_action_t * op = userdata;
 
     if (state && g_strcmp0(state, "running") == 0) {
         op->rc = PCMK_OCF_OK;
     /* } else if (g_strcmp0(state, "activating") == 0) { */
     /*     op->rc = PCMK_OCF_PENDING; */
     } else {
         op->rc = PCMK_OCF_NOT_RUNNING;
     }
 
     if (op->synchronous == FALSE) {
         services_set_op_pending(op, NULL);
         operation_finalize(op);
     }
 }
 
 static char *
 upstart_job_metadata(const char *name)
 {
     return crm_strdup_printf("<?xml version=\"1.0\"?>\n"
                            "<!DOCTYPE resource-agent SYSTEM \"ra-api-1.dtd\">\n"
                            "<resource-agent name=\"%s\" version=\"0.1\">\n"
                            "  <version>1.0</version>\n"
                            "  <longdesc lang=\"en\">\n"
                            "    Upstart agent for controlling the system %s service\n"
                            "  </longdesc>\n"
                            "  <shortdesc lang=\"en\">%s upstart agent</shortdesc>\n"
                            "  <parameters>\n"
                            "  </parameters>\n"
                            "  <actions>\n"
                            "    <action name=\"start\"   timeout=\"15\" />\n"
                            "    <action name=\"stop\"    timeout=\"15\" />\n"
                            "    <action name=\"status\"  timeout=\"15\" />\n"
                            "    <action name=\"restart\"  timeout=\"15\" />\n"
                            "    <action name=\"monitor\" timeout=\"15\" interval=\"15\" start-delay=\"15\" />\n"
                            "    <action name=\"meta-data\"  timeout=\"5\" />\n"
                            "  </actions>\n"
                            "  <special tag=\"upstart\">\n"
                            "  </special>\n" "</resource-agent>\n", name, name, name);
 }
 
 static bool
 upstart_mask_error(svc_action_t *op, const char *error)
 {
     crm_trace("Could not issue %s for %s: %s", op->action, op->rsc, error);
     if(strstr(error, UPSTART_06_API ".Error.UnknownInstance")) {
         if(safe_str_eq(op->action, "stop")) {
             crm_trace("Masking %s failure for %s: unknown services are stopped", op->action, op->rsc);
             op->rc = PCMK_OCF_OK;
 
         } else if(safe_str_eq(op->action, "start")) {
             crm_trace("Mapping %s failure for %s: unknown services are not installed", op->action, op->rsc);
             op->rc = PCMK_OCF_NOT_INSTALLED;
             op->status = PCMK_LRM_OP_NOT_INSTALLED;
         }
         return TRUE;
 
     } else if (safe_str_eq(op->action, "start")
                && strstr(error, UPSTART_06_API ".Error.AlreadyStarted")) {
         crm_trace("Mapping %s failure for %s: starting a started resource is allowed", op->action, op->rsc);
         op->rc = PCMK_OCF_OK;
         return TRUE;
     }
 
     return FALSE;
 }
 
 static void
 upstart_async_dispatch(DBusPendingCall *pending, void *user_data)
 {
     DBusError error;
     DBusMessage *reply = NULL;
     svc_action_t *op = user_data;
 
     dbus_error_init(&error);
     if(pending) {
         reply = dbus_pending_call_steal_reply(pending);
     }
 
     if(pcmk_dbus_find_error(op->action, pending, reply, &error)) {
 
         /* ignore "already started" or "not running" errors */
         if (!upstart_mask_error(op, error.name)) {
             crm_err("%s for %s: %s", op->action, op->rsc, error.message);
         }
 
     } else if (!g_strcmp0(op->action, "stop")) {
         /* No return vaue */
         op->rc = PCMK_OCF_OK;
 
     } else {
         if(!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
             crm_warn("Call to %s passed but return type was unexpected", op->action);
             op->rc = PCMK_OCF_OK;
 
         } else {
             const char *path = NULL;
 
             dbus_message_get_args (reply, NULL,
                                    DBUS_TYPE_OBJECT_PATH, &path,
                                    DBUS_TYPE_INVALID);
             crm_info("Call to %s passed: %s", op->action, path);
             op->rc = PCMK_OCF_OK;
         }
     }
 
+    CRM_LOG_ASSERT(pending == op->opaque->pending);
     services_set_op_pending(op, NULL);
     operation_finalize(op);
 
     if(reply) {
         dbus_message_unref(reply);
     }
 }
 
 gboolean
 upstart_job_exec(svc_action_t * op, gboolean synchronous)
 {
     char *job = NULL;
     int arg_wait = TRUE;
     const char *arg_env = "pacemaker=1";
     const char *action = op->action;
 
     DBusError error;
     DBusMessage *msg = NULL;
     DBusMessage *reply = NULL;
     DBusMessageIter iter, array_iter;
 
     op->rc = PCMK_OCF_UNKNOWN_ERROR;
     CRM_ASSERT(upstart_init());
 
     if (safe_str_eq(op->action, "meta-data")) {
         op->stdout_data = upstart_job_metadata(op->agent);
         op->rc = PCMK_OCF_OK;
         goto cleanup;
     }
 
     if(!upstart_job_by_name(op->agent, &job, op->timeout)) {
         crm_debug("Could not obtain job named '%s' to %s", op->agent, action);
         if (!g_strcmp0(action, "stop")) {
             op->rc = PCMK_OCF_OK;
 
         } else {
             op->rc = PCMK_OCF_NOT_INSTALLED;
             op->status = PCMK_LRM_OP_NOT_INSTALLED;
         }
         goto cleanup;
     }
 
     if (safe_str_eq(op->action, "monitor") || safe_str_eq(action, "status")) {
 
         char *path = get_first_instance(job, op->timeout);
 
         op->rc = PCMK_OCF_NOT_RUNNING;
         if(path) {
             DBusPendingCall *pending = NULL;
             char *state = pcmk_dbus_get_property(
                 upstart_proxy, BUS_NAME, path, UPSTART_06_API ".Instance", "state",
                 op->synchronous?NULL:upstart_job_check, op,
                 op->synchronous?NULL:&pending, op->timeout);
 
             free(job);
             free(path);
 
             if(op->synchronous) {
                 upstart_job_check("state", state, op);
                 free(state);
                 return op->rc == PCMK_OCF_OK;
             } else if (pending) {
+                dbus_pending_call_ref(pending);
                 services_set_op_pending(op, pending);
                 return TRUE;
             }
             return FALSE;
         }
         goto cleanup;
 
     } else if (!g_strcmp0(action, "start")) {
         action = "Start";
     } else if (!g_strcmp0(action, "stop")) {
         action = "Stop";
     } else if (!g_strcmp0(action, "restart")) {
         action = "Restart";
     } else {
         op->rc = PCMK_OCF_UNIMPLEMENT_FEATURE;
         goto cleanup;
     }
 
     crm_debug("Calling %s for %s on %s", action, op->rsc, job);
 
     msg = dbus_message_new_method_call(BUS_NAME, // target for the method call
                                        job, // object to call on
                                        UPSTART_JOB_IFACE, // interface to call on
                                        action); // method name
     CRM_ASSERT(msg != NULL);
 
     dbus_message_iter_init_append (msg, &iter);
 
     CRM_LOG_ASSERT(dbus_message_iter_open_container (&iter,
                                                      DBUS_TYPE_ARRAY,
                                                      DBUS_TYPE_STRING_AS_STRING,
                                                      &array_iter));
 
     CRM_LOG_ASSERT(dbus_message_iter_append_basic (&array_iter, DBUS_TYPE_STRING, &arg_env));
     CRM_LOG_ASSERT(dbus_message_iter_close_container (&iter, &array_iter));
 
     CRM_LOG_ASSERT(dbus_message_append_args(msg, DBUS_TYPE_BOOLEAN, &arg_wait, DBUS_TYPE_INVALID));
 
     if (op->synchronous == FALSE) {
         DBusPendingCall* pending = pcmk_dbus_send(msg, upstart_proxy, upstart_async_dispatch, op, op->timeout);
         free(job);
 
         if(pending) {
             services_set_op_pending(op, pending);
             return TRUE;
         }
         return FALSE;
     }
 
     dbus_error_init(&error);
     reply = pcmk_dbus_send_recv(msg, upstart_proxy, &error, op->timeout);
 
     if(error.name) {
         if(!upstart_mask_error(op, error.name)) {
             crm_err("Could not issue %s for %s: %s (%s)", action, op->rsc, error.name, job);
         }
 
     } else if (!g_strcmp0(op->action, "stop")) {
         /* No return vaue */
         op->rc = PCMK_OCF_OK;
 
     } else if(!pcmk_dbus_type_check(reply, NULL, DBUS_TYPE_OBJECT_PATH, __FUNCTION__, __LINE__)) {
         crm_warn("Call to %s passed but return type was unexpected", op->action);
         op->rc = PCMK_OCF_OK;
 
     } else {
         const char *path = NULL;
 
         dbus_message_get_args (reply, NULL,
                                DBUS_TYPE_OBJECT_PATH, &path,
                                DBUS_TYPE_INVALID);
         crm_info("Call to %s passed: %s", op->action, path);
         op->rc = PCMK_OCF_OK;
     }
 
 
   cleanup:
     free(job);
     if(msg) {
         dbus_message_unref(msg);
     }
 
     if(reply) {
         dbus_message_unref(reply);
     }
 
     if (op->synchronous == FALSE) {
         operation_finalize(op);
         return TRUE;
     }
     return op->rc == PCMK_OCF_OK;
 }