diff --git a/crmd/remote_lrmd_ra.c b/crmd/remote_lrmd_ra.c
index 47a72c515d..4f940135f7 100644
--- a/crmd/remote_lrmd_ra.c
+++ b/crmd/remote_lrmd_ra.c
@@ -1,970 +1,1020 @@
 /* 
  * Copyright (C) 2013 David Vossel <davidvossel@gmail.com>
  * 
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License as published by the Free Software Foundation; either
  * version 2 of the License, or (at your option) any later version.
  * 
  * This software is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  * 
  * You should have received a copy of the GNU General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 
 #include <crm_internal.h>
 #include <crm/crm.h>
 #include <crm/msg_xml.h>
 
 #include <crmd.h>
 #include <crmd_fsa.h>
 #include <crmd_messages.h>
 #include <crmd_callbacks.h>
 #include <crmd_lrm.h>
 #include <crm/lrmd.h>
 #include <crm/services.h>
 
 #define REMOTE_LRMD_RA "remote"
 
 /* The max start timeout before cmd retry */
 #define MAX_START_TIMEOUT_MS 10000
 
 typedef struct remote_ra_cmd_s {
     /*! the local node the cmd is issued from */
     char *owner;
     /*! the remote node the cmd is executed on */
     char *rsc_id;
     /*! the action to execute */
     char *action;
     /*! some string the client wants us to give it back */
     char *userdata;
     /*! start delay in ms */
     int start_delay;
     /*! timer id used for start delay. */
     int delay_id;
     /*! timeout in ms for cmd */
     int timeout;
     int remaining_timeout;
     /*! recurring interval in ms */
     int interval;
     /*! interval timer id */
     int interval_id;
     int reported_success;
     int monitor_timeout_id;
     int takeover_timeout_id;
     /*! action parameters */
     lrmd_key_value_t *params;
     /*! executed rc */
     int rc;
     int op_status;
     int call_id;
     time_t start_time;
     gboolean cancel;
 } remote_ra_cmd_t;
 
 enum remote_migration_status {
     expect_takeover = 1,
     takeover_complete,
 };
 
 typedef struct remote_ra_data_s {
     crm_trigger_t *work;
     remote_ra_cmd_t *cur_cmd;
     GList *cmds;
     GList *recurring_cmds;
 
     enum remote_migration_status migrate_status;
 
     gboolean active;
 } remote_ra_data_t;
 
 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
 static GList *fail_all_monitor_cmds(GList * list);
 
 static void
 free_cmd(gpointer user_data)
 {
     remote_ra_cmd_t *cmd = user_data;
 
     if (!cmd) {
         return;
     }
     if (cmd->delay_id) {
         g_source_remove(cmd->delay_id);
     }
     if (cmd->interval_id) {
         g_source_remove(cmd->interval_id);
     }
     if (cmd->monitor_timeout_id) {
         g_source_remove(cmd->monitor_timeout_id);
     }
     if (cmd->takeover_timeout_id) {
         g_source_remove(cmd->takeover_timeout_id);
     }
     free(cmd->owner);
     free(cmd->rsc_id);
     free(cmd->action);
     free(cmd->userdata);
     lrmd_key_value_freeall(cmd->params);
     free(cmd);
 }
 
 static int
 generate_callid(void)
 {
     static int remote_ra_callid = 0;
 
     remote_ra_callid++;
     if (remote_ra_callid <= 0) {
         remote_ra_callid = 1;
     }
 
     return remote_ra_callid;
 }
 
 static gboolean
 recurring_helper(gpointer data)
 {
     remote_ra_cmd_t *cmd = data;
     lrm_state_t *connection_rsc = NULL;
 
     cmd->interval_id = 0;
     connection_rsc = lrm_state_find(cmd->rsc_id);
     if (connection_rsc && connection_rsc->remote_ra_data) {
         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 
         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 
         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
         mainloop_set_trigger(ra_data->work);
     }
     return FALSE;
 }
 
 static gboolean
 start_delay_helper(gpointer data)
 {
     remote_ra_cmd_t *cmd = data;
     lrm_state_t *connection_rsc = NULL;
 
     cmd->delay_id = 0;
     connection_rsc = lrm_state_find(cmd->rsc_id);
     if (connection_rsc && connection_rsc->remote_ra_data) {
         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 
         mainloop_set_trigger(ra_data->work);
     }
     return FALSE;
 }
 
 /*!
  * \internal
  * \brief Handle cluster communication related to pacemaker_remote node joining
  *
  * \param[in] node_name  Name of newly integrated pacemaker_remote node
  */
 static void
 remote_node_up(const char *node_name)
 {
     int call_opt, call_id = 0;
     xmlNode *update, *state;
     crm_node_t *node;
 
     CRM_CHECK(node_name != NULL, return);
     crm_info("Announcing pacemaker_remote node %s", node_name);
 
     /* Clear node's operation history and transient attributes */
     call_opt = crmd_cib_smart_opt();
     erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
     erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
 
     /* Clear node's probed attribute */
     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
 
     /* Ensure node is in the remote peer cache with member status */
     node = crm_remote_peer_get(node_name);
     CRM_CHECK(node != NULL, return);
     crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);
 
     /* pacemaker_remote nodes don't participate in the membership layer,
      * so cluster nodes don't automatically get notified when they come and go.
      * We send a cluster message to the DC, and update the CIB node state entry,
      * so the DC will get it sooner (via message) or later (via CIB refresh),
      * and any other interested parties can query the CIB.
      */
     send_remote_state_message(node_name, TRUE);
 
     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
     state = do_update_node_cib(node, node_update_cluster, update, __FUNCTION__);
 
     /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
      * needs to be fenced, this flag will allow various actions to determine
      * whether the fencing has happened yet.
      */
     crm_xml_add(state, XML_NODE_IS_FENCED, "0");
 
     /* TODO: If the remote connection drops, and this (async) CIB update either
      * failed or has not yet completed, later actions could mistakenly think the
      * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
      * previously set, because it won't have been cleared). This could prevent
      * actual fencing or allow recurring monitor failures to be cleared too
      * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
      */
     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
     if (call_id < 0) {
         crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
     }
     free_xml(update);
 }
 
+/*!
+ * \internal
+ * \brief Handle cluster communication related to pacemaker_remote node leaving
+ *
+ * \param[in] node_name  Name of lost node
+ */
+static void
+remote_node_down(const char *node_name)
+{
+    xmlNode *update;
+    int call_id = 0;
+    int call_opt = crmd_cib_smart_opt();
+    crm_node_t *node;
+
+    /* Clear all node attributes */
+    update_attrd_remote_node_removed(node_name, NULL);
+
+    /* Ensure node is in the remote peer cache with lost state */
+    node = crm_remote_peer_get(node_name);
+    CRM_CHECK(node != NULL, return);
+    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);
+
+    /* Notify DC */
+    send_remote_state_message(node_name, FALSE);
+
+    /* Update CIB node state */
+    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
+    do_update_node_cib(node, node_update_cluster, update, __FUNCTION__);
+    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
+    if (call_id < 0) {
+        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
+    }
+    free_xml(update);
+}
+
 /*!
  * \internal
  * \brief Handle effects of a remote RA command on node state
  *
  * \param[in] cmd  Completed remote RA command
  */
 static void
 check_remote_node_state(remote_ra_cmd_t *cmd)
 {
     /* Only successful actions can change node state */
     if (cmd->rc != PCMK_OCF_OK) {
         return;
     }
 
     if (safe_str_eq(cmd->action, "start")) {
         remote_node_up(cmd->rsc_id);
+
+    } else if (safe_str_eq(cmd->action, "stop")) {
+        lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
+        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
+
+        if (ra_data) {
+            if (ra_data->migrate_status != takeover_complete) {
+                /* Stop means down if we didn't successfully migrate elsewhere */
+                remote_node_down(cmd->rsc_id);
+            } else if (AM_I_DC == FALSE) {
+                /* Only the connection host and DC track node state,
+                 * so if the connection migrated elsewhere and we aren't DC,
+                 * un-cache the node, so we don't have stale info
+                 */
+                crm_remote_peer_cache_remove(cmd->rsc_id);
+            }
+        }
     }
 }
 
 static void
 report_remote_ra_result(remote_ra_cmd_t * cmd)
 {
     lrmd_event_data_t op = { 0, };
 
     check_remote_node_state(cmd);
 
     op.type = lrmd_event_exec_complete;
     op.rsc_id = cmd->rsc_id;
     op.op_type = cmd->action;
     op.user_data = cmd->userdata;
     op.timeout = cmd->timeout;
     op.interval = cmd->interval;
     op.rc = cmd->rc;
     op.op_status = cmd->op_status;
     op.t_run = cmd->start_time;
     op.t_rcchange = cmd->start_time;
     if (cmd->reported_success && cmd->rc != PCMK_OCF_OK) {
         op.t_rcchange = time(NULL);
         /* This edge case will likely never ever occur, but if it does the
          * result is that a failure will not be processed correctly. This is only
          * remotely possible because we are able to detect a connection resource's tcp
          * connection has failed at any moment after start has completed. The actual
          * recurring operation is just a connectivity ping.
          *
          * basically, we are not guaranteed that the first successful monitor op and
          * a subsequent failed monitor op will not occur in the same timestamp. We have to
          * make it look like the operations occurred at separate times though. */
         if (op.t_rcchange == op.t_run) {
             op.t_rcchange++;
         }
     }
 
     if (cmd->params) {
         lrmd_key_value_t *tmp;
 
         op.params = g_hash_table_new_full(crm_str_hash,
                                           g_str_equal, g_hash_destroy_str, g_hash_destroy_str);
         for (tmp = cmd->params; tmp; tmp = tmp->next) {
             g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
         }
 
     }
     op.call_id = cmd->call_id;
     op.remote_nodename = cmd->owner;
 
     lrm_op_callback(&op);
 
     if (op.params) {
         g_hash_table_destroy(op.params);
     }
 }
 
 static void
 update_remaining_timeout(remote_ra_cmd_t * cmd)
 {
     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 }
 
 static gboolean
 retry_start_cmd_cb(gpointer data)
 {
     lrm_state_t *lrm_state = data;
     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
     remote_ra_cmd_t *cmd = NULL;
     int rc = -1;
 
     if (!ra_data || !ra_data->cur_cmd) {
         return FALSE;
     }
     cmd = ra_data->cur_cmd;
     if (safe_str_neq(cmd->action, "start") && safe_str_neq(cmd->action, "migrate_from")) {
         return FALSE;
     }
     update_remaining_timeout(cmd);
 
     if (cmd->remaining_timeout > 0) {
         rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
     }
 
     if (rc != 0) {
         cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
         cmd->op_status = PCMK_LRM_OP_ERROR;
         report_remote_ra_result(cmd);
 
         if (ra_data->cmds) {
             mainloop_set_trigger(ra_data->work);
         }
         ra_data->cur_cmd = NULL;
         free_cmd(cmd);
     } else {
         /* wait for connection event */
     }
 
     return FALSE;
 }
 
 
 static gboolean
 connection_takeover_timeout_cb(gpointer data)
 {
     lrm_state_t *lrm_state = NULL;
     remote_ra_cmd_t *cmd = data;
 
     crm_info("takeover event timed out for node %s", cmd->rsc_id);
     cmd->takeover_timeout_id = 0;
 
     lrm_state = lrm_state_find(cmd->rsc_id);
 
     handle_remote_ra_stop(lrm_state, cmd);
     free_cmd(cmd);
 
     return FALSE;
 }
 
 static gboolean
 monitor_timeout_cb(gpointer data)
 {
     lrm_state_t *lrm_state = NULL;
     remote_ra_cmd_t *cmd = data;
 
     lrm_state = lrm_state_find(cmd->rsc_id);
 
     crm_info("Poke async response timed out for node %s (%p)", cmd->rsc_id, lrm_state);
     cmd->monitor_timeout_id = 0;
     cmd->op_status = PCMK_LRM_OP_TIMEOUT;
     cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 
     if (lrm_state && lrm_state->remote_ra_data) {
         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 
         if (ra_data->cur_cmd == cmd) {
             ra_data->cur_cmd = NULL;
         }
         if (ra_data->cmds) {
             mainloop_set_trigger(ra_data->work);
         }
     }
 
     report_remote_ra_result(cmd);
     free_cmd(cmd);
 
     if(lrm_state) {
         lrm_state_disconnect(lrm_state);
     }
     return FALSE;
 }
 
 void
 remote_lrm_op_callback(lrmd_event_data_t * op)
 {
     gboolean cmd_handled = FALSE;
     lrm_state_t *lrm_state = NULL;
     remote_ra_data_t *ra_data = NULL;
     remote_ra_cmd_t *cmd = NULL;
 
     crm_debug("remote connection event - event_type:%s node:%s action:%s rc:%s op_status:%s",
               lrmd_event_type2str(op->type),
               op->remote_nodename,
               op->op_type ? op->op_type : "none",
               services_ocf_exitcode_str(op->rc), services_lrm_status_str(op->op_status));
 
     lrm_state = lrm_state_find(op->remote_nodename);
     if (!lrm_state || !lrm_state->remote_ra_data) {
         crm_debug("lrm_state info not found for remote lrmd connection event");
         return;
     }
     ra_data = lrm_state->remote_ra_data;
 
     /* Another client has connected to the remote daemon,
      * determine if this is expected. */
     if (op->type == lrmd_event_new_client) {
         /* great, we new this was coming */
         if (ra_data->migrate_status == expect_takeover) {
             ra_data->migrate_status = takeover_complete;
         } else {
             crm_err("Unexpected pacemaker_remote client takeover. Disconnecting");
             lrm_state_disconnect(lrm_state);
         }
         return;
     }
 
     /* filter all EXEC events up */
     if (op->type == lrmd_event_exec_complete) {
         if (ra_data->migrate_status == takeover_complete) {
             crm_debug("ignoring event, this connection is taken over by another node");
         } else {
             lrm_op_callback(op);
         }
         return;
     }
 
     if ((op->type == lrmd_event_disconnect) &&
         (ra_data->cur_cmd == NULL) &&
         (ra_data->active == TRUE)) {
 
         crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name);
         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
         return;
     }
 
     if (!ra_data->cur_cmd) {
         crm_debug("no event to match");
         return;
     }
 
     cmd = ra_data->cur_cmd;
 
     /* Start actions and migrate from actions complete after connection
      * comes back to us. */
     if (op->type == lrmd_event_connect && (safe_str_eq(cmd->action, "start") ||
                                            safe_str_eq(cmd->action, "migrate_from"))) {
 
         if (op->connection_rc < 0) {
             update_remaining_timeout(cmd);
             /* There isn't much of a reason to reschedule if the timeout is too small */
             if (cmd->remaining_timeout > 3000) {
                 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
                 g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
                 return;
             } else {
                 crm_trace("can't reschedule start, remaining timeout too small %d",
                           cmd->remaining_timeout);
             }
             cmd->op_status = PCMK_LRM_OP_TIMEOUT;
             cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 
         } else {
             lrm_state_reset_tables(lrm_state);
             cmd->rc = PCMK_OCF_OK;
             cmd->op_status = PCMK_LRM_OP_DONE;
             ra_data->active = TRUE;
         }
 
         crm_debug("remote lrmd connect event matched %s action. ", cmd->action);
         report_remote_ra_result(cmd);
         cmd_handled = TRUE;
 
     } else if (op->type == lrmd_event_poke && safe_str_eq(cmd->action, "monitor")) {
 
         if (cmd->monitor_timeout_id) {
             g_source_remove(cmd->monitor_timeout_id);
             cmd->monitor_timeout_id = 0;
         }
 
         /* Only report success the first time, after that only worry about failures.
          * For this function, if we get the poke pack, it is always a success. Pokes
          * only fail if the send fails, or the response times out. */
         if (!cmd->reported_success) {
             cmd->rc = PCMK_OCF_OK;
             cmd->op_status = PCMK_LRM_OP_DONE;
             report_remote_ra_result(cmd);
             cmd->reported_success = 1;
         }
 
         crm_debug("remote lrmd poke event matched %s action. ", cmd->action);
 
         /* success, keep rescheduling if interval is present. */
         if (cmd->interval && (cmd->cancel == FALSE)) {
             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
             cmd->interval_id = g_timeout_add(cmd->interval, recurring_helper, cmd);
             cmd = NULL;         /* prevent free */
         }
         cmd_handled = TRUE;
 
     } else if (op->type == lrmd_event_disconnect && safe_str_eq(cmd->action, "monitor")) {
         if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
             cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
             cmd->op_status = PCMK_LRM_OP_ERROR;
             report_remote_ra_result(cmd);
             crm_err("remote-node %s unexpectedly disconneced during monitor operation", lrm_state->node_name);
         }
         cmd_handled = TRUE;
 
     } else if (op->type == lrmd_event_new_client && safe_str_eq(cmd->action, "stop")) {
 
         handle_remote_ra_stop(lrm_state, cmd);
         cmd_handled = TRUE;
 
     } else {
         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
     }
 
     if (cmd_handled) {
         ra_data->cur_cmd = NULL;
         if (ra_data->cmds) {
             mainloop_set_trigger(ra_data->work);
         }
         free_cmd(cmd);
     }
 }
 
 static void
 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
 {
     remote_ra_data_t *ra_data = NULL;
 
     CRM_ASSERT(lrm_state);
     ra_data = lrm_state->remote_ra_data;
 
     if (ra_data->migrate_status != takeover_complete) {
-        /* only clear the status if this stop is not apart of a successful migration */
-        update_attrd_remote_node_removed(lrm_state->node_name, NULL);
         /* delete pending ops when ever the remote connection is intentionally stopped */
         g_hash_table_remove_all(lrm_state->pending_ops);
     } else {
         /* we no longer hold the history if this connection has been migrated */
         lrm_state_reset_tables(lrm_state);
     }
 
     ra_data->active = FALSE;
     lrm_state_disconnect(lrm_state);
     cmd->rc = PCMK_OCF_OK;
     cmd->op_status = PCMK_LRM_OP_DONE;
 
     if (ra_data->cmds) {
         g_list_free_full(ra_data->cmds, free_cmd);
     }
     if (ra_data->recurring_cmds) {
         g_list_free_full(ra_data->recurring_cmds, free_cmd);
     }
     ra_data->cmds = NULL;
     ra_data->recurring_cmds = NULL;
     ra_data->cur_cmd = NULL;
 
     report_remote_ra_result(cmd);
 }
 
 static int
 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
 {
     const char *server = NULL;
     lrmd_key_value_t *tmp = NULL;
     int port = 0;
     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 
     for (tmp = cmd->params; tmp; tmp = tmp->next) {
         if (safe_str_eq(tmp->key, "addr") || safe_str_eq(tmp->key, "server")) {
             server = tmp->value;
         }
         if (safe_str_eq(tmp->key, "port")) {
             port = atoi(tmp->value);
         }
     }
 
     return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used);
 }
 
 static gboolean
 handle_remote_ra_exec(gpointer user_data)
 {
     int rc = 0;
     lrm_state_t *lrm_state = user_data;
     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
     remote_ra_cmd_t *cmd;
     GList *first = NULL;
 
     if (ra_data->cur_cmd) {
         /* still waiting on previous cmd */
         return TRUE;
     }
 
     while (ra_data->cmds) {
         first = ra_data->cmds;
         cmd = first->data;
         if (cmd->delay_id) {
             /* still waiting for start delay timer to trip */
             return TRUE;
         }
 
         ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
         g_list_free_1(first);
 
         if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
             ra_data->migrate_status = 0;
             rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout);
             if (rc == 0) {
                 /* take care of this later when we get async connection result */
                 crm_debug("began remote lrmd connect, waiting for connect event.");
                 ra_data->cur_cmd = cmd;
                 return TRUE;
             } else {
                 crm_debug("connect failed, not expecting to match any connection event later");
                 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
                 cmd->op_status = PCMK_LRM_OP_ERROR;
             }
             report_remote_ra_result(cmd);
 
         } else if (!strcmp(cmd->action, "monitor")) {
 
             if (lrm_state_is_connected(lrm_state) == TRUE) {
                 rc = lrm_state_poke_connection(lrm_state);
                 if (rc < 0) {
                     cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
                     cmd->op_status = PCMK_LRM_OP_ERROR;
                 }
             } else {
                 rc = -1;
                 cmd->op_status = PCMK_LRM_OP_DONE;
                 cmd->rc = PCMK_OCF_NOT_RUNNING;
             }
 
             if (rc == 0) {
                 crm_debug("poked remote lrmd at node %s, waiting for async response.", cmd->rsc_id);
                 ra_data->cur_cmd = cmd;
                 cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
                 return TRUE;
             }
             report_remote_ra_result(cmd);
 
         } else if (!strcmp(cmd->action, "stop")) {
 
             if (ra_data->migrate_status == expect_takeover) {
                 /* briefly wait on stop for the takeover event to occur. If the
                  * takeover event does not occur during the wait period, that's fine.
                  * It just means that the remote-node's lrm_status section is going to get
                  * cleared which will require all the resources running in the remote-node
                  * to be explicitly re-detected via probe actions.  If the takeover does occur
                  * successfully, then we can leave the status section intact. */
                 cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
                 ra_data->cur_cmd = cmd;
                 return TRUE;
             }
 
             handle_remote_ra_stop(lrm_state, cmd);
 
         } else if (!strcmp(cmd->action, "migrate_to")) {
             ra_data->migrate_status = expect_takeover;
             cmd->rc = PCMK_OCF_OK;
             cmd->op_status = PCMK_LRM_OP_DONE;
             report_remote_ra_result(cmd);
         } else if (!strcmp(cmd->action, "reload")) {
             /* reloads are a no-op right now, add logic here when they become important */
             cmd->rc = PCMK_OCF_OK;
             cmd->op_status = PCMK_LRM_OP_DONE;
             report_remote_ra_result(cmd);
         }
 
         free_cmd(cmd);
     }
 
     return TRUE;
 }
 
 static void
 remote_ra_data_init(lrm_state_t * lrm_state)
 {
     remote_ra_data_t *ra_data = NULL;
 
     if (lrm_state->remote_ra_data) {
         return;
     }
 
     ra_data = calloc(1, sizeof(remote_ra_data_t));
     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
     lrm_state->remote_ra_data = ra_data;
 }
 
 void
 remote_ra_cleanup(lrm_state_t * lrm_state)
 {
     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 
     if (!ra_data) {
         return;
     }
 
     if (ra_data->cmds) {
         g_list_free_full(ra_data->cmds, free_cmd);
     }
 
     if (ra_data->recurring_cmds) {
         g_list_free_full(ra_data->recurring_cmds, free_cmd);
     }
     mainloop_destroy_trigger(ra_data->work);
     free(ra_data);
     lrm_state->remote_ra_data = NULL;
 }
 
 gboolean
 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
 {
     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
         return TRUE;
     }
     if (id && lrm_state_find(id) && safe_str_neq(id, fsa_our_uname)) {
         return TRUE;
     }
 
     return FALSE;
 }
 
 lrmd_rsc_info_t *
 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
 {
     lrmd_rsc_info_t *info = NULL;
 
     if ((lrm_state_find(rsc_id))) {
         info = calloc(1, sizeof(lrmd_rsc_info_t));
 
         info->id = strdup(rsc_id);
         info->type = strdup(REMOTE_LRMD_RA);
         info->class = strdup("ocf");
         info->provider = strdup("pacemaker");
     }
 
     return info;
 }
 
 static gboolean
 is_remote_ra_supported_action(const char *action)
 {
     if (!action) {
         return FALSE;
     } else if (strcmp(action, "start") &&
                strcmp(action, "stop") &&
                strcmp(action, "reload") &&
                strcmp(action, "migrate_to") &&
                strcmp(action, "migrate_from") && strcmp(action, "monitor")) {
         return FALSE;
     }
 
     return TRUE;
 }
 
 static GList *
 fail_all_monitor_cmds(GList * list)
 {
     GList *rm_list = NULL;
     remote_ra_cmd_t *cmd = NULL;
     GListPtr gIter = NULL;
 
     for (gIter = list; gIter != NULL; gIter = gIter->next) {
         cmd = gIter->data;
         if (cmd->interval > 0 && safe_str_eq(cmd->action, "monitor")) {
             rm_list = g_list_append(rm_list, cmd);
         }
     }
 
     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
         cmd = gIter->data;
 
         cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
         cmd->op_status = PCMK_LRM_OP_ERROR;
         crm_trace("Pre-emptively failing %s %s (interval=%d, %s)", cmd->action, cmd->rsc_id, cmd->interval, cmd->userdata);
         report_remote_ra_result(cmd);
 
         list = g_list_remove(list, cmd);
         free_cmd(cmd);
     }
 
     /* frees only the list data, not the cmds */
     g_list_free(rm_list);
     return list;
 }
 
 static GList *
 remove_cmd(GList * list, const char *action, int interval)
 {
     remote_ra_cmd_t *cmd = NULL;
     GListPtr gIter = NULL;
 
     for (gIter = list; gIter != NULL; gIter = gIter->next) {
         cmd = gIter->data;
         if (cmd->interval == interval && safe_str_eq(cmd->action, action)) {
             break;
         }
         cmd = NULL;
     }
     if (cmd) {
         list = g_list_remove(list, cmd);
         free_cmd(cmd);
     }
     return list;
 }
 
 int
 remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval)
 {
     lrm_state_t *connection_rsc = NULL;
     remote_ra_data_t *ra_data = NULL;
 
     connection_rsc = lrm_state_find(rsc_id);
     if (!connection_rsc || !connection_rsc->remote_ra_data) {
         return -EINVAL;
     }
 
     ra_data = connection_rsc->remote_ra_data;
     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval);
     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action, interval);
     if (ra_data->cur_cmd &&
         (ra_data->cur_cmd->interval == interval) &&
         (safe_str_eq(ra_data->cur_cmd->action, action))) {
 
         ra_data->cur_cmd->cancel = TRUE;
     }
 
     return 0;
 }
 
 static remote_ra_cmd_t *
 handle_dup_monitor(remote_ra_data_t *ra_data, int interval, const char *userdata)
 {
     GList *gIter = NULL;
     remote_ra_cmd_t *cmd = NULL;
 
     /* there are 3 places a potential duplicate monitor operation
      * could exist.
      * 1. recurring_cmds list. where the op is waiting for its next interval
      * 2. cmds list, where the op is queued to get executed immediately
      * 3. cur_cmd, which means the monitor op is in flight right now.
      */
     if (interval == 0) {
         return NULL;
     }
 
     if (ra_data->cur_cmd &&
         ra_data->cur_cmd->cancel == FALSE &&
         ra_data->cur_cmd->interval == interval &&
         safe_str_eq(ra_data->cur_cmd->action, "monitor")) {
 
         cmd = ra_data->cur_cmd;
         goto handle_dup;
     }
 
     for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
         cmd = gIter->data;
         if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) {
             goto handle_dup;
         }
     }
 
     for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
         cmd = gIter->data;
         if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) {
             goto handle_dup;
         }
     }
 
     return NULL;
 
 handle_dup:
 
     crm_trace("merging duplicate monitor cmd %s_monitor_%d", cmd->rsc_id, interval);
 
     /* update the userdata */
     if (userdata) {
        free(cmd->userdata);
        cmd->userdata = strdup(userdata);
     }
 
     /* if we've already reported success, generate a new call id */
     if (cmd->reported_success) {
         cmd->start_time = time(NULL);
         cmd->call_id = generate_callid();
         cmd->reported_success = 0;
     }
 
     /* if we have an interval_id set, that means we are in the process of
      * waiting for this cmd's next interval. instead of waiting, cancel
      * the timer and execute the action immediately */
     if (cmd->interval_id) {
         g_source_remove(cmd->interval_id);
         cmd->interval_id = 0;
         recurring_helper(cmd);
     }
 
     return cmd;  
 }
 
 int
 remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval,     /* ms */
                int timeout,     /* ms */
                int start_delay, /* ms */
                lrmd_key_value_t * params)
 {
     int rc = 0;
     lrm_state_t *connection_rsc = NULL;
     remote_ra_cmd_t *cmd = NULL;
     remote_ra_data_t *ra_data = NULL;
 
     if (is_remote_ra_supported_action(action) == FALSE) {
         rc = -EINVAL;
         goto exec_done;
     }
 
     connection_rsc = lrm_state_find(rsc_id);
     if (!connection_rsc) {
         rc = -EINVAL;
         goto exec_done;
     }
 
     remote_ra_data_init(connection_rsc);
     ra_data = connection_rsc->remote_ra_data;
 
     cmd = handle_dup_monitor(ra_data, interval, userdata);
     if (cmd) {
        return cmd->call_id;
     }
 
     cmd = calloc(1, sizeof(remote_ra_cmd_t));
     cmd->owner = strdup(lrm_state->node_name);
     cmd->rsc_id = strdup(rsc_id);
     cmd->action = strdup(action);
     cmd->userdata = strdup(userdata);
     cmd->interval = interval;
     cmd->timeout = timeout;
     cmd->start_delay = start_delay;
     cmd->params = params;
     cmd->start_time = time(NULL);
 
     cmd->call_id = generate_callid();
 
     if (cmd->start_delay) {
         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
     }
 
     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
     mainloop_set_trigger(ra_data->work);
 
     return cmd->call_id;
   exec_done:
 
     lrmd_key_value_freeall(params);
     return rc;
 }
diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c
index 806032e250..9d17bfb259 100644
--- a/lib/cluster/membership.c
+++ b/lib/cluster/membership.c
@@ -1,1100 +1,1102 @@
 /*
  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
  *
  * This library is free software; you can redistribute it and/or
  * modify it under the terms of the GNU Lesser General Public
  * License as published by the Free Software Foundation; either
  * version 2.1 of the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public
  * License along with this library; if not, write to the Free Software
  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  */
 #include <crm_internal.h>
 
 #ifndef _GNU_SOURCE
 #  define _GNU_SOURCE
 #endif
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <string.h>
 #include <glib.h>
 #include <crm/common/ipc.h>
 #include <crm/cluster/internal.h>
 #include <crm/msg_xml.h>
 #include <crm/stonith-ng.h>
 
 /* The peer cache remembers cluster nodes that have been seen.
  * This is managed mostly automatically by libcluster, based on
  * cluster membership events.
  *
  * Because cluster nodes can have conflicting names or UUIDs,
  * the hash table key is a uniquely generated ID.
  */
 GHashTable *crm_peer_cache = NULL;
 
 /*
  * The remote peer cache tracks pacemaker_remote nodes. While the
  * value has the same type as the peer cache's, it is tracked separately for
  * three reasons: pacemaker_remote nodes can't have conflicting names or UUIDs,
  * so the name (which is also the UUID) is used as the hash table key; there
  * is no equivalent of membership events, so management is not automatic; and
  * most users of the peer cache need to exclude pacemaker_remote nodes.
  *
  * That said, using a single cache would be more logical and less error-prone,
  * so it would be a good idea to merge them one day.
  *
  * libcluster provides two avenues for populating the cache:
  * crm_remote_peer_get(), crm_remote_peer_cache_add() and
  * crm_remote_peer_cache_remove() directly manage it,
  * while crm_remote_peer_cache_refresh() populates it via the CIB.
  */
 GHashTable *crm_remote_peer_cache = NULL;
 
 unsigned long long crm_peer_seq = 0;
 gboolean crm_have_quorum = FALSE;
 static gboolean crm_autoreap  = TRUE;
 
 int
 crm_remote_peer_cache_size(void)
 {
     if (crm_remote_peer_cache == NULL) {
         return 0;
     }
     return g_hash_table_size(crm_remote_peer_cache);
 }
 
 /*!
  * \brief Get a remote node peer cache entry, creating it if necessary
  *
  * \param[in] node_name  Name of remote node
  *
  * \return Cache entry for node on success, NULL (and set errno) otherwise
  *
  * \note When creating a new entry, this will leave the node state undetermined,
  *       so the caller should also call crm_update_peer_state() if the state is
  *       known.
  */
 crm_node_t *
 crm_remote_peer_get(const char *node_name)
 {
     crm_node_t *node;
 
     if (node_name == NULL) {
         errno = -EINVAL;
         return NULL;
     }
 
     /* Return existing cache entry if one exists */
     node = g_hash_table_lookup(crm_remote_peer_cache, node_name);
     if (node) {
         return node;
     }
 
     /* Allocate a new entry */
     node = calloc(1, sizeof(crm_node_t));
     if (node == NULL) {
         return NULL;
     }
 
     /* Populate the essential information */
     node->flags = crm_remote_node;
     node->uuid = strdup(node_name);
     if (node->uuid == NULL) {
         free(node);
         errno = -ENOMEM;
         return NULL;
     }
 
     /* Add the new entry to the cache */
     g_hash_table_replace(crm_remote_peer_cache, node->uuid, node);
     crm_trace("added %s to remote cache", node_name);
 
     /* Update the entry's uname, ensuring peer status callbacks are called */
     crm_update_peer_uname(node, node_name);
     return node;
 }
 
 /*!
  * \brief Add a node to the remote peer cache
  *
  * \param[in] node_name  Name of remote node
  *
  * \note This is a legacy convenience wrapper for crm_remote_peer_get()
  *       for callers that don't need the cache entry returned.
  */
 void
 crm_remote_peer_cache_add(const char *node_name)
 {
     CRM_ASSERT(crm_remote_peer_get(node_name) != NULL);
 }
 
 void
 crm_remote_peer_cache_remove(const char *node_name)
 {
-    g_hash_table_remove(crm_remote_peer_cache, node_name);
+    if (g_hash_table_remove(crm_remote_peer_cache, node_name)) {
+        crm_trace("removed %s from remote peer cache", node_name);
+    }
 }
 
 /*!
  * \internal
  * \brief Return node status based on a CIB status entry
  *
  * \param[in] node_state  XML of node state
  *
  * \return CRM_NODE_LOST if XML_NODE_IN_CLUSTER is false in node_state,
  *         CRM_NODE_MEMBER otherwise
  * \note Unlike most boolean XML attributes, this one defaults to true, for
  *       backward compatibility with older crmd versions that don't set it.
  */
 static const char *
 remote_state_from_cib(xmlNode *node_state)
 {
     const char *status;
 
     status = crm_element_value(node_state, XML_NODE_IN_CLUSTER);
     if (status && !crm_is_true(status)) {
         status = CRM_NODE_LOST;
     } else {
         status = CRM_NODE_MEMBER;
     }
     return status;
 }
 
 /* user data for looping through remote node xpath searches */
 struct refresh_data {
     const char *field;  /* XML attribute to check for node name */
     gboolean has_state; /* whether to update node state based on XML */
 };
 
 /*!
  * \internal
  * \brief Process one pacemaker_remote node xpath search result
  *
  * \param[in] result     XML search result
  * \param[in] user_data  what to look for in the XML
  */
 static void
 remote_cache_refresh_helper(xmlNode *result, void *user_data)
 {
     struct refresh_data *data = user_data;
     const char *remote = crm_element_value(result, data->field);
     const char *state = NULL;
     crm_node_t *node;
 
     CRM_CHECK(remote != NULL, return);
 
     /* Determine node's state, if the result has it */
     if (data->has_state) {
         state = remote_state_from_cib(result);
     }
 
     /* Check whether cache already has entry for node */
     node = g_hash_table_lookup(crm_remote_peer_cache, remote);
 
     if (node == NULL) {
         /* Node is not in cache, so add a new entry for it */
         node = crm_remote_peer_get(remote);
         CRM_ASSERT(node);
         if (state) {
             crm_update_peer_state(__FUNCTION__, node, state, 0);
         }
 
     } else if (is_set(node->flags, crm_node_dirty)) {
         /* Node is in cache and hasn't been updated already, so mark it clean */
         clear_bit(node->flags, crm_node_dirty);
         if (state) {
             crm_update_peer_state(__FUNCTION__, node, state, 0);
         }
     }
 }
 
 static void
 mark_dirty(gpointer key, gpointer value, gpointer user_data)
 {
     set_bit(((crm_node_t*)value)->flags, crm_node_dirty);
 }
 
 static gboolean
 is_dirty(gpointer key, gpointer value, gpointer user_data)
 {
     return is_set(((crm_node_t*)value)->flags, crm_node_dirty);
 }
 
 /* search string to find CIB resources entries for guest nodes */
 #define XPATH_GUEST_NODE_CONFIG \
     "//" XML_TAG_CIB "//" XML_CIB_TAG_CONFIGURATION "//" XML_CIB_TAG_RESOURCE \
     "//" XML_TAG_META_SETS "//" XML_CIB_TAG_NVPAIR \
     "[@name='" XML_RSC_ATTR_REMOTE_NODE "']"
 
 /* search string to find CIB resources entries for remote nodes */
 #define XPATH_REMOTE_NODE_CONFIG \
     "//" XML_TAG_CIB "//" XML_CIB_TAG_CONFIGURATION "//" XML_CIB_TAG_RESOURCE \
     "[@type='remote'][@provider='pacemaker']"
 
 /* search string to find CIB node status entries for pacemaker_remote nodes */
 #define XPATH_REMOTE_NODE_STATUS \
     "//" XML_TAG_CIB "//" XML_CIB_TAG_STATUS "//" XML_CIB_TAG_STATE \
     "[@" XML_NODE_IS_REMOTE "='true']"
 
 /*!
  * \brief Repopulate the remote peer cache based on CIB XML
  *
  * \param[in] xmlNode  CIB XML to parse
  */
 void
 crm_remote_peer_cache_refresh(xmlNode *cib)
 {
     struct refresh_data data;
 
     /* First, we mark all existing cache entries as dirty,
      * so that later we can remove any that weren't in the CIB.
      * We don't empty the cache, because we need to detect changes in state.
      */
     g_hash_table_foreach(crm_remote_peer_cache, mark_dirty, NULL);
 
     /* Look for guest nodes and remote nodes in the status section */
     data.field = "id";
     data.has_state = TRUE;
     crm_foreach_xpath_result(cib, XPATH_REMOTE_NODE_STATUS,
                              remote_cache_refresh_helper, &data);
 
     /* Look for guest nodes and remote nodes in the configuration section,
      * because they may have just been added and not have a status entry yet.
      * In that case, the cached node state will be left NULL, so that the
      * peer status callback isn't called until we're sure the node started
      * successfully.
      */
     data.field = "value";
     data.has_state = FALSE;
     crm_foreach_xpath_result(cib, XPATH_GUEST_NODE_CONFIG,
                              remote_cache_refresh_helper, &data);
     data.field = "id";
     data.has_state = FALSE;
     crm_foreach_xpath_result(cib, XPATH_REMOTE_NODE_CONFIG,
                              remote_cache_refresh_helper, &data);
 
     /* Remove all old cache entries that weren't seen in the CIB */
     g_hash_table_foreach_remove(crm_remote_peer_cache, is_dirty, NULL);
 }
 
 gboolean
 crm_is_peer_active(const crm_node_t * node)
 {
     if(node == NULL) {
         return FALSE;
     }
 
     if (is_set(node->flags, crm_remote_node)) {
         /* remote nodes are never considered active members. This
          * guarantees they will never be considered for DC membership.*/
         return FALSE;
     }
 #if SUPPORT_COROSYNC
     if (is_openais_cluster()) {
         return crm_is_corosync_peer_active(node);
     }
 #endif
 #if SUPPORT_HEARTBEAT
     if (is_heartbeat_cluster()) {
         return crm_is_heartbeat_peer_active(node);
     }
 #endif
     crm_err("Unhandled cluster type: %s", name_for_cluster_type(get_cluster_type()));
     return FALSE;
 }
 
 static gboolean
 crm_reap_dead_member(gpointer key, gpointer value, gpointer user_data)
 {
     crm_node_t *node = value;
     crm_node_t *search = user_data;
 
     if (search == NULL) {
         return FALSE;
 
     } else if (search->id && node->id != search->id) {
         return FALSE;
 
     } else if (search->id == 0 && safe_str_neq(node->uname, search->uname)) {
         return FALSE;
 
     } else if (crm_is_peer_active(value) == FALSE) {
         crm_notice("Removing %s/%u from the membership list", node->uname, node->id);
         return TRUE;
     }
     return FALSE;
 }
 
 /*!
  * \brief Remove all peer cache entries matching a node ID and/or uname
  *
  * \param[in] id    ID of node to remove (or 0 to ignore)
  * \param[in] name  Uname of node to remove (or NULL to ignore)
  *
  * \return Number of cache entries removed
  */
 guint
 reap_crm_member(uint32_t id, const char *name)
 {
     int matches = 0;
     crm_node_t search;
 
     if (crm_peer_cache == NULL) {
         crm_trace("Nothing to do, cache not initialized");
         return 0;
     }
 
     search.id = id;
     search.uname = name ? strdup(name) : NULL;
     matches = g_hash_table_foreach_remove(crm_peer_cache, crm_reap_dead_member, &search);
     if(matches) {
         crm_notice("Purged %d peers with id=%u and/or uname=%s from the membership cache",
                    matches, search.id, search.uname);
 
     } else {
         crm_info("No peers with id=%u and/or uname=%s exist", id, name);
     }
 
     free(search.uname);
     return matches;
 }
 
 static void
 crm_count_peer(gpointer key, gpointer value, gpointer user_data)
 {
     guint *count = user_data;
     crm_node_t *node = value;
 
     if (crm_is_peer_active(node)) {
         *count = *count + 1;
     }
 }
 
 guint
 crm_active_peers(void)
 {
     guint count = 0;
 
     if (crm_peer_cache) {
         g_hash_table_foreach(crm_peer_cache, crm_count_peer, &count);
     }
     return count;
 }
 
 static void
 destroy_crm_node(gpointer data)
 {
     crm_node_t *node = data;
 
     crm_trace("Destroying entry for node %u: %s", node->id, node->uname);
 
     free(node->addr);
     free(node->uname);
     free(node->state);
     free(node->uuid);
     free(node->expected);
     free(node);
 }
 
 void
 crm_peer_init(void)
 {
     if (crm_peer_cache == NULL) {
         crm_peer_cache = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, free, destroy_crm_node);
     }
 
     if (crm_remote_peer_cache == NULL) {
         crm_remote_peer_cache = g_hash_table_new_full(crm_strcase_hash, crm_strcase_equal, NULL, destroy_crm_node);
     }
 }
 
 void
 crm_peer_destroy(void)
 {
     if (crm_peer_cache != NULL) {
         crm_trace("Destroying peer cache with %d members", g_hash_table_size(crm_peer_cache));
         g_hash_table_destroy(crm_peer_cache);
         crm_peer_cache = NULL;
     }
 
     if (crm_remote_peer_cache != NULL) {
         crm_trace("Destroying remote peer cache with %d members", g_hash_table_size(crm_remote_peer_cache));
         g_hash_table_destroy(crm_remote_peer_cache);
         crm_remote_peer_cache = NULL;
     }
 }
 
 void (*crm_status_callback) (enum crm_status_type, crm_node_t *, const void *) = NULL;
 
 /*!
  * \brief Set a client function that will be called after peer status changes
  *
  * \param[in] dispatch  Pointer to function to use as callback
  *
  * \note Previously, client callbacks were responsible for peer cache
  *       management. This is no longer the case, and client callbacks should do
  *       only client-specific handling. Callbacks MUST NOT add or remove entries
  *       in the peer caches.
  */
 void
 crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *))
 {
     crm_status_callback = dispatch;
 }
 
 /*!
  * \brief Tell the library whether to automatically reap lost nodes
  *
  * If TRUE (the default), calling crm_update_peer_proc() will also update the
  * peer state to CRM_NODE_MEMBER or CRM_NODE_LOST, and crm_update_peer_state()
  * will reap peers whose state changes to anything other than CRM_NODE_MEMBER.
  * Callers should leave this enabled unless they plan to manage the cache
  * separately on their own.
  *
  * \param[in] autoreap  TRUE to enable automatic reaping, FALSE to disable
  */
 void
 crm_set_autoreap(gboolean autoreap)
 {
     crm_autoreap = autoreap;
 }
 
 static void crm_dump_peer_hash(int level, const char *caller)
 {
     GHashTableIter iter;
     const char *id = NULL;
     crm_node_t *node = NULL;
 
     g_hash_table_iter_init(&iter, crm_peer_cache);
     while (g_hash_table_iter_next(&iter, (gpointer *) &id, (gpointer *) &node)) {
         do_crm_log(level, "%s: Node %u/%s = %p - %s", caller, node->id, node->uname, node, id);
     }
 }
 
 static gboolean crm_hash_find_by_data(gpointer key, gpointer value, gpointer user_data)
 {
     if(value == user_data) {
         return TRUE;
     }
     return FALSE;
 }
 
 crm_node_t *
 crm_find_peer_full(unsigned int id, const char *uname, int flags)
 {
     crm_node_t *node = NULL;
 
     CRM_ASSERT(id > 0 || uname != NULL);
 
     crm_peer_init();
 
     if (flags & CRM_GET_PEER_REMOTE) {
         node = g_hash_table_lookup(crm_remote_peer_cache, uname);
     }
 
     if (node == NULL && (flags & CRM_GET_PEER_CLUSTER)) {
         node = crm_find_peer(id, uname);
     }
     return node;
 }
 
 crm_node_t *
 crm_get_peer_full(unsigned int id, const char *uname, int flags)
 {
     crm_node_t *node = NULL;
 
     CRM_ASSERT(id > 0 || uname != NULL);
 
     crm_peer_init();
 
     if (flags & CRM_GET_PEER_REMOTE) {
         node = g_hash_table_lookup(crm_remote_peer_cache, uname);
     }
 
     if (node == NULL && (flags & CRM_GET_PEER_CLUSTER)) {
         node = crm_get_peer(id, uname);
     }
     return node;
 }
 
 crm_node_t *
 crm_find_peer(unsigned int id, const char *uname)
 {
     GHashTableIter iter;
     crm_node_t *node = NULL;
     crm_node_t *by_id = NULL;
     crm_node_t *by_name = NULL;
 
     CRM_ASSERT(id > 0 || uname != NULL);
 
     crm_peer_init();
 
     if (uname != NULL) {
         g_hash_table_iter_init(&iter, crm_peer_cache);
         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
             if(node->uname && strcasecmp(node->uname, uname) == 0) {
                 crm_trace("Name match: %s = %p", node->uname, node);
                 by_name = node;
                 break;
             }
         }
     }
 
     if (id > 0) {
         g_hash_table_iter_init(&iter, crm_peer_cache);
         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
             if(node->id == id) {
                 crm_trace("ID match: %u = %p", node->id, node);
                 by_id = node;
                 break;
             }
         }
     }
 
     node = by_id; /* Good default */
     if(by_id == by_name) {
         /* Nothing to do if they match (both NULL counts) */
         crm_trace("Consistent: %p for %u/%s", by_id, id, uname);
 
     } else if(by_id == NULL && by_name) {
         crm_trace("Only one: %p for %u/%s", by_name, id, uname);
 
         if(id && by_name->id) {
             crm_dump_peer_hash(LOG_WARNING, __FUNCTION__);
             crm_crit("Node %u and %u share the same name '%s'",
                      id, by_name->id, uname);
             node = NULL; /* Create a new one */
 
         } else {
             node = by_name;
         }
 
     } else if(by_name == NULL && by_id) {
         crm_trace("Only one: %p for %u/%s", by_id, id, uname);
 
         if(uname && by_id->uname) {
             crm_dump_peer_hash(LOG_WARNING, __FUNCTION__);
             crm_crit("Node '%s' and '%s' share the same cluster nodeid %u: assuming '%s' is correct",
                      uname, by_id->uname, id, uname);
         }
 
     } else if(uname && by_id->uname) {
         if(safe_str_eq(uname, by_id->uname)) {
             crm_notice("Node '%s' has changed its ID from %u to %u", by_id->uname, by_name->id, by_id->id);
             g_hash_table_foreach_remove(crm_peer_cache, crm_hash_find_by_data, by_name);
 
         } else {
             crm_warn("Node '%s' and '%s' share the same cluster nodeid: %u %s", by_id->uname, by_name->uname, id, uname);
             crm_dump_peer_hash(LOG_INFO, __FUNCTION__);
             crm_abort(__FILE__, __FUNCTION__, __LINE__, "member weirdness", TRUE, TRUE);
         }
 
     } else if(id && by_name->id) {
         crm_warn("Node %u and %u share the same name: '%s'", by_id->id, by_name->id, uname);
 
     } else {
         /* Simple merge */
 
         /* Only corosync based clusters use nodeid's
          *
          * The functions that call crm_update_peer_state() only know nodeid
          * so 'by_id' is authorative when merging
          *
          * Same for crm_update_peer_proc()
          */
         crm_dump_peer_hash(LOG_DEBUG, __FUNCTION__);
 
         crm_info("Merging %p into %p", by_name, by_id);
         g_hash_table_foreach_remove(crm_peer_cache, crm_hash_find_by_data, by_name);
     }
 
     return node;
 }
 
 #if SUPPORT_COROSYNC
 static guint
 crm_remove_conflicting_peer(crm_node_t *node)
 {
     int matches = 0;
     GHashTableIter iter;
     crm_node_t *existing_node = NULL;
 
     if (node->id == 0 || node->uname == NULL) {
         return 0;
     }
 
 #  if !SUPPORT_PLUGIN
     if (corosync_cmap_has_config("nodelist") != 0) {
         return 0;
     }
 #  endif
 
     g_hash_table_iter_init(&iter, crm_peer_cache);
     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &existing_node)) {
         if (existing_node->id > 0
             && existing_node->id != node->id
             && existing_node->uname != NULL
             && strcasecmp(existing_node->uname, node->uname) == 0) {
 
             if (crm_is_peer_active(existing_node)) {
                 continue;
             }
 
             crm_warn("Removing cached offline node %u/%s which has conflicting uname with %u",
                      existing_node->id, existing_node->uname, node->id);
 
             g_hash_table_iter_remove(&iter);
             matches++;
         }
     }
 
     return matches;
 }
 #endif
 
 /* coverity[-alloc] Memory is referenced in one or both hashtables */
 crm_node_t *
 crm_get_peer(unsigned int id, const char *uname)
 {
     crm_node_t *node = NULL;
     char *uname_lookup = NULL;
 
     CRM_ASSERT(id > 0 || uname != NULL);
 
     crm_peer_init();
 
     node = crm_find_peer(id, uname);
 
     /* if uname wasn't provided, and find_peer did not turn up a uname based on id.
      * we need to do a lookup of the node name using the id in the cluster membership. */
     if ((node == NULL || node->uname == NULL) && (uname == NULL)) { 
         uname_lookup = get_node_name(id);
     }
 
     if (uname_lookup) {
         uname = uname_lookup;
         crm_trace("Inferred a name of '%s' for node %u", uname, id);
 
         /* try to turn up the node one more time now that we know the uname. */
         if (node == NULL) {
             node = crm_find_peer(id, uname);
         }
     }
 
 
     if (node == NULL) {
         char *uniqueid = crm_generate_uuid();
 
         node = calloc(1, sizeof(crm_node_t));
         CRM_ASSERT(node);
 
         crm_info("Created entry %s/%p for node %s/%u (%d total)",
                  uniqueid, node, uname, id, 1 + g_hash_table_size(crm_peer_cache));
         g_hash_table_replace(crm_peer_cache, uniqueid, node);
     }
 
     if(id > 0 && uname && (node->id == 0 || node->uname == NULL)) {
         crm_info("Node %u is now known as %s", id, uname);
     }
 
     if(id > 0 && node->id == 0) {
         node->id = id;
     }
 
     if (uname && (node->uname == NULL)) {
         crm_update_peer_uname(node, uname);
     }
 
     if(node->uuid == NULL) {
         const char *uuid = crm_peer_uuid(node);
 
         if (uuid) {
             crm_info("Node %u has uuid %s", id, uuid);
 
         } else {
             crm_info("Cannot obtain a UUID for node %u/%s", id, node->uname);
         }
     }
 
     free(uname_lookup);
 
     return node;
 }
 
 /*!
  * \internal
  * \brief Update all of a node's information (process list, state, etc.)
  *
  * \param[in] source      Caller's function name (for log messages)
  *
  * \return NULL if node was reaped from peer caches, pointer to node otherwise
  *
  * \note This function should not be called within a peer cache iteration,
  *       otherwise reaping could invalidate the iterator.
  */
 crm_node_t *
 crm_update_peer(const char *source, unsigned int id, uint64_t born, uint64_t seen, int32_t votes,
                 uint32_t children, const char *uuid, const char *uname, const char *addr,
                 const char *state)
 {
 #if SUPPORT_PLUGIN
     gboolean addr_changed = FALSE;
     gboolean votes_changed = FALSE;
 #endif
     crm_node_t *node = NULL;
 
     id = get_corosync_id(id, uuid);
     node = crm_get_peer(id, uname);
 
     CRM_ASSERT(node != NULL);
 
     if (node->uuid == NULL) {
         if (is_openais_cluster()) {
             /* Yes, overrule whatever was passed in */
             crm_peer_uuid(node);
 
         } else if (uuid != NULL) {
             node->uuid = strdup(uuid);
         }
     }
 
     if (children > 0) {
         if (crm_update_peer_proc(source, node, children, state) == NULL) {
             return NULL;
         }
     }
 
     if (state != NULL) {
         if (crm_update_peer_state(source, node, state, seen) == NULL) {
             return NULL;
         }
     }
 #if SUPPORT_HEARTBEAT
     if (born != 0) {
         node->born = born;
     }
 #endif
 
 #if SUPPORT_PLUGIN
     /* These were only used by the plugin */
     if (born != 0) {
         node->born = born;
     }
 
     if (votes > 0 && node->votes != votes) {
         votes_changed = TRUE;
         node->votes = votes;
     }
 
     if (addr != NULL) {
         if (node->addr == NULL || crm_str_eq(node->addr, addr, FALSE) == FALSE) {
             addr_changed = TRUE;
             free(node->addr);
             node->addr = strdup(addr);
         }
     }
     if (addr_changed || votes_changed) {
         crm_info("%s: Node %s: id=%u state=%s addr=%s%s votes=%d%s born=" U64T " seen=" U64T
                  " proc=%.32x", source, node->uname, node->id, node->state,
                  node->addr, addr_changed ? " (new)" : "", node->votes,
                  votes_changed ? " (new)" : "", node->born, node->last_seen, node->processes);
     }
 #endif
 
     return node;
 }
 
 /*!
  * \internal
  * \brief Update a node's uname
  *
  * \param[in] node        Node object to update
  * \param[in] uname       New name to set
  *
  * \note This function should not be called within a peer cache iteration,
  *       because in some cases it can remove conflicting cache entries,
  *       which would invalidate the iterator.
  */
 void
 crm_update_peer_uname(crm_node_t *node, const char *uname)
 {
     int i, len = strlen(uname);
 
     for (i = 0; i < len; i++) {
         if (uname[i] >= 'A' && uname[i] <= 'Z') {
             crm_warn("Node names with capitals are discouraged, consider changing '%s'",
                      uname);
             break;
         }
     }
 
     free(node->uname);
     node->uname = strdup(uname);
     if (crm_status_callback) {
         crm_status_callback(crm_status_uname, node, NULL);
     }
 
 #if SUPPORT_COROSYNC
     if (is_openais_cluster() && !is_set(node->flags, crm_remote_node)) {
         crm_remove_conflicting_peer(node);
     }
 #endif
 }
 
 /*!
  * \internal
  * \brief Update a node's process information (and potentially state)
  *
  * \param[in] source      Caller's function name (for log messages)
  * \param[in] node        Node object to update
  * \param[in] flag        Bitmask of new process information
  * \param[in] status      node status (online, offline, etc.)
  *
  * \return NULL if any node was reaped from peer caches, value of node otherwise
  *
  * \note If this function returns NULL, the supplied node object was likely
  *       freed and should not be used again. This function should not be
  *       called within a cache iteration if reaping is possible, otherwise
  *       reaping could invalidate the iterator.
  */
 crm_node_t *
 crm_update_peer_proc(const char *source, crm_node_t * node, uint32_t flag, const char *status)
 {
     uint32_t last = 0;
     gboolean changed = FALSE;
 
     CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL",
                                     source, peer2text(flag), status); return NULL);
 
     /* Pacemaker doesn't spawn processes on remote nodes */
     if (is_set(node->flags, crm_remote_node)) {
         return node;
     }
 
     last = node->processes;
     if (status == NULL) {
         node->processes = flag;
         if (node->processes != last) {
             changed = TRUE;
         }
 
     } else if (safe_str_eq(status, ONLINESTATUS)) {
         if ((node->processes & flag) != flag) {
             set_bit(node->processes, flag);
             changed = TRUE;
         }
 #if SUPPORT_PLUGIN
     } else if (safe_str_eq(status, CRM_NODE_MEMBER)) {
         if (flag > 0 && node->processes != flag) {
             node->processes = flag;
             changed = TRUE;
         }
 #endif
 
     } else if (node->processes & flag) {
         clear_bit(node->processes, flag);
         changed = TRUE;
     }
 
     if (changed) {
         if (status == NULL && flag <= crm_proc_none) {
             crm_info("%s: Node %s[%u] - all processes are now offline", source, node->uname,
                      node->id);
         } else {
             crm_info("%s: Node %s[%u] - %s is now %s", source, node->uname, node->id,
                      peer2text(flag), status);
         }
 
         /* Call the client callback first, then update the peer state,
          * in case the node will be reaped
          */
         if (crm_status_callback) {
             crm_status_callback(crm_status_processes, node, &last);
         }
 
         /* The client callback shouldn't touch the peer caches,
          * but as a safety net, bail if the peer cache was destroyed.
          */
         if (crm_peer_cache == NULL) {
             return NULL;
         }
 
         if (crm_autoreap) {
             node = crm_update_peer_state(__FUNCTION__, node,
                                          is_set(node->processes, crm_get_cluster_proc())?
                                          CRM_NODE_MEMBER : CRM_NODE_LOST, 0);
         }
     } else {
         crm_trace("%s: Node %s[%u] - %s is unchanged (%s)", source, node->uname, node->id,
                   peer2text(flag), status);
     }
     return node;
 }
 
 void
 crm_update_peer_expected(const char *source, crm_node_t * node, const char *expected)
 {
     char *last = NULL;
     gboolean changed = FALSE;
 
     CRM_CHECK(node != NULL, crm_err("%s: Could not set 'expected' to %s", source, expected);
               return);
 
     /* Remote nodes don't participate in joins */
     if (is_set(node->flags, crm_remote_node)) {
         return;
     }
 
     last = node->expected;
     if (expected != NULL && safe_str_neq(node->expected, expected)) {
         node->expected = strdup(expected);
         changed = TRUE;
     }
 
     if (changed) {
         crm_info("%s: Node %s[%u] - expected state is now %s (was %s)", source, node->uname, node->id,
                  expected, last);
         free(last);
     } else {
         crm_trace("%s: Node %s[%u] - expected state is unchanged (%s)", source, node->uname,
                   node->id, expected);
     }
 }
 
 /*!
  * \internal
  * \brief Update a node's state and membership information
  *
  * \param[in] source      Caller's function name (for log messages)
  * \param[in] node        Node object to update
  * \param[in] state       Node's new state
  * \param[in] membership  Node's new membership ID
  * \param[in] iter        If not NULL, pointer to node's peer cache iterator
  *
  * \return NULL if any node was reaped, value of node otherwise
  *
  * \note If this function returns NULL, the supplied node object was likely
  *       freed and should not be used again. This function may be called from
  *       within a peer cache iteration if the iterator is supplied.
  */
 static crm_node_t *
 crm_update_peer_state_iter(const char *source, crm_node_t * node, const char *state, int membership, GHashTableIter *iter)
 {
     gboolean is_member;
 
     CRM_CHECK(node != NULL, crm_err("%s: Could not set 'state' to %s", source, state);
                             return NULL);
 
     is_member = safe_str_eq(state, CRM_NODE_MEMBER);
     if (membership && is_member) {
         node->last_seen = membership;
     }
 
     if (state && safe_str_neq(node->state, state)) {
         char *last = node->state;
         enum crm_status_type status_type = is_set(node->flags, crm_remote_node)?
                                            crm_status_rstate : crm_status_nstate;
 
         node->state = strdup(state);
         crm_notice("%s: Node %s[%u] - state is now %s (was %s)",
                    source, node->uname, node->id, state, last);
         if (crm_status_callback) {
             crm_status_callback(status_type, node, last);
         }
         free(last);
 
         if (crm_autoreap && !is_member && !is_set(node->flags, crm_remote_node)) {
             /* We only autoreap from the peer cache, not the remote peer cache,
              * because the latter should be managed only by
              * crm_remote_peer_cache_refresh().
              */
             if(iter) {
                 crm_notice("Purged 1 peer with id=%u and/or uname=%s from the membership cache", node->id, node->uname);
                 g_hash_table_iter_remove(iter);
 
             } else {
                 reap_crm_member(node->id, node->uname);
             }
             node = NULL;
         }
 
     } else {
         crm_trace("%s: Node %s[%u] - state is unchanged (%s)", source, node->uname, node->id,
                   state);
     }
     return node;
 }
 
 /*!
  * \brief Update a node's state and membership information
  *
  * \param[in] source      Caller's function name (for log messages)
  * \param[in] node        Node object to update
  * \param[in] state       Node's new state
  * \param[in] membership  Node's new membership ID
  *
  * \return NULL if any node was reaped, value of node otherwise
  *
  * \note If this function returns NULL, the supplied node object was likely
  *       freed and should not be used again. This function should not be
  *       called within a cache iteration if reaping is possible,
  *       otherwise reaping could invalidate the iterator.
  */
 crm_node_t *
 crm_update_peer_state(const char *source, crm_node_t * node, const char *state, int membership)
 {
     return crm_update_peer_state_iter(source, node, state, membership, NULL);
 }
 
 /*!
  * \internal
  * \brief Reap all nodes from cache whose membership information does not match
  *
  * \param[in] membership  Membership ID of nodes to keep
  */
 void
 crm_reap_unseen_nodes(uint64_t membership)
 {
     GHashTableIter iter;
     crm_node_t *node = NULL;
 
     crm_trace("Reaping unseen nodes...");
     g_hash_table_iter_init(&iter, crm_peer_cache);
     while (g_hash_table_iter_next(&iter, NULL, (gpointer *)&node)) {
         if (node->last_seen != membership) {
             if (node->state) {
                 /*
                  * Calling crm_update_peer_state_iter() allows us to
                  * remove the node from crm_peer_cache without
                  * invalidating our iterator
                  */
                 crm_update_peer_state_iter(__FUNCTION__, node, CRM_NODE_LOST, membership, &iter);
 
             } else {
                 crm_info("State of node %s[%u] is still unknown",
                          node->uname, node->id);
             }
         }
     }
 }
 
 int
 crm_terminate_member(int nodeid, const char *uname, void *unused)
 {
     /* Always use the synchronous, non-mainloop version */
     return stonith_api_kick(nodeid, uname, 120, TRUE);
 }
 
 int
 crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection)
 {
     return stonith_api_kick(nodeid, uname, 120, TRUE);
 }