Page MenuHomeClusterLabs Projects

heartbeat.c
No OneTemporary

heartbeat.c

/*
* Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This software is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
/* put these first so that uuid_t is defined without conflicts */
#include <crm_internal.h>
#include <string.h>
#include <crm/crm.h>
#include <crm/cib.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <crm/cluster.h>
#include <crmd_messages.h>
#include <crmd_fsa.h>
#include <fsa_proto.h>
#include <crmd_callbacks.h>
#include <tengine.h>
#include <membership.h>
#include <ocf/oc_event.h>
#include <ocf/oc_membership.h>
/*
 * Forward declarations.  oc_ev_special is only reached in this file through
 * a pointer looked up by name with find_library_function(); the remaining
 * prototypes are for functions defined further down in this file.
 */
void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int);
void ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event);
gboolean crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data);
void crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data);
int ccm_dispatch(gpointer user_data);
/* NOTE(review): CCM_EVENT_DETAIL is not referenced by the code visible in this file */
#define CCM_EVENT_DETAIL 0
/* While this is 0, ccm_event_detail() also logs every current member and
 * checks whether our own node is among them */
#define CCM_EVENT_DETAIL_PARTIAL 0
/* CCM library entry points, resolved on first use:
 * oc_ev_callback_done in crmd_ccm_msg_callback(), oc_ev_handle_event in ccm_dispatch() */
int (*ccm_api_callback_done) (void *cookie) = NULL;
int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL;
/* CCM event token filled in by the registration call in do_ccm_control() */
static oc_ev_t *fsa_ev_token;
/* Handle passed to every find_library_function() call for CCM_LIBRARY
 * (presumably the cached library handle - TODO confirm) */
static void *ccm_library = NULL;
/* Number of failed connection attempts counted by do_ccm_control(); it is
 * never reset by the code visible here */
static int num_ccm_register_fails = 0;
/* Once this many attempts have failed, do_ccm_control() raises I_FAIL
 * instead of scheduling another retry */
static int max_ccm_register_fails = 30;
/*
 * Destroy callback installed (as .destroy) in the mainloop fd callbacks that
 * do_ccm_control() registers for the CCM connection.
 *
 * It currently performs no work; userdata is neither read nor modified.
 */
static void
ccm_connection_destroy(void *userdata)
{
    (void) userdata;            /* nothing to clean up */
}
/* A_CCM_CONNECT, A_CCM_DISCONNECT */
/*
 * Connect to and/or disconnect from the CCM membership service.
 *
 * All CCM work is skipped unless is_heartbeat_cluster() is true.
 *
 *  - A_CCM_DISCONNECT: set R_CCM_DISCONNECTED and unregister the CCM token.
 *  - A_CCM_CONNECT:    clear R_CCM_DISCONNECTED, register with the CCM,
 *                      install crmd_ccm_msg_callback() for OC_EV_MEMB_CLASS,
 *                      activate the token and add the resulting fd to the
 *                      mainloop with ccm_dispatch() as its dispatcher.
 *
 * If any connection step fails, the token is unregistered and the failure is
 * counted.  Below max_ccm_register_fails the wait timer is started and the
 * FSA is stalled; once the limit is reached I_FAIL is raised.  Both failure
 * paths return immediately.
 *
 * Any action bit other than the two above is logged as an error.
 *
 * cause, cur_state and current_input are not referenced directly here.
 */
void
do_ccm_control(long long action,
               enum crmd_fsa_cause cause,
               enum crmd_fsa_state cur_state,
               enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
    static struct mainloop_fd_callbacks ccm_fd_callbacks = {
        .dispatch = ccm_dispatch,
        .destroy = ccm_connection_destroy,
    };

    if (is_heartbeat_cluster()) {
        /* Look up the CCM client API by symbol name */
        int (*ccm_api_register) (oc_ev_t ** token) =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register", 1);

        int (*ccm_api_set_callback) (const oc_ev_t * token,
                                     oc_ev_class_t class,
                                     oc_ev_callback_t * fn,
                                     oc_ev_callback_t ** prev_fn) =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback", 1);

        void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special", 1);

        int (*ccm_api_activate) (const oc_ev_t * token, int *fd) =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate", 1);

        int (*ccm_api_unregister) (oc_ev_t * token) =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister", 1);

        if (action & A_CCM_DISCONNECT) {
            set_bit(fsa_input_register, R_CCM_DISCONNECTED);
            (*ccm_api_unregister) (fsa_ev_token);
        }

        if (action & A_CCM_CONNECT) {
            int rc = 0;
            int ccm_fd = -1;
            gboolean failed = FALSE;

            /* Step 1: obtain a token */
            crm_trace("Registering with CCM");
            clear_bit(fsa_input_register, R_CCM_DISCONNECTED);
            rc = (*ccm_api_register) (&fsa_ev_token);
            if (rc != 0) {
                crm_warn("CCM registration failed");
                failed = TRUE;
            }

            /* Step 2: ask for membership events */
            if (!failed) {
                crm_trace("Setting up CCM callbacks");
                rc = (*ccm_api_set_callback) (fsa_ev_token, OC_EV_MEMB_CLASS,
                                              crmd_ccm_msg_callback, NULL);
                if (rc != 0) {
                    crm_warn("CCM callback not set");
                    failed = TRUE;
                }
            }

            /* Step 3: activate the token to obtain the fd to watch */
            if (!failed) {
                (*ccm_api_special) (fsa_ev_token, OC_EV_MEMB_CLASS, 0 /*don't care */ );

                crm_trace("Activating CCM token");
                rc = (*ccm_api_activate) (fsa_ev_token, &ccm_fd);
                if (rc != 0) {
                    crm_warn("CCM Activation failed");
                    failed = TRUE;
                }
            }

            if (failed) {
                num_ccm_register_fails++;
                (*ccm_api_unregister) (fsa_ev_token);

                if (num_ccm_register_fails >= max_ccm_register_fails) {
                    /* Out of attempts: escalate */
                    crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails);
                    register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);

                } else {
                    /* Start the wait timer and stall the FSA */
                    crm_warn("CCM Connection failed %d times (%d max)",
                             num_ccm_register_fails, max_ccm_register_fails);
                    crm_timer_start(wait_timer);
                    crmd_fsa_stall(FALSE);
                }
                return;
            }

            crm_info("CCM connection established... waiting for first callback");
            mainloop_add_fd("heartbeat-ccm", G_PRIORITY_HIGH, ccm_fd, fsa_ev_token,
                            &ccm_fd_callbacks);
        }
    }

    if (action & ~(A_CCM_CONNECT | A_CCM_DISCONNECT)) {
        crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__);
    }
}
/*
 * Log the contents of a CCM membership event.
 *
 * Emits one summary line (event name, instance and the member/new/lost counts
 * and array offsets), then one line per new node and one per lost node.
 * Unless CCM_EVENT_DETAIL_PARTIAL is non-zero, every current member is logged
 * as well and a warning is issued if fsa_our_uname is not among them.
 *
 * oc:    membership data; dereferenced unconditionally
 * event: event type, used only to obtain its name for the summary line
 */
void
ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event)
{
    int n;
    gboolean found_self = FALSE;

    crm_trace("-----------------------");
    crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, new_idx=%d, old_idx=%d",
             ccm_event_name(event),
             oc->m_instance,
             oc->m_n_member, oc->m_n_in, oc->m_n_out,
             oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx);

#if !CCM_EVENT_DETAIL_PARTIAL
    /* Current members start at offset m_memb_idx of m_array */
    for (n = 0; n < oc->m_n_member; n++) {
        crm_info("\tCURRENT: %s [nodeid=%d, born=%d]",
                 oc->m_array[oc->m_memb_idx + n].node_uname,
                 oc->m_array[oc->m_memb_idx + n].node_id,
                 oc->m_array[oc->m_memb_idx + n].node_born_on);

        if (safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx + n].node_uname)) {
            found_self = TRUE;
        }
    }

    if (!found_self) {
        crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST");
    }
#endif

    /* Newly joined nodes start at offset m_in_idx */
    for (n = 0; n < (int)oc->m_n_in; n++) {
        crm_info("\tNEW: %s [nodeid=%d, born=%d]",
                 oc->m_array[oc->m_in_idx + n].node_uname,
                 oc->m_array[oc->m_in_idx + n].node_id,
                 oc->m_array[oc->m_in_idx + n].node_born_on);
    }

    /* Lost nodes start at offset m_out_idx */
    for (n = 0; n < (int)oc->m_n_out; n++) {
        crm_info("\tLOST: %s [nodeid=%d, born=%d]",
                 oc->m_array[oc->m_out_idx + n].node_uname,
                 oc->m_array[oc->m_out_idx + n].node_id,
                 oc->m_array[oc->m_out_idx + n].node_born_on);
    }

    crm_trace("-----------------------");
}
/* A_CCM_UPDATE_CACHE */
/*
 * Apply a CCM membership event to the peer cache.
 *
 * cause:     forwarded to the error registration if we were evicted
 * cur_state: the event is ignored in S_STOPPING, S_TERMINATE and S_HALT;
 *            in S_ELECTION an A_ELECTION_CHECK action is registered first
 * event:     CCM event type
 * oc:        membership data; must be non-NULL on heartbeat clusters
 * xml:       not used by this function
 *
 * On heartbeat clusters the event is logged, every lost node is marked
 * CRM_NODE_LOST, every current member is marked CRM_NODE_MEMBER, and client
 * status is requested for crm_system_name.  On OC_EV_MS_EVICTED our own peer
 * entry is marked CRM_NODE_EVICTED and I_ERROR is raised.  The function ends
 * by calling post_cache_update() with the membership instance.
 */
void
do_ccm_update_cache(enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state,
                    oc_ed_t event, const oc_ev_membership_t * oc, xmlNode * xml)
{
    unsigned int n = 0;
    unsigned long long instance = 0;

    if (is_heartbeat_cluster()) {
        CRM_ASSERT(oc != NULL);
        instance = oc->m_instance;
    }

    /* The membership instance must never move backwards */
    CRM_ASSERT(crm_peer_seq <= instance);

    if (cur_state == S_STOPPING || cur_state == S_TERMINATE || cur_state == S_HALT) {
        crm_debug("Ignoring %s CCM event %llu, we're in state %s",
                  ccm_event_name(event), instance, fsa_state2string(cur_state));
        return;
    }

    if (cur_state == S_ELECTION) {
        register_fsa_action(A_ELECTION_CHECK);
    }

    if (is_heartbeat_cluster()) {
        ccm_event_detail(oc, event);

        /*--*-- Recently Dead Member Nodes --*--*/
        n = 0;
        while (n < oc->m_n_out) {
            crm_update_ccm_node(oc, n + oc->m_out_idx, CRM_NODE_LOST, instance);
            n++;
        }

        /*--*-- All Member Nodes --*--*/
        n = 0;
        while (n < oc->m_n_member) {
            crm_update_ccm_node(oc, n + oc->m_memb_idx, CRM_NODE_MEMBER, instance);
            n++;
        }

        heartbeat_cluster->llc_ops->client_status(heartbeat_cluster, NULL, crm_system_name, 0);
    }

    if (event == OC_EV_MS_EVICTED) {
        crm_node_t *self = crm_get_peer(0, fsa_our_uname);

        crm_update_peer_state(__FUNCTION__, self, CRM_NODE_EVICTED, 0);

        /* todo: drop back to S_PENDING instead */
        /* get out... NOW!
         *
         * go via the error recovery process so that HA will
         * restart us if required
         */
        register_fsa_error_adv(cause, I_ERROR, NULL, NULL, __FUNCTION__);
    }

    post_cache_update(instance);
}
/*
 * Mainloop dispatch callback for the CCM connection fd.
 *
 * user_data is the CCM token that do_ccm_control() passed to
 * mainloop_add_fd().  The token is handed to the CCM library's
 * oc_ev_handle_event entry point, which is looked up on first use.
 *
 * Returns 0 if the library call succeeded and -1 otherwise.  The FSA is
 * triggered in both cases.
 */
int
ccm_dispatch(gpointer user_data)
{
    oc_ev_t *token = (oc_ev_t *) user_data;
    int rc = 0;

    crm_trace("Invoked");

    if (ccm_api_handle_event == NULL) {
        ccm_api_handle_event =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event", 1);
    }

    rc = (*ccm_api_handle_event) (token);

    /* A failure is only reported while R_CCM_DISCONNECTED is clear; once that
     * bit is set (do_ccm_control() sets it on A_CCM_DISCONNECT) the failure
     * is treated as expected and passes silently */
    if (rc != 0 && !is_set(fsa_input_register, R_CCM_DISCONNECTED)) {
        register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL);
        crm_err("CCM connection appears to have failed: rc=%d.", rc);
    }

    trigger_fsa(fsa_source);

    return (rc != 0) ? -1 : 0;
}
/*
 * Callback invoked for CCM membership events (installed for OC_EV_MEMB_CLASS
 * by do_ccm_control()).
 *
 * event:  the CCM event type
 * cookie: opaque value handed back to oc_ev_callback_done when we are finished
 * size:   not used by this function
 * data:   the membership data (oc_ev_membership_t); must not be NULL
 *
 * Depending on the event, the quorum state and/or the peer cache is updated.
 * A membership instance lower than crm_peer_seq is logged and then trips an
 * assertion.
 */
void
crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data)
{
    const oc_ev_membership_t *membership = data;
    gboolean refresh_cache = FALSE;
    gboolean refresh_quorum = FALSE;

    crm_trace("Invoked");
    CRM_ASSERT(data != NULL);

    crm_info("Quorum %s after event=%s (id=%d)",
             ccm_have_quorum(event) ? "(re)attained" : "lost",
             ccm_event_name(event), membership->m_instance);

    if (crm_peer_seq > membership->m_instance) {
        crm_err("Membership instance ID went backwards! %llu->%d",
                crm_peer_seq, membership->m_instance);
        CRM_ASSERT(crm_peer_seq <= membership->m_instance);
        return;
    }

    /*
     * OC_EV_MS_NEW_MEMBERSHIP:   membership with quorum
     * OC_EV_MS_INVALID:          membership without quorum
     * OC_EV_MS_NOT_PRIMARY:      previous membership no longer valid
     * OC_EV_MS_PRIMARY_RESTORED: previous membership restored
     * OC_EV_MS_EVICTED:          the client is evicted from ccm.
     */
    switch (event) {
        case OC_EV_MS_NEW_MEMBERSHIP:
        case OC_EV_MS_INVALID:
            refresh_cache = TRUE;
            refresh_quorum = TRUE;
            break;

        case OC_EV_MS_NOT_PRIMARY:
            /* nothing to update */
            break;

        case OC_EV_MS_PRIMARY_RESTORED:
            refresh_cache = TRUE;
            crm_peer_seq = membership->m_instance;
            break;

        case OC_EV_MS_EVICTED:
            refresh_quorum = TRUE;
            register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL);
            crm_err("Shutting down after CCM event: %s", ccm_event_name(event));
            break;

        default:
            crm_err("Unknown CCM event: %d", event);
            break;
    }

    if (refresh_quorum) {
        crm_have_quorum = ccm_have_quorum(event);

        /* did we just lose quorum? */
        if (crm_have_quorum == FALSE && fsa_has_quorum) {
            crm_info("Quorum lost: %s", ccm_event_name(event));
        }
        crm_update_quorum(crm_have_quorum, FALSE);
    }

    if (refresh_cache) {
        crm_trace("Updating cache after event %s", ccm_event_name(event));
        do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, membership, NULL);

    } else if (event != OC_EV_MS_NOT_PRIMARY) {
        /* No cache update: just record the instance and cancel the
         * transition engine's work */
        crm_peer_seq = membership->m_instance;
        register_fsa_action(A_TE_CANCEL);
    }

    /* Hand the cookie back to the CCM library (entry point looked up on first use) */
    if (ccm_api_callback_done == NULL) {
        ccm_api_callback_done =
            find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done", 1);
    }
    (*ccm_api_callback_done) (cookie);
}
/*
 * Handle a node status change.
 *
 * node:    name of the node whose status changed
 * status:  the new status string
 * private: not used by this function
 *
 * The peer entry for the node is looked up before anything else.  Updates
 * with status PINGSTATUS are otherwise ignored.  DEADSTATUS marks both the
 * crmd and heartbeat processes of the peer offline; any other status marks
 * the heartbeat process online.  The FSA is then triggered and, if we are the
 * DC, a node-state update for the peer is sent to the CIB status section.
 */
void
crmd_ha_status_callback(const char *node, const char *status, void *private)
{
    crm_node_t *peer = NULL;

    crm_notice("Status update: Node %s now has status [%s]", node, status);

    peer = crm_get_peer(0, node);
    if (safe_str_eq(status, PINGSTATUS)) {
        return;
    }

    if (safe_str_eq(status, DEADSTATUS)) {
        /* this node is toast */
        crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd|crm_proc_heartbeat, OFFLINESTATUS);

    } else {
        crm_update_peer_proc(__FUNCTION__, peer, crm_proc_heartbeat, ONLINESTATUS);
    }

    trigger_fsa(fsa_source);

    if (AM_I_DC) {
        xmlNode *node_state = create_node_state_update(peer, node_update_cluster, NULL,
                                                       __FUNCTION__);

        fsa_cib_anon_update(XML_CIB_TAG_STATUS, node_state,
                            cib_scope_local | cib_quorum_override | cib_can_create);
        free_xml(node_state);
    }
}
/*
 * Handle a client status change on a node.
 *
 * node:    name of the node the client runs on
 * client:  client name; anything other than CRM_SYSTEM_CRMD is ignored
 * status:  the announced status string
 * private: not used by this function
 *
 * Status from a peer whose state is not CRM_NODE_MEMBER is logged and
 * dropped.  Otherwise R_PEER_DATA is set, JOINSTATUS/LEAVESTATUS are mapped
 * to ONLINESTATUS/OFFLINESTATUS, the peer's cached uuid is discarded when the
 * peer comes online, and the peer's crmd process status is updated.  If we
 * are the DC, a node-state update for the peer is sent to the CIB status
 * section.
 */
void
crmd_client_status_callback(const char *node, const char *client, const char *status, void *private)
{
    crm_node_t *peer = NULL;
    const char *proc_status = NULL;

    crm_trace("Invoked");
    if (safe_str_neq(client, CRM_SYSTEM_CRMD)) {
        return;
    }

    peer = crm_get_peer(0, node);
    if (safe_str_neq(peer->state, CRM_NODE_MEMBER)) {
        crm_warn("This peer is not a ccm member (yet). "
                 "Status ignored: Client %s/%s announced status [%s] (DC=%s)",
                 node, client, status, AM_I_DC ? "true" : "false");
        return;
    }

    set_bit(fsa_input_register, R_PEER_DATA);
    crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)",
               node, client, status, AM_I_DC ? "true" : "false");

    /* rest of the code, especially crm_update_peer_proc,
     * does not know about JOINSTATUS, but expects ONLINESTATUS.
     * See also cib/callbacks.c */
    proc_status = status;
    if (safe_str_eq(proc_status, JOINSTATUS)) {
        proc_status = ONLINESTATUS;

    } else if (safe_str_eq(proc_status, LEAVESTATUS)) {
        proc_status = OFFLINESTATUS;
    }

    if (safe_str_eq(proc_status, ONLINESTATUS)) {
        /* remove the cached value in case it changed */
        crm_trace("Uncaching UUID for %s", node);
        free(peer->uuid);
        peer->uuid = NULL;
    }

    crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd, proc_status);

    if (AM_I_DC) {
        xmlNode *node_state = NULL;

        crm_trace("Got client status callback");
        node_state = create_node_state_update(peer, node_update_peer, NULL,
                                              __FUNCTION__);
        fsa_cib_anon_update(XML_CIB_TAG_STATUS, node_state,
                            cib_scope_local | cib_quorum_override | cib_can_create);
        free_xml(node_state);
    }
}
/*
 * Handle an inbound heartbeat message.
 *
 * hamsg:        the message; it is converted to XML before any processing
 * private_data: not used by this function
 *
 * Messages without an F_ORIG field are logged and dropped.  Everything is
 * ignored while there is no peer cache or no active peers.  A message from a
 * peer that is not active is dropped with a log entry; the entry is raised
 * from debug to warning level for votes, for join announcements when we are
 * the DC, and for anything sent by the DC subsystem.  Messages from active
 * peers go to crmd_ha_msg_filter().  The converted XML is freed on all paths.
 */
void
crmd_ha_msg_callback(HA_Message * hamsg, void *private_data)
{
    int level = LOG_DEBUG;
    crm_node_t *sender = NULL;
    xmlNode *msg = convert_ha_message(NULL, hamsg, __FUNCTION__);

    const char *from = crm_element_value(msg, F_ORIG);
    const char *op = crm_element_value(msg, F_CRM_TASK);
    const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);

    CRM_CHECK(from != NULL, crm_log_xml_err(msg, "anon"); goto bail);

    crm_trace("HA[inbound]: %s from %s", op, from);

    if (crm_peer_cache == NULL || crm_active_peers() == 0) {
        crm_debug("Ignoring HA messages until we are connected to the CCM (%s op from %s)",
                  op, from);
        crm_log_xml_trace(msg, "HA[inbound]: Ignore (No CCM)");
        goto bail;
    }

    sender = crm_get_peer(0, from);

    if (crm_is_peer_active(sender)) {
        crmd_ha_msg_filter(msg);

    } else {
        /* Same evaluation order as an if/else-if chain: stop at the first match */
        if (safe_str_eq(op, CRM_OP_VOTE)
            || (AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE))
            || safe_str_eq(sys_from, CRM_SYSTEM_DC)) {
            level = LOG_WARNING;
        }

        do_crm_log(level,
                   "Ignoring HA message (op=%s) from %s: not in our membership list (size=%d)",
                   op, from, crm_active_peers());
        crm_log_xml_trace(msg, "HA[inbound]: CCM Discard");
    }

  bail:
    free_xml(msg);
}
/*
 * Dispatch pending messages on the heartbeat connection.
 *
 * cluster_conn: the heartbeat connection; NULL is tolerated and reported
 * user_data:    not used by this function
 *
 * If the connection's IPC channel passes IPC_ISRCONN, rcvmsg() is called in
 * non-blocking mode to invoke the message callbacks.
 *
 * Returns TRUE while the channel exists and its status is IPC_CONNECT.
 * Otherwise the loss is logged (critical unless R_HA_DISCONNECTED is set, in
 * which case informational), the FSA is triggered and FALSE is returned.
 */
gboolean
crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data)
{
    IPC_Channel *channel = NULL;

    crm_trace("Invoked");

    if (cluster_conn != NULL) {
        channel = cluster_conn->llc_ops->ipcchan(cluster_conn);
    }

    CRM_CHECK(cluster_conn != NULL,;);
    CRM_CHECK(channel != NULL,;);

    if (channel != NULL && IPC_ISRCONN(channel)) {
        if (cluster_conn->llc_ops->msgready(cluster_conn) == 0) {
            crm_trace("no message ready yet");
        }
        /* invoke the callbacks but don't block */
        cluster_conn->llc_ops->rcvmsg(cluster_conn, 0);
    }

    if (channel != NULL && channel->ch_status == IPC_CONNECT) {
        return TRUE;
    }

    /* The connection is gone */
    if (is_set(fsa_input_register, R_HA_DISCONNECTED)) {
        crm_info("Lost connection to heartbeat service.");
    } else {
        crm_crit("Lost connection to heartbeat service.");
    }

    trigger_fsa(fsa_source);
    return FALSE;
}

File Metadata

Mime Type
text/x-c
Expires
Sat, Jan 25, 5:53 AM (20 h, 3 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1321451
Default Alt Text
heartbeat.c (17 KB)

Event Timeline