diff --git a/cib/main.c b/cib/main.c index a2ba49b9bc..8cd1e222fc 100644 --- a/cib/main.c +++ b/cib/main.c @@ -1,598 +1,598 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if HAVE_LIBXML2 # include #endif #ifdef HAVE_GETOPT_H # include #endif #if HAVE_BZLIB_H # include #endif extern int init_remote_listener(int port, gboolean encrypted); extern gboolean stand_alone; gboolean cib_shutdown_flag = FALSE; int cib_status = pcmk_ok; #if SUPPORT_HEARTBEAT oc_ev_t *cib_ev_token; ll_cluster_t *hb_conn = NULL; extern void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int); gboolean cib_register_ha(ll_cluster_t * hb_cluster, const char *client_name); #endif extern void terminate_cib(const char *caller, gboolean fast); GMainLoop *mainloop = NULL; const char *cib_root = CRM_CONFIG_DIR; char *cib_our_uname = NULL; gboolean preserve_status = FALSE; gboolean cib_writes_enabled = TRUE; int remote_fd = 0; int remote_tls_fd = 0; void usage(const char *cmd, int exit_status); int cib_init(void); void cib_shutdown(int nsig); gboolean startCib(const char *filename); extern int write_cib_contents(gpointer p); GHashTable *client_list = NULL; GHashTable *config_hash = NULL; GHashTable *local_notify_queue = NULL; char *channel1 = NULL; char *channel2 = NULL; char *channel3 = NULL; char *channel4 = NULL; char *channel5 = NULL; #define OPTARGS "maswr:V?" void cib_cleanup(void); static void cib_enable_writes(int nsig) { crm_info("(Re)enabling disk writes"); cib_writes_enabled = TRUE; } static void log_cib_client(gpointer key, gpointer value, gpointer user_data) { cib_client_t *a_client = value; crm_info("Client %s", crm_str(a_client->name)); } int main(int argc, char **argv) { int flag; int rc = 0; int argerr = 0; #ifdef HAVE_GETOPT_H int option_index = 0; /* *INDENT-OFF* */ static struct option long_options[] = { {"per-action-cib", 0, 0, 'a'}, {"stand-alone", 0, 0, 's'}, {"disk-writes", 0, 0, 'w'}, {"cib-root", 1, 0, 'r'}, {"verbose", 0, 0, 'V'}, {"help", 0, 0, '?'}, {"metadata", 0, 0, 'm'}, {0, 0, 0, 0} }; /* *INDENT-ON* */ #endif struct passwd *pwentry = NULL; crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); mainloop_add_signal(SIGTERM, cib_shutdown); mainloop_add_signal(SIGPIPE, cib_enable_writes); cib_writer = mainloop_add_trigger(G_PRIORITY_LOW, write_cib_contents, NULL); crm_peer_init(); client_list = g_hash_table_new(crm_str_hash, g_str_equal); while (1) { #ifdef HAVE_GETOPT_H flag = getopt_long(argc, argv, OPTARGS, long_options, &option_index); #else flag = getopt(argc, argv, OPTARGS); #endif if (flag == -1) break; switch (flag) { case 'V': crm_bump_log_level(); break; case 's': stand_alone = TRUE; preserve_status = TRUE; cib_writes_enabled = FALSE; pwentry = getpwnam(CRM_DAEMON_USER); CRM_CHECK(pwentry != NULL, crm_perror(LOG_ERR, "Invalid uid (%s) specified", CRM_DAEMON_USER); return 100); rc = setgid(pwentry->pw_gid); if (rc < 0) { crm_perror(LOG_ERR, "Could not set group to %d", pwentry->pw_gid); return 100; } rc = setuid(pwentry->pw_uid); if (rc < 0) { crm_perror(LOG_ERR, "Could not set user to %d", pwentry->pw_uid); return 100; } break; case '?': /* Help message */ usage(crm_system_name, EX_OK); break; case 'w': cib_writes_enabled = TRUE; break; case 'r': cib_root = optarg; break; case 'm': cib_metadata(); return 0; default: ++argerr; break; } } if (argc - optind == 1 && safe_str_eq("metadata", argv[optind])) { cib_metadata(); return 0; } if (optind > argc) { ++argerr; } if (argerr) { usage(crm_system_name, EX_USAGE); } if (crm_is_writable(cib_root, NULL, CRM_DAEMON_USER, CRM_DAEMON_GROUP, FALSE) == FALSE) { crm_err("Bad permissions on %s. Terminating", cib_root); fprintf(stderr, "ERROR: Bad permissions on %s. See logs for details\n", cib_root); fflush(stderr); return 100; } /* read local config file */ rc = cib_init(); CRM_CHECK(g_hash_table_size(client_list) == 0, crm_warn("Not all clients gone at exit")); g_hash_table_foreach(client_list, log_cib_client, NULL); cib_cleanup(); #if SUPPORT_HEARTBEAT if (hb_conn) { hb_conn->llc_ops->delete(hb_conn); } #endif crm_info("Done"); return rc; } void cib_cleanup(void) { crm_peer_destroy(); if (local_notify_queue) { g_hash_table_destroy(local_notify_queue); } g_hash_table_destroy(config_hash); g_hash_table_destroy(client_list); free(cib_our_uname); #if HAVE_LIBXML2 crm_xml_cleanup(); #endif free(channel1); free(channel2); free(channel3); free(channel4); free(channel5); } unsigned long cib_num_ops = 0; const char *cib_stat_interval = "10min"; unsigned long cib_num_local = 0, cib_num_updates = 0, cib_num_fail = 0; unsigned long cib_bad_connects = 0, cib_num_timeouts = 0; #if SUPPORT_HEARTBEAT gboolean ccm_connect(void); static void ccm_connection_destroy(gpointer user_data) { crm_err("CCM connection failed... blocking while we reconnect"); CRM_ASSERT(ccm_connect()); return; } static void *ccm_library = NULL; gboolean ccm_connect(void) { gboolean did_fail = TRUE; int num_ccm_fails = 0; int max_ccm_fails = 30; int ret; int cib_ev_fd; int (*ccm_api_register) (oc_ev_t ** token) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register", 1); int (*ccm_api_set_callback) (const oc_ev_t * token, oc_ev_class_t class, oc_ev_callback_t * fn, oc_ev_callback_t ** prev_fn) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback", 1); void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special", 1); int (*ccm_api_activate) (const oc_ev_t * token, int *fd) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate", 1); int (*ccm_api_unregister) (oc_ev_t * token) = find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister", 1); static struct mainloop_fd_callbacks ccm_fd_callbacks = { .dispatch = cib_ccm_dispatch, .destroy = ccm_connection_destroy, }; while (did_fail) { did_fail = FALSE; crm_info("Registering with CCM..."); ret = (*ccm_api_register) (&cib_ev_token); if (ret != 0) { did_fail = TRUE; } if (did_fail == FALSE) { crm_trace("Setting up CCM callbacks"); ret = (*ccm_api_set_callback) (cib_ev_token, OC_EV_MEMB_CLASS, cib_ccm_msg_callback, NULL); if (ret != 0) { crm_warn("CCM callback not set"); did_fail = TRUE; } } if (did_fail == FALSE) { (*ccm_api_special) (cib_ev_token, OC_EV_MEMB_CLASS, 0); crm_trace("Activating CCM token"); ret = (*ccm_api_activate) (cib_ev_token, &cib_ev_fd); if (ret != 0) { crm_warn("CCM Activation failed"); did_fail = TRUE; } } if (did_fail) { num_ccm_fails++; (*ccm_api_unregister) (cib_ev_token); if (num_ccm_fails < max_ccm_fails) { crm_warn("CCM Connection failed %d times (%d max)", num_ccm_fails, max_ccm_fails); sleep(3); } else { crm_err("CCM Activation failed %d (max) times", num_ccm_fails); return FALSE; } } } crm_debug("CCM Activation passed... all set to go!"); mainloop_add_fd("heartbeat-ccm", cib_ev_fd, cib_ev_token, &ccm_fd_callbacks); return TRUE; } #endif #if SUPPORT_COROSYNC static gboolean cib_ais_dispatch(AIS_Message * wrapper, char *data, int sender) { xmlNode *xml = NULL; if (wrapper->header.id == crm_class_cluster) { xml = string2xml(data); if (xml == NULL) { goto bail; } crm_xml_add(xml, F_ORIG, wrapper->sender.uname); crm_xml_add_int(xml, F_SEQ, wrapper->id); cib_peer_callback(xml, NULL); } free_xml(xml); return TRUE; bail: crm_err("Invalid XML: '%.120s'", data); return TRUE; } static void cib_ais_destroy(gpointer user_data) { if (cib_shutdown_flag) { crm_info("Corosync disconnection complete"); } else { crm_err("Corosync connection lost! Exiting."); terminate_cib(__FUNCTION__, TRUE); } } #endif static void cib_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { #if 0 /* crm_active_peers(crm_proc_cib) appears to give the wrong answer * sometimes, this might help figure out why */ if(type == crm_status_nstate) { crm_info("status: %s is now %s (was %s)", node->uname, node->state, (const char *)data); - if (safe_str_eq(CRMD_STATE_ACTIVE, node->state)) { + if (safe_str_eq(CRMD_JOINSTATE_MEMBER, node->state)) { return; } } else if(type == crm_status_processes) { uint32_t old = 0; if (data) { old = *(const uint32_t *)data; } if ((node->processes ^ old) & crm_proc_cib) { crm_info("status: cib process on %s is now %sactive", node->uname, is_set(node->processes, crm_proc_cib)?"":"in"); } else { return; } } else { return; } #endif if(cib_shutdown_flag && crm_active_peers() < 2 && g_hash_table_size(client_list) == 0) { crm_info("No more peers"); terminate_cib(__FUNCTION__, FALSE); } } #if SUPPORT_HEARTBEAT static void cib_ha_connection_destroy(gpointer user_data) { if (cib_shutdown_flag) { crm_info("Heartbeat disconnection complete... exiting"); terminate_cib(__FUNCTION__, FALSE); } else { crm_err("Heartbeat connection lost! Exiting."); terminate_cib(__FUNCTION__, TRUE); } } #endif int cib_init(void) { config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if (startCib("cib.xml") == FALSE) { crm_crit("Cannot start CIB... terminating"); exit(1); } if (stand_alone == FALSE) { void *dispatch = NULL; void *destroy = NULL; if (is_openais_cluster()) { #if SUPPORT_COROSYNC destroy = cib_ais_destroy; dispatch = cib_ais_dispatch; #endif } else if(is_heartbeat_cluster()) { #if SUPPORT_HEARTBEAT dispatch = cib_ha_peer_callback; destroy = cib_ha_connection_destroy; #endif } if (crm_cluster_connect(&cib_our_uname, NULL, dispatch, destroy, #if SUPPORT_HEARTBEAT &hb_conn #else NULL #endif ) == FALSE) { crm_crit("Cannot sign in to the cluster... terminating"); exit(100); } if (is_openais_cluster()) { crm_set_status_callback(&cib_peer_update_callback); } #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { gboolean was_error = FALSE; if (was_error == FALSE) { if (HA_OK != hb_conn->llc_ops->set_cstatus_callback(hb_conn, cib_client_status_callback, hb_conn)) { crm_err("Cannot set cstatus callback: %s", hb_conn->llc_ops->errmsg(hb_conn)); was_error = TRUE; } } if (was_error == FALSE) { was_error = (ccm_connect() == FALSE); } if (was_error == FALSE) { /* Async get client status information in the cluster */ crm_info("Requesting the list of configured nodes"); hb_conn->llc_ops->client_status(hb_conn, NULL, CRM_SYSTEM_CIB, -1); } } #endif } else { cib_our_uname = strdup("localhost"); } ipcs_ro = mainloop_add_ipc_server(cib_channel_ro, QB_IPC_NATIVE, &ipc_ro_callbacks); ipcs_rw = mainloop_add_ipc_server(cib_channel_rw, QB_IPC_NATIVE, &ipc_rw_callbacks); ipcs_shm = mainloop_add_ipc_server(cib_channel_shm, QB_IPC_SHM, &ipc_rw_callbacks); if (stand_alone) { cib_is_master = TRUE; } if (ipcs_ro != NULL && ipcs_rw != NULL && ipcs_shm != NULL) { /* Create the mainloop and run it... */ mainloop = g_main_new(FALSE); crm_info("Starting %s mainloop", crm_system_name); g_main_run(mainloop); } else { crm_err("Couldnt start all IPC channels, exiting."); return -1; } qb_ipcs_destroy(ipcs_ro); qb_ipcs_destroy(ipcs_rw); qb_ipcs_destroy(ipcs_shm); return 0; } void usage(const char *cmd, int exit_status) { FILE *stream; stream = exit_status ? stderr : stdout; fprintf(stream, "usage: %s [-%s]\n", cmd, OPTARGS); fprintf(stream, "\t--%s (-%c)\t\tTurn on debug info." " Additional instances increase verbosity\n", "verbose", 'V'); fprintf(stream, "\t--%s (-%c)\t\tThis help message\n", "help", '?'); fprintf(stream, "\t--%s (-%c)\t\tShow configurable cib options\n", "metadata", 'm'); fprintf(stream, "\t--%s (-%c)\tAdvanced use only\n", "per-action-cib", 'a'); fprintf(stream, "\t--%s (-%c)\tAdvanced use only\n", "stand-alone", 's'); fprintf(stream, "\t--%s (-%c)\tAdvanced use only\n", "disk-writes", 'w'); fprintf(stream, "\t--%s (-%c)\t\tAdvanced use only\n", "cib-root", 'r'); fflush(stream); exit(exit_status); } gboolean startCib(const char *filename) { gboolean active = FALSE; xmlNode *cib = readCibXmlFile(cib_root, filename, !preserve_status); CRM_ASSERT(cib != NULL); if (activateCibXml(cib, TRUE, "start") == 0) { int port = 0; const char *port_s = NULL; active = TRUE; cib_read_config(config_hash, cib); port_s = crm_element_value(cib, "remote-tls-port"); if (port_s) { port = crm_parse_int(port_s, "0"); remote_tls_fd = init_remote_listener(port, TRUE); } port_s = crm_element_value(cib, "remote-clear-port"); if (port_s) { port = crm_parse_int(port_s, "0"); remote_fd = init_remote_listener(port, FALSE); } crm_info("CIB Initialization completed successfully"); } return active; } diff --git a/crmd/callbacks.c b/crmd/callbacks.c index f25c802b57..5953935053 100644 --- a/crmd/callbacks.c +++ b/crmd/callbacks.c @@ -1,236 +1,240 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include void crmd_ha_connection_destroy(gpointer user_data); /* From join_dc... */ extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); void crmd_ha_connection_destroy(gpointer user_data) { crm_trace("Invoked"); if (is_set(fsa_input_register, R_HA_DISCONNECTED)) { /* we signed out, so this is expected */ crm_info("Heartbeat disconnection complete"); return; } crm_crit("Lost connection to heartbeat service!"); register_fsa_input(C_HA_DISCONNECT, I_ERROR, NULL); trigger_fsa(fsa_source); } void crmd_ha_msg_filter(xmlNode * msg) { if (AM_I_DC) { const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) { const char *from = crm_element_value(msg, F_ORIG); if (safe_str_neq(from, fsa_our_uname)) { int level = LOG_INFO; const char *op = crm_element_value(msg, F_CRM_TASK); /* make sure the election happens NOW */ if (fsa_state != S_ELECTION) { ha_msg_input_t new_input; level = LOG_WARNING; new_input.msg = msg; register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input, __FUNCTION__); } do_crm_log(level, "Another DC detected: %s (op=%s)", from, op); goto done; } } } else { const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); if (safe_str_eq(sys_to, CRM_SYSTEM_DC)) { return; } } /* crm_log_xml_trace("HA[inbound]", msg); */ route_message(C_HA_MESSAGE, msg); done: trigger_fsa(fsa_source); } extern gboolean process_lrm_event(lrmd_event_data_t * op); void lrm_op_callback(lrmd_event_data_t * op) { CRM_CHECK(op != NULL, return); process_lrm_event(op); } static void crmd_proc_update(crm_node_t * member, enum crm_proc_flag client) { const char *status = NULL; CRM_CHECK(member != NULL, return); status = (member->processes & client) ? ONLINESTATUS : OFFLINESTATUS; crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)", member->uname, peer2text(client), status, AM_I_DC ? "true" : crm_str(fsa_our_dc)); if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { return; } else if (fsa_state == S_STOPPING) { return; } if (safe_str_eq(member->uname, fsa_our_dc) && crm_is_peer_active(member) == FALSE) { /* Did the DC leave us? */ crm_info("Got client status callback - our DC is dead"); register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL); } else if (AM_I_DC) { - xmlNode *update = NULL; - - update = create_node_state(member->uname, - NULL, status, NULL, NULL, FALSE, __FUNCTION__); - - fsa_cib_anon_update(XML_CIB_TAG_STATUS, update, - cib_scope_local | cib_quorum_override | cib_can_create); +#if 0 + /* Everyone */ + populate_cib_nodes(node_update_quick|node_update_peer, __FUNCTION__); +#else + /* Just one */ + xmlNode *update = create_node_state( + member->uname, NULL, status, NULL, NULL, FALSE, __FUNCTION__); + + fsa_cib_anon_update( + XML_CIB_TAG_STATUS, update, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(update); +#endif if ((member->processes & client) == 0) { erase_node_from_join(member->uname); check_join_state(fsa_state, __FUNCTION__); fail_incompletable_actions(transition_graph, member->uuid); } else { register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } } trigger_fsa(fsa_source); } void peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { gboolean reset_status_entry = FALSE; uint32_t old = 0; set_bit(fsa_input_register, R_PEER_DATA); if (node->uname == NULL) { return; } switch (type) { case crm_status_uname: crm_info("status: %s is now %s", node->uname, node->state); /* reset_status_entry = TRUE; */ /* If we've never seen the node, then it also wont be in the status section */ break; case crm_status_nstate: crm_info("status: %s is now %s (was %s)", node->uname, node->state, (const char *)data); reset_status_entry = TRUE; break; case crm_status_processes: if (data) { old = *(const uint32_t *)data; } if ((node->processes ^ old) & proc_flags) { crmd_proc_update(node, proc_flags); } break; } /* Can this be removed now that do_cl_join_finalize_respond() does the same thing? */ - if (AM_I_DC && reset_status_entry && safe_str_eq(CRMD_STATE_ACTIVE, node->state)) { + if (AM_I_DC && reset_status_entry && safe_str_eq(CRMD_JOINSTATE_MEMBER, node->state)) { crm_action_t *down = match_down_event(0, node->uname, NULL); erase_status_tag(node->uname, XML_CIB_TAG_LRM, cib_scope_local); erase_status_tag(node->uname, XML_TAG_TRANSIENT_NODEATTRS, cib_scope_local); if (down) { const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRM_OP_FENCE)) { crm_info("Node return implies stonith of %s (action %d) completed", node->uname, down->id); down->confirmed = TRUE; } } /* TODO: potentially we also want to set XML_NODE_JOIN_STATE and XML_NODE_EXPECTED here */ } } void crmd_cib_connection_destroy(gpointer user_data) { CRM_CHECK(user_data == fsa_cib_conn,;); crm_trace("Invoked"); trigger_fsa(fsa_source); fsa_cib_conn->state = cib_disconnected; if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Connection to the CIB terminated..."); return; } /* eventually this will trigger a reconnect, not a shutdown */ crm_err("Connection to the CIB terminated..."); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit(fsa_input_register, R_CIB_CONNECTED); return; } gboolean crm_fsa_trigger(gpointer user_data) { crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue)); s_crmd_fsa(C_FSA_INTERNAL); crm_trace("Exited (queue len: %d)", g_list_length(fsa_message_queue)); return TRUE; } diff --git a/crmd/cib.c b/crmd/cib.c index 75799c20ad..40b5aee720 100644 --- a/crmd/cib.c +++ b/crmd/cib.c @@ -1,217 +1,216 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include /* for access */ #include /* for calls to open */ #include /* for calls to open */ #include /* for calls to open */ #include /* for getpwuid */ #include /* for initgroups */ #include /* for getrlimit */ #include /* for getrlimit */ #include #include #include #include #include #include #include struct crm_subsystem_s *cib_subsystem = NULL; int cib_retries = 0; static void do_cib_updated(const char *event, xmlNode * msg) { int rc = -1; CRM_CHECK(msg != NULL, return); crm_element_value_int(msg, F_CIB_RC, &rc); if (rc < pcmk_ok) { crm_trace("Filter rc=%d (%s)", rc, pcmk_strerror(rc)); return; } if (get_xpath_object ("//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_CRMCONFIG, msg, LOG_TRACE) != NULL) { mainloop_set_trigger(config_read); } } static void revision_check_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { int cmp = -1; const char *revision = NULL; xmlNode *generation = NULL; if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } generation = output; CRM_CHECK(safe_str_eq(crm_element_name(generation), XML_TAG_CIB), crm_log_xml_err(output, __FUNCTION__); return); crm_trace("Checking our feature revision is allowed: %s", CIB_FEATURE_SET); revision = crm_element_value(generation, XML_ATTR_CRM_VERSION); cmp = compare_version(revision, CRM_FEATURE_SET); if (cmp > 0) { crm_err("This build (%s) does not support the current" " resource configuration", VERSION); crm_err("We can only support up to CRM feature set %s (current=%s)", CRM_FEATURE_SET, revision); crm_err("Shutting down the CRM"); /* go into a stall state */ register_fsa_error_adv(C_FSA_INTERNAL, I_SHUTDOWN, NULL, NULL, __FUNCTION__); return; } } static void do_cib_replaced(const char *event, xmlNode * msg) { crm_debug("Updating the CIB after a replace: DC=%s", AM_I_DC ? "true" : "false"); if (AM_I_DC == FALSE) { return; } else if (fsa_state == S_FINALIZE_JOIN && is_set(fsa_input_register, R_CIB_ASKED)) { /* no need to restart the join - we asked for this replace op */ return; } /* start the join process again so we get everyone's LRM status */ - populate_cib_nodes(TRUE); - do_update_cib_nodes(TRUE, __FUNCTION__); + populate_cib_nodes(node_update_quick|node_update_cluster|node_update_peer|node_update_join, __FUNCTION__); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } /* A_CIB_STOP, A_CIB_START, A_CIB_RESTART, */ void do_cib_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { struct crm_subsystem_s *this_subsys = cib_subsystem; long long stop_actions = A_CIB_STOP; long long start_actions = A_CIB_START; if (action & stop_actions) { crm_info("Disconnecting CIB"); clear_bit(fsa_input_register, R_CIB_CONNECTED); CRM_ASSERT(fsa_cib_conn != NULL); fsa_cib_conn->cmds->del_notify_callback(fsa_cib_conn, T_CIB_DIFF_NOTIFY, do_cib_updated); if (fsa_cib_conn->state != cib_disconnected) { fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); fsa_cib_conn->cmds->signoff(fsa_cib_conn); } } if (action & start_actions) { int rc = pcmk_ok; CRM_ASSERT(fsa_cib_conn != NULL); if (cur_state == S_STOPPING) { crm_err("Ignoring request to start %s after shutdown", this_subsys->name); return; } rc = fsa_cib_conn->cmds->signon(fsa_cib_conn, CRM_SYSTEM_CRMD, cib_command_nonblocking); if (rc != pcmk_ok) { /* a short wait that usually avoids stalling the FSA */ sleep(1); rc = fsa_cib_conn->cmds->signon(fsa_cib_conn, CRM_SYSTEM_CRMD, cib_command_nonblocking); } if (rc != pcmk_ok) { crm_info("Could not connect to the CIB service: %s", pcmk_strerror(rc)); } else if (pcmk_ok != fsa_cib_conn->cmds->set_connection_dnotify(fsa_cib_conn, crmd_cib_connection_destroy)) { crm_err("Could not set dnotify callback"); } else if (pcmk_ok != fsa_cib_conn->cmds->add_notify_callback(fsa_cib_conn, T_CIB_REPLACE_NOTIFY, do_cib_replaced)) { crm_err("Could not set CIB notification callback (replace)"); } else if (pcmk_ok != fsa_cib_conn->cmds->add_notify_callback(fsa_cib_conn, T_CIB_DIFF_NOTIFY, do_cib_updated)) { crm_err("Could not set CIB notification callback (update)"); } else { set_bit(fsa_input_register, R_CIB_CONNECTED); } if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { cib_retries++; crm_warn("Couldn't complete CIB registration %d" " times... pause and retry", cib_retries); if (cib_retries < 30) { crm_timer_start(wait_timer); crmd_fsa_stall(NULL); } else { crm_err("Could not complete CIB" " registration %d times..." " hard error", cib_retries); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } else { int call_id = 0; crm_info("CIB connection established"); call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, revision_check_callback); cib_retries = 0; } } } diff --git a/crmd/control.c b/crmd/control.c index ed0663337c..96ca88ca11 100644 --- a/crmd/control.c +++ b/crmd/control.c @@ -1,961 +1,914 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include qb_ipcs_service_t *ipcs = NULL; extern gboolean crm_connect_corosync(void); extern void crmd_ha_connection_destroy(gpointer user_data); void crm_shutdown(int nsig); gboolean crm_read_options(gpointer user_data); gboolean fsa_has_quorum = FALSE; GHashTable *ipc_clients = NULL; crm_trigger_t *fsa_source = NULL; crm_trigger_t *config_read = NULL; /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; if (action & A_HA_DISCONNECT) { if (is_openais_cluster()) { crm_peer_destroy(); #if SUPPORT_COROSYNC terminate_ais_connection(); #endif crm_info("Disconnected from OpenAIS"); #if SUPPORT_HEARTBEAT } else if (fsa_cluster_conn != NULL) { set_bit(fsa_input_register, R_HA_DISCONNECTED); fsa_cluster_conn->llc_ops->signoff(fsa_cluster_conn, FALSE); crm_info("Disconnected from Heartbeat"); #endif } } if (action & A_HA_CONNECT) { crm_set_status_callback(&peer_update_callback); if (is_openais_cluster()) { #if SUPPORT_COROSYNC registered = crm_connect_corosync(); #endif } else if (is_heartbeat_cluster()) { #if SUPPORT_HEARTBEAT registered = crm_cluster_connect(&fsa_our_uname, &fsa_our_uuid, crmd_ha_msg_callback, crmd_ha_connection_destroy, &fsa_cluster_conn); #endif } #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { crm_trace("Be informed of Node Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_nstatus_callback(fsa_cluster_conn, crmd_ha_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set nstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } crm_trace("Be informed of CRM Client Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_cstatus_callback(fsa_cluster_conn, crmd_client_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set cstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } if (registered) { crm_trace("Requesting an initial dump of CRMD client_status"); fsa_cluster_conn->llc_ops->client_status(fsa_cluster_conn, NULL, CRM_SYSTEM_CRMD, -1); } } #endif if (registered == FALSE) { set_bit(fsa_input_register, R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } + populate_cib_nodes(node_update_none, __FUNCTION__); clear_bit(fsa_input_register, R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ set_bit(fsa_input_register, R_SHUTDOWN); if (is_heartbeat_cluster()) { if (is_set(fsa_input_register, pe_subsystem->flag_connected)) { crm_info("Terminating the %s", pe_subsystem->name); if (stop_subsystem(pe_subsystem, TRUE) == FALSE) { /* its gone... */ crm_err("Faking %s exit", pe_subsystem->name); clear_bit(fsa_input_register, pe_subsystem->flag_connected); } else { crm_info("Waiting for subsystems to exit"); crmd_fsa_stall(NULL); } } crm_info("All subsystems stopped, continuing"); } if (stonith_api) { /* Prevent it from comming up again */ clear_bit(fsa_input_register, R_ST_REQUIRED); crm_info("Disconnecting STONITH..."); stonith_api->cmds->disconnect(stonith_api); } } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; crm_info("Sending shutdown request to %s", crm_str(fsa_our_dc)); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); /* set_bit(fsa_input_register, R_STAYDOWN); */ if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } extern crm_ipc_t *attrd_ipc; extern char *max_generation_from; extern xmlNode *max_generation_xml; extern GHashTable *resource_history; extern GHashTable *voted; extern GHashTable *reload_hash; void log_connected_client(gpointer key, gpointer value, gpointer user_data); void log_connected_client(gpointer key, gpointer value, gpointer user_data) { crmd_client_t *client = value; crm_err("%s is still connected at exit", client->table_key); } static void free_mem(fsa_data_t * msg_data) { GListPtr gIter = NULL; crm_ipc_close(attrd_ipc); crm_ipc_destroy(attrd_ipc); g_main_loop_quit(crmd_mainloop); g_main_loop_unref(crmd_mainloop); #if SUPPORT_HEARTBEAT if (fsa_cluster_conn) { fsa_cluster_conn->llc_ops->delete(fsa_cluster_conn); fsa_cluster_conn = NULL; } #endif for(gIter = fsa_message_queue; gIter != NULL; gIter = gIter->next) { fsa_data_t *fsa_data = gIter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } g_list_free(fsa_message_queue); delete_fsa_input(msg_data); if (ipc_clients) { crm_debug("Number of connected clients: %d", g_hash_table_size(ipc_clients)); /* g_hash_table_foreach(ipc_clients, log_connected_client, NULL); */ g_hash_table_destroy(ipc_clients); } empty_uuid_cache(); crm_peer_destroy(); clear_bit(fsa_input_register, R_MEMBERSHIP); if (te_subsystem->client && te_subsystem->client->ipc) { crm_debug("Full destroy: TE"); qb_ipcs_disconnect(te_subsystem->client->ipc); } free(te_subsystem); if (pe_subsystem->client && pe_subsystem->client->ipc) { crm_debug("Full destroy: PE"); qb_ipcs_disconnect(pe_subsystem->client->ipc); } free(pe_subsystem); free(cib_subsystem); if (integrated_nodes) { g_hash_table_destroy(integrated_nodes); } if (finalized_nodes) { g_hash_table_destroy(finalized_nodes); } if (confirmed_nodes) { g_hash_table_destroy(confirmed_nodes); } if (reload_hash) { g_hash_table_destroy(reload_hash); } if (resource_history) { g_hash_table_destroy(resource_history); } if (voted) { g_hash_table_destroy(voted); } cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; if (fsa_lrm_conn) { lrmd_api_delete(fsa_lrm_conn); fsa_lrm_conn = NULL; } free(transition_timer); free(integration_timer); free(finalization_timer); free(election_trigger); free(election_timeout); free(shutdown_escalation_timer); free(wait_timer); free(recheck_timer); free(fsa_our_dc_version); free(fsa_our_uname); free(fsa_our_uuid); free(fsa_our_dc); free(max_generation_from); free_xml(max_generation_xml); crm_xml_cleanup(); } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int exit_code = 0; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { exit_code = 1; log_level = LOG_ERR; exit_type = "forcefully"; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the CRMd", fsa_action2string(action), exit_type); if (is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = 2; } if (is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn by Heartbeat"); exit_code = 100; } crm_info("[%s] stopped (%d)", crm_system_name, exit_code); free_mem(msg_data); exit(exit_code); } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int was_error = 0; int interval = 1; /* seconds between DC heartbeats */ crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); fsa_source = mainloop_add_trigger(G_PRIORITY_HIGH, crm_fsa_trigger, NULL); config_read = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); ipc_clients = g_hash_table_new(crm_str_hash, g_str_equal); crm_debug("Creating CIB and LRM objects"); fsa_cib_conn = cib_new(); fsa_lrm_conn = lrmd_api_new(); /* set up the timers */ transition_timer = calloc(1, sizeof(fsa_timer_t)); integration_timer = calloc(1, sizeof(fsa_timer_t)); finalization_timer = calloc(1, sizeof(fsa_timer_t)); election_trigger = calloc(1, sizeof(fsa_timer_t)); election_timeout = calloc(1, sizeof(fsa_timer_t)); shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t)); wait_timer = calloc(1, sizeof(fsa_timer_t)); recheck_timer = calloc(1, sizeof(fsa_timer_t)); interval = interval * 1000; if (election_trigger != NULL) { election_trigger->source_id = 0; election_trigger->period_ms = -1; election_trigger->fsa_input = I_DC_TIMEOUT; election_trigger->callback = crm_timer_popped; election_trigger->repeat = FALSE; } else { was_error = TRUE; } if (election_timeout != NULL) { election_timeout->source_id = 0; election_timeout->period_ms = -1; election_timeout->fsa_input = I_ELECTION_DC; election_timeout->callback = crm_timer_popped; election_timeout->repeat = FALSE; } else { was_error = TRUE; } if (transition_timer != NULL) { transition_timer->source_id = 0; transition_timer->period_ms = -1; transition_timer->fsa_input = I_PE_CALC; transition_timer->callback = crm_timer_popped; transition_timer->repeat = FALSE; } else { was_error = TRUE; } if (integration_timer != NULL) { integration_timer->source_id = 0; integration_timer->period_ms = -1; integration_timer->fsa_input = I_INTEGRATED; integration_timer->callback = crm_timer_popped; integration_timer->repeat = FALSE; } else { was_error = TRUE; } if (finalization_timer != NULL) { finalization_timer->source_id = 0; finalization_timer->period_ms = -1; finalization_timer->fsa_input = I_FINALIZED; finalization_timer->callback = crm_timer_popped; finalization_timer->repeat = FALSE; /* for possible enabling... a bug in the join protocol left * a slave in S_PENDING while we think its in S_NOT_DC * * raising I_FINALIZED put us into a transition loop which is * never resolved. * in this loop we continually send probes which the node * NACK's because its in S_PENDING * * if we have nodes where heartbeat is active but the * CRM is not... then this will be handled in the * integration phase */ finalization_timer->fsa_input = I_ELECTION; } else { was_error = TRUE; } if (shutdown_escalation_timer != NULL) { shutdown_escalation_timer->source_id = 0; shutdown_escalation_timer->period_ms = -1; shutdown_escalation_timer->fsa_input = I_STOP; shutdown_escalation_timer->callback = crm_timer_popped; shutdown_escalation_timer->repeat = FALSE; } else { was_error = TRUE; } if (wait_timer != NULL) { wait_timer->source_id = 0; wait_timer->period_ms = 2000; wait_timer->fsa_input = I_NULL; wait_timer->callback = crm_timer_popped; wait_timer->repeat = FALSE; } else { was_error = TRUE; } if (recheck_timer != NULL) { recheck_timer->source_id = 0; recheck_timer->period_ms = -1; recheck_timer->fsa_input = I_PE_CALC; recheck_timer->callback = crm_timer_popped; recheck_timer->repeat = FALSE; } else { was_error = TRUE; } /* set up the sub systems */ cib_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); te_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); pe_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); if (cib_subsystem != NULL) { cib_subsystem->pid = -1; cib_subsystem->name = CRM_SYSTEM_CIB; cib_subsystem->flag_connected = R_CIB_CONNECTED; cib_subsystem->flag_required = R_CIB_REQUIRED; } else { was_error = TRUE; } if (te_subsystem != NULL) { te_subsystem->pid = -1; te_subsystem->name = CRM_SYSTEM_TENGINE; te_subsystem->flag_connected = R_TE_CONNECTED; te_subsystem->flag_required = R_TE_REQUIRED; } else { was_error = TRUE; } if (pe_subsystem != NULL) { pe_subsystem->pid = -1; pe_subsystem->path = CRM_DAEMON_DIR; pe_subsystem->name = CRM_SYSTEM_PENGINE; pe_subsystem->command = CRM_DAEMON_DIR "/" CRM_SYSTEM_PENGINE; pe_subsystem->args = NULL; pe_subsystem->flag_connected = R_PE_CONNECTED; pe_subsystem->flag_required = R_PE_REQUIRED; } else { was_error = TRUE; } if (was_error == FALSE && is_heartbeat_cluster()) { if(start_subsystem(pe_subsystem) == FALSE) { was_error = TRUE; } } if (was_error) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } welcomed_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); integrated_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); finalized_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); confirmed_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } static int32_t crmd_ipc_accept(qb_ipcs_connection_t *c, uid_t uid, gid_t gid) { crmd_client_t *blank_client = NULL; #if ENABLE_ACL struct group *crm_grp = NULL; #endif crm_trace("Connecting %p for uid=%d gid=%d", c, uid, gid); blank_client = calloc(1, sizeof(crmd_client_t)); CRM_ASSERT(blank_client != NULL); crm_trace("Created client: %p", blank_client); blank_client->ipc = c; blank_client->sub_sys = NULL; blank_client->uuid = NULL; blank_client->table_key = NULL; #if ENABLE_ACL crm_grp = getgrnam(CRM_DAEMON_GROUP); if (crm_grp) { qb_ipcs_connection_auth_set(c, -1, crm_grp->gr_gid, S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP); } blank_client->user = uid2username(uid); #endif qb_ipcs_context_set(c, blank_client); return 0; } static void crmd_ipc_created(qb_ipcs_connection_t *c) { crm_trace("Client %p connected", c); } static int32_t crmd_ipc_dispatch(qb_ipcs_connection_t *c, void *data, size_t size) { crmd_client_t *client = qb_ipcs_context_get(c); xmlNode *msg = crm_ipcs_recv(c, data, size); xmlNode *ack = create_xml_node(NULL, "ack"); crm_trace("Invoked: %s", client->table_key); crm_ipcs_send(c, ack, FALSE); /* TODO: Do not unconditionally send this */ free_xml(ack); if (msg == NULL) { return 0; } #if ENABLE_ACL determine_request_user(client->user, msg, F_CRM_USER); #endif crm_trace("Processing msg from %s", client->table_key); crm_log_xml_trace(msg, "CRMd[inbound]"); if (crmd_authorize_message(msg, client)) { route_message(C_IPC_MESSAGE, msg); } trigger_fsa(fsa_source); free_xml(msg); return 0; } static int32_t crmd_ipc_closed(qb_ipcs_connection_t *c) { return 0; } static void crmd_ipc_destroy(qb_ipcs_connection_t *c) { crmd_client_t *client = qb_ipcs_context_get(c); if (client == NULL) { crm_trace("No client to delete"); return; } process_client_disconnect(client); crm_trace("Disconnecting client %s (%p)", client->table_key, client); free(client->table_key); free(client->sub_sys); free(client->uuid); free(client->user); free(client); trigger_fsa(fsa_source); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (is_heartbeat_cluster()) { stop_subsystem(pe_subsystem, FALSE); } mainloop_del_ipc_server(ipcs); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = crmd_ipc_accept, .connection_created = crmd_ipc_created, .msg_process = crmd_ipc_dispatch, .connection_closed = crmd_ipc_closed, .connection_destroyed = crmd_ipc_destroy }; if (cur_state != S_STARTING) { crm_err("Start cancelled... %s", fsa_state2string(cur_state)); return; } else if (is_set(fsa_input_register, R_MEMBERSHIP) == FALSE) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_LRM_CONNECTED) == FALSE) { crm_info("Delaying start, LRM not connected (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_READ_CONFIG) == FALSE) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(NULL); return; } else if (is_set(fsa_input_register, R_PEER_DATA) == FALSE) { /* try reading from HA */ crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { HA_Message *msg = NULL; crm_trace("Looking for a HA message"); msg = fsa_cluster_conn->llc_ops->readmsg(fsa_cluster_conn, 0); if (msg != NULL) { crm_trace("There was a HA message"); ha_msg_del(msg); } } #endif crmd_fsa_stall(NULL); return; } crm_debug("Init server comms"); ipcs = mainloop_add_ipc_server(CRM_SYSTEM_CRMD, QB_IPC_NATIVE, &crmd_callbacks); if (ipcs == NULL) { crm_err("Couldn't start IPC server"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } if (stonith_reconnect == NULL) { int dummy; stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); } set_bit(fsa_input_register, R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); crm_notice("The local CRM is operational"); clear_bit(fsa_input_register, R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { set_bit(fsa_input_register, R_IN_RECOVERY); crm_err("Action %s (%.16llx) not supported", fsa_action2string(action), action); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* *INDENT-OFF* */ pe_cluster_option crmd_opts[] = { /* name, old-name, validate, default, description */ { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." }, { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." }, { XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time", "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time." " To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." }, { XML_ATTR_EXPECTED_VOTES, NULL, "integer", NULL, "2", &check_number, "The number of nodes expected to be in the cluster", "Used to calculate quorum in openais based clusters." }, }; /* *INDENT-ON* */ void crmd_metadata(void) { config_metadata("CRM Daemon", "1.0", "CRM Daemon Options", "This is a fake resource that details the options that can be configured for the CRM Daemon.", crmd_opts, DIMOF(crmd_opts)); } static void verify_crmd_options(GHashTable * options) { verify_all_options(options, crmd_opts, DIMOF(crmd_opts)); } static const char * crmd_pref(GHashTable * options, const char *name) { return get_cluster_pref(options, crmd_opts, DIMOF(crmd_opts), name); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; ha_time_t *now = new_ha_date(TRUE); if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_dtd_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); set_bit(fsa_input_register, R_STAYDOWN); } goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes(output, output, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now); verify_crmd_options(config_hash); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_get_msec(value); crm_debug("Shutdown escalation occurs after: %dms", shutdown_escalation_timer->period_ms); value = crmd_pref(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); election_timeout->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_timer->period_ms = crm_get_msec(value); crm_debug("Checking for expired actions every %dms", recheck_timer->period_ms); value = crmd_pref(config_hash, "crmd-transition-delay"); transition_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-integration-timeout"); integration_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-finalization-timeout"); finalization_timer->period_ms = crm_get_msec(value); #if SUPPORT_COROSYNC if (is_classic_ais_cluster()) { value = crmd_pref(config_hash, XML_ATTR_EXPECTED_VOTES); crm_debug("Sending expected-votes=%s to corosync", value); send_ais_text(crm_class_quorum, value, TRUE, NULL, crm_msg_ais); } #endif set_bit(fsa_input_register, R_READ_CONFIG); crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); g_hash_table_destroy(config_hash); bail: free_ha_date(now); } gboolean crm_read_options(gpointer user_data) { int call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { mainloop_set_trigger(config_read); } void crm_shutdown(int nsig) { if (crmd_mainloop != NULL && g_main_is_running(crmd_mainloop)) { if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating the shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); } else { set_bit(fsa_input_register, R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); if (shutdown_escalation_timer->period_ms < 1) { const char *value = crmd_pref(NULL, XML_CONFIG_ATTR_FORCE_QUIT); int msec = crm_get_msec(value); crm_debug("Using default shutdown escalation: %dms", msec); shutdown_escalation_timer->period_ms = msec; } /* cant rely on this... */ crm_notice("Requesting shutdown, upper limit is %dms", shutdown_escalation_timer->period_ms); crm_timer_start(shutdown_escalation_timer); } } else { crm_info("exit from shutdown"); exit(EX_OK); } } void default_cib_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("CIB Update failed: %s", pcmk_strerror(rc)); crm_log_xml_warn(output, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } - -static void -create_cib_node_definition(gpointer key, gpointer value, gpointer user_data) -{ - crm_node_t *node = value; - xmlNode *cib_nodes = user_data; - xmlNode *cib_new_node = NULL; - - crm_trace("Creating node entry for %s/%s", node->uname, node->uuid); - cib_new_node = create_xml_node(cib_nodes, XML_CIB_TAG_NODE); - crm_xml_add(cib_new_node, XML_ATTR_ID, node->uuid); - crm_xml_add(cib_new_node, XML_ATTR_UNAME, node->uname); -} - -void -populate_cib_nodes(gboolean quick) -{ - int call_id = 0; - gboolean from_hashtable = TRUE; - xmlNode *cib_node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); - -#if SUPPORT_HEARTBEAT - if (quick == FALSE && is_heartbeat_cluster()) { - from_hashtable = heartbeat_initialize_nodelist(fsa_cluster_conn, FALSE, cib_node_list); - } -#endif - -#if SUPPORT_COROSYNC - if (quick == FALSE && is_corosync_cluster()) { - from_hashtable = corosync_initialize_nodelist(NULL, FALSE, cib_node_list); - } -#endif - - if(from_hashtable) { - /* if(uname_is_uuid()) { */ - /* g_hash_table_foreach(crm_peer_id_cache, create_cib_node_definition, cib_node_list); */ - /* } else { */ - g_hash_table_foreach(crm_peer_cache, create_cib_node_definition, cib_node_list); - /* } */ - } - - crm_trace("Populating section from %s", from_hashtable?"hashtable":"cluster"); - - fsa_cib_update(XML_CIB_TAG_NODES, cib_node_list, cib_scope_local | cib_quorum_override, call_id, NULL); - add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, default_cib_update_callback); - - free_xml(cib_node_list); -} diff --git a/crmd/crmd_utils.h b/crmd/crmd_utils.h index 46470067d6..00b0e5c3a9 100644 --- a/crmd/crmd_utils.h +++ b/crmd/crmd_utils.h @@ -1,94 +1,104 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRMD_UTILS__H # define CRMD_UTILS__H # include # include # include /* For CIB_OP_MODIFY */ # define CLIENT_EXIT_WAIT 30 # define FAKE_TE_ID "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" extern void process_client_disconnect(crmd_client_t * curr_client); # define fsa_cib_update(section, data, options, call_id, user_name) \ if(fsa_cib_conn != NULL) { \ call_id = cib_internal_op( \ fsa_cib_conn, CIB_OP_MODIFY, NULL, section, data, \ NULL, options, user_name); \ \ } else { \ crm_err("No CIB connection available"); \ } # define fsa_cib_anon_update(section, data, options) \ if(fsa_cib_conn != NULL) { \ fsa_cib_conn->cmds->modify( \ fsa_cib_conn, section, data, options); \ \ } else { \ crm_err("No CIB connection available"); \ } extern gboolean fsa_has_quorum; extern int last_peer_update; -extern gboolean crm_timer_stop(fsa_timer_t * timer); -extern gboolean crm_timer_start(fsa_timer_t * timer); -extern gboolean crm_timer_popped(gpointer data); -extern gboolean is_timer_started(fsa_timer_t *timer); +enum node_update_flags +{ + node_update_none = 0x0000, + node_update_quick = 0x0001, + node_update_cluster = 0x0010, + node_update_peer = 0x0020, + node_update_join = 0x0040, + node_update_expected = 0x0100, +}; -extern xmlNode *create_node_state(const char *uname, const char *in_cluster, - const char *is_peer, const char *join_state, - const char *exp_state, gboolean clear_shutdown, const char *src); +gboolean crm_timer_stop(fsa_timer_t * timer); +gboolean crm_timer_start(fsa_timer_t * timer); +gboolean crm_timer_popped(gpointer data); +gboolean is_timer_started(fsa_timer_t *timer); -extern gboolean stop_subsystem(struct crm_subsystem_s *centry, gboolean force_quit); -extern gboolean start_subsystem(struct crm_subsystem_s *centry); +xmlNode *create_node_state(const char *uname, const char *in_cluster, + const char *is_peer, const char *join_state, + const char *exp_state, gboolean clear_shutdown, const char *src); -extern void fsa_dump_actions(long long action, const char *text); -extern void fsa_dump_inputs(int log_level, const char *text, long long input_register); +gboolean stop_subsystem(struct crm_subsystem_s *centry, gboolean force_quit); +gboolean start_subsystem(struct crm_subsystem_s *centry); -extern gboolean update_dc(xmlNode * msg); -extern void erase_node_from_join(const char *node); -extern void populate_cib_nodes(gboolean quick); -extern void crm_update_quorum(gboolean quorum, gboolean force_update); -extern void erase_status_tag(const char *uname, const char *tag, int options); -extern void update_attrd(const char *host, const char *name, const char *value, +void fsa_dump_actions(long long action, const char *text); +void fsa_dump_inputs(int log_level, const char *text, long long input_register); + +gboolean update_dc(xmlNode * msg); +void erase_node_from_join(const char *node); +void populate_cib_nodes(enum node_update_flags flags, const char *source); +void crm_update_quorum(gboolean quorum, gboolean force_update); +void erase_status_tag(const char *uname, const char *tag, int options); +void update_attrd(const char *host, const char *name, const char *value, const char *user_name); -extern const char *get_timer_desc(fsa_timer_t * timer); +const char *get_timer_desc(fsa_timer_t * timer); gboolean too_many_st_failures(void); # define start_transition(state) do { \ switch(state) { \ case S_TRANSITION_ENGINE: \ register_fsa_action(A_TE_CANCEL); \ break; \ case S_POLICY_ENGINE: \ case S_IDLE: \ register_fsa_input(C_FSA_INTERNAL, I_PE_CALC, NULL); \ break; \ default: \ crm_debug("NOT starting a new transition in state %s", \ fsa_state2string(fsa_state)); \ break; \ } \ } while(0) #endif diff --git a/crmd/fsa.c b/crmd/fsa.c index fd265f684c..ab1de3359b 100644 --- a/crmd/fsa.c +++ b/crmd/fsa.c @@ -1,688 +1,687 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *fsa_our_dc = NULL; cib_t *fsa_cib_conn = NULL; char *fsa_our_dc_version = NULL; lrmd_t *fsa_lrm_conn; char *fsa_our_uuid = NULL; char *fsa_our_uname = NULL; #if SUPPORT_HEARTBEAT ll_cluster_t *fsa_cluster_conn; #endif fsa_timer_t *wait_timer = NULL; fsa_timer_t *recheck_timer = NULL; fsa_timer_t *election_trigger = NULL; fsa_timer_t *election_timeout = NULL; fsa_timer_t *transition_timer = NULL; fsa_timer_t *integration_timer = NULL; fsa_timer_t *finalization_timer = NULL; fsa_timer_t *shutdown_escalation_timer = NULL; volatile gboolean do_fsa_stall = FALSE; volatile long long fsa_input_register = 0; volatile long long fsa_actions = A_NOTHING; volatile enum crmd_fsa_state fsa_state = S_STARTING; extern uint highest_born_on; extern uint num_join_invites; extern GHashTable *welcomed_nodes; extern GHashTable *finalized_nodes; extern GHashTable *confirmed_nodes; extern GHashTable *integrated_nodes; extern void initialize_join(gboolean before); #define DOT_PREFIX "actions:trace: " #define do_dot_log(fmt, args...) crm_trace( fmt, ##args) long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data); void dump_rsc_info(void); void dump_rsc_info_callback(const xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); void ghash_print_node(gpointer key, gpointer value, gpointer user_data); void s_crmd_fsa_actions(fsa_data_t * fsa_data); void log_fsa_input(fsa_data_t * stored_msg); void init_dotfile(void); void init_dotfile(void) { do_dot_log(DOT_PREFIX "digraph \"g\" {"); do_dot_log(DOT_PREFIX " size = \"30,30\""); do_dot_log(DOT_PREFIX " graph ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " bb = \"0,0,398.922306,478.927856\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " node ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " shape = \"ellipse\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " edge ["); do_dot_log(DOT_PREFIX " fontsize = \"12\""); do_dot_log(DOT_PREFIX " fontname = \"Times-Roman\""); do_dot_log(DOT_PREFIX " fontcolor = \"black\""); do_dot_log(DOT_PREFIX " color = \"black\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// special nodes"); do_dot_log(DOT_PREFIX " \"S_PENDING\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"blue\""); do_dot_log(DOT_PREFIX " fontcolor = \"blue\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX " \"S_TERMINATE\" "); do_dot_log(DOT_PREFIX " ["); do_dot_log(DOT_PREFIX " color = \"red\""); do_dot_log(DOT_PREFIX " fontcolor = \"red\""); do_dot_log(DOT_PREFIX " ]"); do_dot_log(DOT_PREFIX "// DC only nodes"); do_dot_log(DOT_PREFIX " \"S_INTEGRATION\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_POLICY_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_TRANSITION_ENGINE\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_RELEASE_DC\" [ fontcolor = \"green\" ]"); do_dot_log(DOT_PREFIX " \"S_IDLE\" [ fontcolor = \"green\" ]"); } static void do_fsa_action(fsa_data_t * fsa_data, long long an_action, void (*function) (long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)) { fsa_actions &= ~an_action; crm_trace(DOT_PREFIX "\t// %s", fsa_action2string(an_action)); function(an_action, fsa_data->fsa_cause, fsa_state, fsa_data->fsa_input, fsa_data); } static long long startup_actions = A_STARTUP | A_CIB_START | A_LRM_CONNECT | A_CCM_CONNECT | A_HA_CONNECT | A_READCONFIG | A_STARTED | A_CL_JOIN_QUERY; enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause) { fsa_data_t *fsa_data = NULL; long long register_copy = fsa_input_register; long long new_actions = A_NOTHING; enum crmd_fsa_state last_state = fsa_state; crm_trace("FSA invoked with Cause: %s\tState: %s", fsa_cause2string(cause), fsa_state2string(fsa_state)); do_fsa_stall = FALSE; if (is_message() == FALSE && fsa_actions != A_NOTHING) { /* fake the first message so we can get into the loop */ fsa_data = calloc(1, sizeof(fsa_data_t)); fsa_data->fsa_input = I_NULL; fsa_data->fsa_cause = C_FSA_INTERNAL; fsa_data->origin = __FUNCTION__; fsa_data->data_type = fsa_dt_none; fsa_message_queue = g_list_append(fsa_message_queue, fsa_data); fsa_data = NULL; } while (is_message() && do_fsa_stall == FALSE) { crm_trace("Checking messages (%d remaining)", g_list_length(fsa_message_queue)); fsa_data = get_message(); CRM_CHECK(fsa_data != NULL, continue); log_fsa_input(fsa_data); /* add any actions back to the queue */ fsa_actions |= fsa_data->actions; /* get the next batch of actions */ new_actions = crmd_fsa_actions[fsa_data->fsa_input][fsa_state]; fsa_actions |= new_actions; if (fsa_data->fsa_input != I_NULL && fsa_data->fsa_input != I_ROUTER) { crm_debug("Processing %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); } #ifdef FSA_TRACE if (new_actions != A_NOTHING) { crm_trace("Adding FSA actions %.16llx for %s/%s", new_actions, fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state)); fsa_dump_actions(new_actions, "\tFSA scheduled"); } else if (fsa_data->fsa_input != I_NULL && new_actions == A_NOTHING) { crm_debug("No action specified for input,state (%s,%s)", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state)); } if (fsa_data->actions != A_NOTHING) { crm_trace("Adding input actions %.16llx for %s/%s", new_actions, fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state)); fsa_dump_actions(fsa_data->actions, "\tInput scheduled"); } #endif /* logging : *before* the state is changed */ if (is_set(fsa_actions, A_ERROR)) { do_fsa_action(fsa_data, A_ERROR, do_log); } if (is_set(fsa_actions, A_WARN)) { do_fsa_action(fsa_data, A_WARN, do_log); } if (is_set(fsa_actions, A_LOG)) { do_fsa_action(fsa_data, A_LOG, do_log); } /* update state variables */ last_state = fsa_state; fsa_state = crmd_fsa_state[fsa_data->fsa_input][fsa_state]; /* * Remove certain actions during shutdown */ if (fsa_state == S_STOPPING || ((fsa_input_register & R_SHUTDOWN) == R_SHUTDOWN)) { clear_bit(fsa_actions, startup_actions); } /* * Hook for change of state. * Allows actions to be added or removed when entering a state */ if (last_state != fsa_state) { fsa_actions = do_state_transition(fsa_actions, last_state, fsa_state, fsa_data); } else { do_dot_log(DOT_PREFIX "\t// FSA input: State=%s \tCause=%s" " \tInput=%s \tOrigin=%s() \tid=%d", fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_input2string(fsa_data->fsa_input), fsa_data->origin, fsa_data->id); } /* start doing things... */ s_crmd_fsa_actions(fsa_data); delete_fsa_input(fsa_data); fsa_data = NULL; } if (g_list_length(fsa_message_queue) > 0 || fsa_actions != A_NOTHING || do_fsa_stall) { crm_debug("Exiting the FSA: queue=%d, fsa_actions=0x%llx, stalled=%s", g_list_length(fsa_message_queue), fsa_actions, do_fsa_stall ? "true" : "false"); } else { crm_trace("Exiting the FSA"); } /* cleanup inputs? */ if (register_copy != fsa_input_register) { long long same = register_copy & fsa_input_register; fsa_dump_inputs(LOG_DEBUG, "Added input:", fsa_input_register ^ same); fsa_dump_inputs(LOG_DEBUG, "Removed input:", register_copy ^ same); } fsa_dump_queue(LOG_DEBUG); return fsa_state; } void s_crmd_fsa_actions(fsa_data_t * fsa_data) { /* * Process actions in order of priority but do only one * action at a time to avoid complicating the ordering. */ CRM_CHECK(fsa_data != NULL, return); while (fsa_actions != A_NOTHING && do_fsa_stall == FALSE) { /* regular action processing in order of action priority * * Make sure all actions that connect to required systems * are performed first */ if (fsa_actions & A_ERROR) { do_fsa_action(fsa_data, A_ERROR, do_log); } else if (fsa_actions & A_WARN) { do_fsa_action(fsa_data, A_WARN, do_log); } else if (fsa_actions & A_LOG) { do_fsa_action(fsa_data, A_LOG, do_log); /* get out of here NOW! before anything worse happens */ } else if (fsa_actions & A_EXIT_1) { do_fsa_action(fsa_data, A_EXIT_1, do_exit); /* sub-system restart */ } else if ((fsa_actions & O_LRM_RECONNECT) == O_LRM_RECONNECT) { do_fsa_action(fsa_data, O_LRM_RECONNECT, do_lrm_control); } else if ((fsa_actions & O_CIB_RESTART) == O_CIB_RESTART) { do_fsa_action(fsa_data, O_CIB_RESTART, do_cib_control); } else if ((fsa_actions & O_PE_RESTART) == O_PE_RESTART) { do_fsa_action(fsa_data, O_PE_RESTART, do_pe_control); } else if ((fsa_actions & O_TE_RESTART) == O_TE_RESTART) { do_fsa_action(fsa_data, O_TE_RESTART, do_te_control); /* essential start tasks */ } else if (fsa_actions & A_STARTUP) { do_fsa_action(fsa_data, A_STARTUP, do_startup); } else if (fsa_actions & A_CIB_START) { do_fsa_action(fsa_data, A_CIB_START, do_cib_control); } else if (fsa_actions & A_HA_CONNECT) { do_fsa_action(fsa_data, A_HA_CONNECT, do_ha_control); } else if (fsa_actions & A_READCONFIG) { do_fsa_action(fsa_data, A_READCONFIG, do_read_config); /* sub-system start/connect */ } else if (fsa_actions & A_LRM_CONNECT) { do_fsa_action(fsa_data, A_LRM_CONNECT, do_lrm_control); } else if (fsa_actions & A_CCM_CONNECT) { #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { do_fsa_action(fsa_data, A_CCM_CONNECT, do_ccm_control); } #endif fsa_actions &= ~A_CCM_CONNECT; } else if (fsa_actions & A_TE_START) { do_fsa_action(fsa_data, A_TE_START, do_te_control); } else if (fsa_actions & A_PE_START) { do_fsa_action(fsa_data, A_PE_START, do_pe_control); /* Timers */ /* else if(fsa_actions & O_DC_TIMER_RESTART) { do_fsa_action(fsa_data, O_DC_TIMER_RESTART, do_timer_control) */ ; } else if (fsa_actions & A_DC_TIMER_STOP) { do_fsa_action(fsa_data, A_DC_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_STOP) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_INTEGRATE_TIMER_START) { do_fsa_action(fsa_data, A_INTEGRATE_TIMER_START, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_STOP) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_STOP, do_timer_control); } else if (fsa_actions & A_FINALIZE_TIMER_START) { do_fsa_action(fsa_data, A_FINALIZE_TIMER_START, do_timer_control); /* * Highest priority actions */ } else if (fsa_actions & A_MSG_ROUTE) { do_fsa_action(fsa_data, A_MSG_ROUTE, do_msg_route); } else if (fsa_actions & A_RECOVER) { do_fsa_action(fsa_data, A_RECOVER, do_recover); } else if (fsa_actions & A_CL_JOIN_RESULT) { do_fsa_action(fsa_data, A_CL_JOIN_RESULT, do_cl_join_finalize_respond); } else if (fsa_actions & A_CL_JOIN_REQUEST) { do_fsa_action(fsa_data, A_CL_JOIN_REQUEST, do_cl_join_offer_respond); } else if (fsa_actions & A_SHUTDOWN_REQ) { do_fsa_action(fsa_data, A_SHUTDOWN_REQ, do_shutdown_req); } else if (fsa_actions & A_ELECTION_VOTE) { do_fsa_action(fsa_data, A_ELECTION_VOTE, do_election_vote); } else if (fsa_actions & A_ELECTION_COUNT) { do_fsa_action(fsa_data, A_ELECTION_COUNT, do_election_count_vote); } else if (fsa_actions & A_LRM_EVENT) { do_fsa_action(fsa_data, A_LRM_EVENT, do_lrm_event); /* * High priority actions */ } else if (fsa_actions & A_STARTED) { do_fsa_action(fsa_data, A_STARTED, do_started); } else if (fsa_actions & A_CL_JOIN_QUERY) { do_fsa_action(fsa_data, A_CL_JOIN_QUERY, do_cl_join_query); } else if (fsa_actions & A_DC_TIMER_START) { do_fsa_action(fsa_data, A_DC_TIMER_START, do_timer_control); /* * Medium priority actions */ } else if (fsa_actions & A_DC_TAKEOVER) { do_fsa_action(fsa_data, A_DC_TAKEOVER, do_dc_takeover); } else if (fsa_actions & A_DC_RELEASE) { do_fsa_action(fsa_data, A_DC_RELEASE, do_dc_release); } else if (fsa_actions & A_DC_JOIN_FINAL) { do_fsa_action(fsa_data, A_DC_JOIN_FINAL, do_dc_join_final); } else if (fsa_actions & A_ELECTION_CHECK) { do_fsa_action(fsa_data, A_ELECTION_CHECK, do_election_check); } else if (fsa_actions & A_ELECTION_START) { do_fsa_action(fsa_data, A_ELECTION_START, do_election_vote); } else if (fsa_actions & A_TE_HALT) { do_fsa_action(fsa_data, A_TE_HALT, do_te_invoke); } else if (fsa_actions & A_TE_CANCEL) { do_fsa_action(fsa_data, A_TE_CANCEL, do_te_invoke); } else if (fsa_actions & A_DC_JOIN_OFFER_ALL) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ALL, do_dc_join_offer_all); } else if (fsa_actions & A_DC_JOIN_OFFER_ONE) { do_fsa_action(fsa_data, A_DC_JOIN_OFFER_ONE, do_dc_join_offer_all); } else if (fsa_actions & A_DC_JOIN_PROCESS_REQ) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_REQ, do_dc_join_filter_offer); } else if (fsa_actions & A_DC_JOIN_PROCESS_ACK) { do_fsa_action(fsa_data, A_DC_JOIN_PROCESS_ACK, do_dc_join_ack); /* * Low(er) priority actions * Make sure the CIB is always updated before invoking the * PE, and the PE before the TE */ } else if (fsa_actions & A_DC_JOIN_FINALIZE) { do_fsa_action(fsa_data, A_DC_JOIN_FINALIZE, do_dc_join_finalize); } else if (fsa_actions & A_LRM_INVOKE) { do_fsa_action(fsa_data, A_LRM_INVOKE, do_lrm_invoke); } else if (fsa_actions & A_PE_INVOKE) { do_fsa_action(fsa_data, A_PE_INVOKE, do_pe_invoke); } else if (fsa_actions & A_TE_INVOKE) { do_fsa_action(fsa_data, A_TE_INVOKE, do_te_invoke); } else if (fsa_actions & A_CL_JOIN_ANNOUNCE) { do_fsa_action(fsa_data, A_CL_JOIN_ANNOUNCE, do_cl_join_announce); /* sub-system stop */ } else if (fsa_actions & A_DC_RELEASED) { do_fsa_action(fsa_data, A_DC_RELEASED, do_dc_release); } else if (fsa_actions & A_PE_STOP) { do_fsa_action(fsa_data, A_PE_STOP, do_pe_control); } else if (fsa_actions & A_TE_STOP) { do_fsa_action(fsa_data, A_TE_STOP, do_te_control); } else if (fsa_actions & A_SHUTDOWN) { do_fsa_action(fsa_data, A_SHUTDOWN, do_shutdown); } else if (fsa_actions & A_LRM_DISCONNECT) { do_fsa_action(fsa_data, A_LRM_DISCONNECT, do_lrm_control); } else if (fsa_actions & A_CCM_DISCONNECT) { #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { do_fsa_action(fsa_data, A_CCM_DISCONNECT, do_ccm_control); } #endif fsa_actions &= ~A_CCM_DISCONNECT; } else if (fsa_actions & A_HA_DISCONNECT) { do_fsa_action(fsa_data, A_HA_DISCONNECT, do_ha_control); } else if (fsa_actions & A_CIB_STOP) { do_fsa_action(fsa_data, A_CIB_STOP, do_cib_control); } else if (fsa_actions & A_STOP) { do_fsa_action(fsa_data, A_STOP, do_stop); /* exit gracefully */ } else if (fsa_actions & A_EXIT_0) { do_fsa_action(fsa_data, A_EXIT_0, do_exit); /* Error checking and reporting */ } else { crm_err("Action %s (0x%llx) not supported ", fsa_action2string(fsa_actions), fsa_actions); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, fsa_data, NULL, __FUNCTION__); } } } void log_fsa_input(fsa_data_t * stored_msg) { crm_trace("Processing queued input %d", stored_msg->id); if (stored_msg->fsa_cause == C_CCM_CALLBACK) { crm_trace("FSA processing CCM callback from %s", stored_msg->origin); } else if (stored_msg->fsa_cause == C_LRM_OP_CALLBACK) { crm_trace("FSA processing LRM callback from %s", stored_msg->origin); } else if (stored_msg->data == NULL) { crm_trace("FSA processing input from %s", stored_msg->origin); } else { ha_msg_input_t *ha_input = fsa_typed_data_adv(stored_msg, fsa_dt_ha_msg, __FUNCTION__); crm_trace("FSA processing XML message from %s", stored_msg->origin); crm_log_xml_trace(ha_input->xml, "FSA message data"); } } long long do_state_transition(long long actions, enum crmd_fsa_state cur_state, enum crmd_fsa_state next_state, fsa_data_t * msg_data) { int level = LOG_INFO; long long tmp = actions; gboolean clear_recovery_bit = TRUE; enum crmd_fsa_cause cause = msg_data->fsa_cause; enum crmd_fsa_input current_input = msg_data->fsa_input; const char *state_from = fsa_state2string(cur_state); const char *state_to = fsa_state2string(next_state); const char *input = fsa_input2string(current_input); CRM_LOG_ASSERT(cur_state != next_state); do_dot_log(DOT_PREFIX "\t%s -> %s [ label=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); if(cur_state == S_IDLE || next_state == S_IDLE) { level = LOG_NOTICE; } else if(cur_state == S_NOT_DC || next_state == S_NOT_DC) { level = LOG_NOTICE; } else if(cur_state == S_ELECTION) { level = LOG_NOTICE; } else if(next_state == S_RECOVERY) { level = LOG_WARNING; } do_crm_log(level, "State transition %s -> %s [ input=%s cause=%s origin=%s ]", state_from, state_to, input, fsa_cause2string(cause), msg_data->origin); /* the last two clauses might cause trouble later */ if (election_timeout != NULL && next_state != S_ELECTION && cur_state != S_RELEASE_DC) { crm_timer_stop(election_timeout); /* } else { */ /* crm_timer_start(election_timeout); */ } #if 0 if ((fsa_input_register & R_SHUTDOWN)) { set_bit(tmp, A_DC_TIMER_STOP); } #endif if (next_state == S_INTEGRATION) { set_bit(tmp, A_INTEGRATE_TIMER_START); } else { set_bit(tmp, A_INTEGRATE_TIMER_STOP); } if (next_state == S_FINALIZE_JOIN) { set_bit(tmp, A_FINALIZE_TIMER_START); } else { set_bit(tmp, A_FINALIZE_TIMER_STOP); } if (next_state != S_PENDING) { set_bit(tmp, A_DC_TIMER_STOP); } if (next_state != S_ELECTION) { highest_born_on = 0; } if (next_state != S_IDLE) { crm_timer_stop(recheck_timer); } if (cur_state == S_FINALIZE_JOIN && next_state == S_POLICY_ENGINE) { - populate_cib_nodes(FALSE); - do_update_cib_nodes(TRUE, __FUNCTION__); + populate_cib_nodes(node_update_quick|node_update_cluster|node_update_peer|node_update_join, __FUNCTION__); } switch (next_state) { case S_PENDING: fsa_cib_conn->cmds->set_slave(fsa_cib_conn, cib_scope_local); /* fall through */ case S_ELECTION: crm_trace("Resetting our DC to NULL on transition to %s", fsa_state2string(next_state)); update_dc(NULL); break; case S_NOT_DC: election_trigger->counter = 0; if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we have a new DC"); set_bit(tmp, A_SHUTDOWN_REQ); } CRM_LOG_ASSERT(fsa_our_dc != NULL); if (fsa_our_dc == NULL) { crm_err("Reached S_NOT_DC without a DC" " being recorded"); } break; case S_RECOVERY: clear_recovery_bit = FALSE; break; case S_FINALIZE_JOIN: CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_warn("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (g_hash_table_size(welcomed_nodes) > 0) { char *msg = strdup(" Welcome reply not received from"); crm_warn("%u cluster nodes failed to respond" " to the join offer.", g_hash_table_size(welcomed_nodes)); g_hash_table_foreach(welcomed_nodes, ghash_print_node, msg); free(msg); } else { crm_debug("All %d cluster nodes " "responded to the join offer.", g_hash_table_size(integrated_nodes)); } break; case S_POLICY_ENGINE: election_trigger->counter = 0; CRM_LOG_ASSERT(AM_I_DC); if (cause == C_TIMER_POPPED) { crm_info("Progressed to state %s after %s", fsa_state2string(next_state), fsa_cause2string(cause)); } if (g_hash_table_size(finalized_nodes) > 0) { char *msg = strdup(" Confirm not received from"); crm_err("%u cluster nodes failed to confirm" " their join.", g_hash_table_size(finalized_nodes)); g_hash_table_foreach(finalized_nodes, ghash_print_node, msg); free(msg); } else if (g_hash_table_size(confirmed_nodes) == crm_active_peers()) { crm_debug("All %u cluster nodes are" " eligible to run resources.", crm_active_peers()); } else if (g_hash_table_size(confirmed_nodes) > crm_active_peers()) { crm_err("We have more confirmed nodes than our membership does: %d vs. %d", g_hash_table_size(confirmed_nodes), crm_active_peers()); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } else if (saved_ccm_membership_id != crm_peer_seq) { crm_info("Membership changed: %llu -> %llu - join restart", saved_ccm_membership_id, crm_peer_seq); register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } else { crm_warn("Only %u of %u cluster " "nodes are eligible to run resources - continue %d", g_hash_table_size(confirmed_nodes), crm_active_peers(), g_hash_table_size(welcomed_nodes)); } /* initialize_join(FALSE); */ break; case S_STOPPING: case S_TERMINATE: /* possibly redundant */ set_bit(fsa_input_register, R_SHUTDOWN); break; case S_IDLE: CRM_LOG_ASSERT(AM_I_DC); dump_rsc_info(); if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("(Re)Issuing shutdown request now" " that we are the DC"); set_bit(tmp, A_SHUTDOWN_REQ); } if (recheck_timer->period_ms > 0) { crm_debug("Starting %s", get_timer_desc(recheck_timer)); crm_timer_start(recheck_timer); } break; default: break; } if (clear_recovery_bit && next_state != S_PENDING) { tmp &= ~A_RECOVER; } else if (clear_recovery_bit == FALSE) { tmp |= A_RECOVER; } if (tmp != actions) { /* fsa_dump_actions(actions ^ tmp, "New actions"); */ actions = tmp; } return actions; } void dump_rsc_info(void) { } void ghash_print_node(gpointer key, gpointer value, gpointer user_data) { const char *text = user_data; const char *uname = key; const char *value_s = value; crm_info("%s: %s %s", text, uname, value_s); } diff --git a/crmd/join_dc.c b/crmd/join_dc.c index e5ddbf1cba..e16a21b5ed 100644 --- a/crmd/join_dc.c +++ b/crmd/join_dc.c @@ -1,665 +1,665 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include GHashTable *welcomed_nodes = NULL; GHashTable *integrated_nodes = NULL; GHashTable *finalized_nodes = NULL; GHashTable *confirmed_nodes = NULL; char *max_epoch = NULL; char *max_generation_from = NULL; xmlNode *max_generation_xml = NULL; void initialize_join(gboolean before); gboolean finalize_join_for(gpointer key, gpointer value, gpointer user_data); void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static int current_join_id = 0; unsigned long long saved_ccm_membership_id = 0; void initialize_join(gboolean before) { /* clear out/reset a bunch of stuff */ crm_debug("join-%d: Initializing join data (flag=%s)", current_join_id, before ? "true" : "false"); g_hash_table_destroy(welcomed_nodes); g_hash_table_destroy(integrated_nodes); g_hash_table_destroy(finalized_nodes); g_hash_table_destroy(confirmed_nodes); if (before) { if (max_generation_from != NULL) { free(max_generation_from); max_generation_from = NULL; } if (max_generation_xml != NULL) { free_xml(max_generation_xml); max_generation_xml = NULL; } clear_bit(fsa_input_register, R_HAVE_CIB); clear_bit(fsa_input_register, R_CIB_ASKED); } welcomed_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); integrated_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); finalized_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); confirmed_nodes = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } void erase_node_from_join(const char *uname) { gboolean w = FALSE, i = FALSE, f = FALSE, c = FALSE; if (uname == NULL) { return; } if (welcomed_nodes != NULL) { w = g_hash_table_remove(welcomed_nodes, uname); } if (integrated_nodes != NULL) { i = g_hash_table_remove(integrated_nodes, uname); } if (finalized_nodes != NULL) { f = g_hash_table_remove(finalized_nodes, uname); } if (confirmed_nodes != NULL) { c = g_hash_table_remove(confirmed_nodes, uname); } if (w || i || f || c) { crm_debug("Removed node %s from join calculations:" " welcomed=%d itegrated=%d finalized=%d confirmed=%d", uname, w, i, f, c); } } static void join_make_offer(gpointer key, gpointer value, gpointer user_data) { const char *join_to = NULL; const crm_node_t *member = value; CRM_ASSERT(member != NULL); if (crm_is_peer_active(member) == FALSE) { crm_trace("Not making an offer to %s: not active", member->uname); return; } join_to = member->uname; if (join_to == NULL) { crm_err("No recipient for welcome message"); return; } erase_node_from_join(join_to); if (saved_ccm_membership_id != crm_peer_seq) { saved_ccm_membership_id = crm_peer_seq; crm_info("Making join offers based on membership %llu", crm_peer_seq); } if (crm_is_peer_active(member)) { xmlNode *offer = create_request(CRM_OP_JOIN_OFFER, NULL, join_to, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL); char *join_offered = crm_itoa(current_join_id); crm_xml_add_int(offer, F_CRM_JOIN_ID, current_join_id); /* send the welcome */ crm_debug("join-%d: Sending offer to %s", current_join_id, join_to); send_cluster_message(join_to, crm_msg_crmd, offer, TRUE); free_xml(offer); g_hash_table_insert(welcomed_nodes, strdup(join_to), join_offered); } else { crm_info("Peer process on %s is not active (yet?): %.8lx %d", join_to, (long)member->processes, g_hash_table_size(crm_peer_cache)); } } /* A_DC_JOIN_OFFER_ALL */ void do_dc_join_offer_all(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* reset everyones status back to down or in_ccm in the CIB * * any nodes that are active in the CIB but not in the CCM list * will be seen as offline by the PE anyway */ current_join_id++; initialize_join(TRUE); /* do_update_cib_nodes(TRUE, __FUNCTION__); */ update_dc(NULL); if (cause == C_HA_MESSAGE && current_input == I_NODE_JOIN) { crm_info("A new node joined the cluster"); } g_hash_table_foreach(crm_peer_cache, join_make_offer, NULL); /* dont waste time by invoking the PE yet; */ crm_info("join-%d: Waiting on %d outstanding join acks", current_join_id, g_hash_table_size(welcomed_nodes)); } /* A_DC_JOIN_OFFER_ONE */ void do_dc_join_offer_one(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_node_t *member; ha_msg_input_t *welcome = NULL; const char *op = NULL; const char *join_to = NULL; if (msg_data->data) { welcome = fsa_typed_data(fsa_dt_ha_msg); } else { crm_info("A new node joined - wait until it contacts us"); return; } if (welcome == NULL) { crm_err("Attempt to send welcome message without a message to reply to!"); return; } join_to = crm_element_value(welcome->msg, F_CRM_HOST_FROM); if (join_to == NULL) { crm_err("Attempt to send welcome message without a host to reply to!"); return; } member = crm_get_peer(0, join_to); if (crm_is_peer_active(member) == FALSE) { crm_err("%s is not a fully active member of our partition", join_to); return; } op = crm_element_value(welcome->msg, F_CRM_TASK); if (join_to != NULL && (cur_state == S_INTEGRATION || cur_state == S_FINALIZE_JOIN)) { /* note: it _is_ possible that a node will have been * sick or starting up when the original offer was made. * however, it will either re-announce itself in due course * _or_ we can re-store the original offer on the client. */ crm_trace("(Re-)offering membership to %s...", join_to); } crm_info("join-%d: Processing %s request from %s in state %s", current_join_id, op, join_to, fsa_state2string(cur_state)); join_make_offer(NULL, member, NULL); /* always offer to the DC (ourselves) * this ensures the correct value for max_generation_from */ member = crm_get_peer(0, fsa_our_uname); join_make_offer(NULL, member, NULL); /* this was a genuine join request, cancel any existing * transition and invoke the PE */ start_transition(fsa_state); /* dont waste time by invoking the pe yet; */ crm_debug("Waiting on %d outstanding join acks for join-%d", g_hash_table_size(welcomed_nodes), current_join_id); } static int compare_int_fields(xmlNode * left, xmlNode * right, const char *field) { const char *elem_l = crm_element_value(left, field); const char *elem_r = crm_element_value(right, field); int int_elem_l = crm_int_helper(elem_l, NULL); int int_elem_r = crm_int_helper(elem_r, NULL); if (int_elem_l < int_elem_r) { return -1; } else if (int_elem_l > int_elem_r) { return 1; } return 0; } /* A_DC_JOIN_PROCESS_REQ */ void do_dc_join_filter_offer(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *generation = NULL; int cmp = 0; int join_id = -1; gboolean ack_nack_bool = TRUE; const char *ack_nack = CRMD_JOINSTATE_MEMBER; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM); const char *ref = crm_element_value(join_ack->msg, XML_ATTR_REFERENCE); crm_node_t *join_node = crm_get_peer(0, join_from); crm_debug("Processing req from %s", join_from); generation = join_ack->xml; crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); if (max_generation_xml != NULL && generation != NULL) { int lpc = 0; const char *attributes[] = { XML_ATTR_GENERATION_ADMIN, XML_ATTR_GENERATION, XML_ATTR_NUMUPDATES, }; for (lpc = 0; cmp == 0 && lpc < DIMOF(attributes); lpc++) { cmp = compare_int_fields(max_generation_xml, generation, attributes[lpc]); } } if (join_id != current_join_id) { crm_debug("Invalid response from %s: join-%d vs. join-%d", join_from, join_id, current_join_id); check_join_state(cur_state, __FUNCTION__); return; } else if (join_node == NULL || crm_is_peer_active(join_node) == FALSE) { crm_err("Node %s is not a member", join_from); ack_nack_bool = FALSE; } else if (generation == NULL) { crm_err("Generation was NULL"); ack_nack_bool = FALSE; } else if (max_generation_xml == NULL) { max_generation_xml = copy_xml(generation); max_generation_from = strdup(join_from); } else if (cmp < 0 || (cmp == 0 && safe_str_eq(join_from, fsa_our_uname))) { crm_debug("%s has a better generation number than" " the current max %s", join_from, max_generation_from); if (max_generation_xml) { crm_log_xml_debug(max_generation_xml, "Max generation"); } crm_log_xml_debug(generation, "Their generation"); free(max_generation_from); free_xml(max_generation_xml); max_generation_from = strdup(join_from); max_generation_xml = copy_xml(join_ack->xml); } if (ack_nack_bool == FALSE) { /* NACK this client */ ack_nack = CRMD_JOINSTATE_NACK; crm_err("join-%d: NACK'ing node %s (ref %s)", join_id, join_from, ref); } else { crm_debug("join-%d: Welcoming node %s (ref %s)", join_id, join_from, ref); } - /* add them to our list of CRMD_STATE_ACTIVE nodes */ + /* add them to our list of CRMD_JOINSTATE_MEMBER nodes */ g_hash_table_insert(integrated_nodes, strdup(join_from), strdup(ack_nack)); crm_debug("%u nodes have been integrated into join-%d", g_hash_table_size(integrated_nodes), join_id); g_hash_table_remove(welcomed_nodes, join_from); if (check_join_state(cur_state, __FUNCTION__) == FALSE) { /* dont waste time by invoking the PE yet; */ crm_debug("join-%d: Still waiting on %d outstanding offers", join_id, g_hash_table_size(welcomed_nodes)); } } /* A_DC_JOIN_FINALIZE */ void do_dc_join_finalize(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { char *sync_from = NULL; int rc = pcmk_ok; /* This we can do straight away and avoid clients timing us out * while we compute the latest CIB */ crm_debug("Finializing join-%d for %d clients", current_join_id, g_hash_table_size(integrated_nodes)); if (g_hash_table_size(integrated_nodes) == 0) { /* If we don't even have ourself, start again */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __FUNCTION__); return; } clear_bit(fsa_input_register, R_HAVE_CIB); if (max_generation_from == NULL || safe_str_eq(max_generation_from, fsa_our_uname)) { set_bit(fsa_input_register, R_HAVE_CIB); } if (is_set(fsa_input_register, R_IN_TRANSITION)) { crm_warn("join-%d: We are still in a transition." " Delaying until the TE completes.", current_join_id); crmd_fsa_stall(NULL); return; } if (is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { /* ask for the agreed best CIB */ sync_from = strdup(max_generation_from); crm_log_xml_debug(max_generation_xml, "Requesting version"); set_bit(fsa_input_register, R_CIB_ASKED); } else { /* Send _our_ CIB out to everyone */ sync_from = strdup(fsa_our_uname); } crm_info("join-%d: Syncing the CIB from %s to the rest of the cluster", current_join_id, sync_from); rc = fsa_cib_conn->cmds->sync_from(fsa_cib_conn, sync_from, NULL, cib_quorum_override); fsa_cib_conn->cmds->register_callback(fsa_cib_conn, rc, 60, FALSE, sync_from, "finalize_sync_callback", finalize_sync_callback); } void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { CRM_LOG_ASSERT(-EPERM != rc); clear_bit(fsa_input_register, R_CIB_ASKED); if (rc != pcmk_ok) { do_crm_log((rc == -pcmk_err_old_data ? LOG_WARNING : LOG_ERR), "Sync from %s failed: %s", (char *)user_data, pcmk_strerror(rc)); /* restart the whole join process */ register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL, __FUNCTION__); } else if (AM_I_DC && fsa_state == S_FINALIZE_JOIN) { set_bit(fsa_input_register, R_HAVE_CIB); clear_bit(fsa_input_register, R_CIB_ASKED); /* make sure dc_uuid is re-set to us */ if (check_join_state(fsa_state, __FUNCTION__) == FALSE) { crm_debug("Notifying %d clients of join-%d results", g_hash_table_size(integrated_nodes), current_join_id); g_hash_table_foreach_remove(integrated_nodes, finalize_join_for, NULL); } } else { crm_debug("No longer the DC in S_FINALIZE_JOIN: %s/%s", AM_I_DC ? "DC" : "CRMd", fsa_state2string(fsa_state)); } free(user_data); } static void join_update_complete_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_debug("Join update %d complete", call_id); check_join_state(fsa_state, __FUNCTION__); } else { crm_err("Join update %d failed", call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } /* A_DC_JOIN_PROCESS_ACK */ void do_dc_join_ack(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int join_id = -1; int call_id = 0; ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg); const char *join_id_s = NULL; const char *join_state = NULL; const char *op = crm_element_value(join_ack->msg, F_CRM_TASK); const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM); if (safe_str_neq(op, CRM_OP_JOIN_CONFIRM)) { crm_debug("Ignoring op=%s message from %s", op, join_from); return; } crm_element_value_int(join_ack->msg, F_CRM_JOIN_ID, &join_id); join_id_s = crm_element_value(join_ack->msg, F_CRM_JOIN_ID); /* now update them to "member" */ crm_trace("Processing ack from %s", join_from); join_state = (const char *) g_hash_table_lookup(finalized_nodes, join_from); if (join_state == NULL) { crm_err("Join not in progress: ignoring join-%d from %s", join_id, join_from); return; } else if (safe_str_neq(join_state, CRMD_JOINSTATE_MEMBER)) { crm_err("Node %s wasnt invited to join the cluster", join_from); g_hash_table_remove(finalized_nodes, join_from); return; } else if (join_id != current_join_id) { crm_err("Invalid response from %s: join-%d vs. join-%d", join_from, join_id, current_join_id); g_hash_table_remove(finalized_nodes, join_from); return; } g_hash_table_remove(finalized_nodes, join_from); if (g_hash_table_lookup(confirmed_nodes, join_from) != NULL) { crm_err("join-%d: hash already contains confirmation from %s", join_id, join_from); } g_hash_table_insert(confirmed_nodes, strdup(join_from), strdup(join_id_s)); crm_info("join-%d: Updating node state to %s for %s", join_id, CRMD_JOINSTATE_MEMBER, join_from); /* update CIB with the current LRM status from the node * We dont need to notify the TE of these updates, a transition will * be started in due time */ erase_status_tag(join_from, XML_CIB_TAG_LRM, cib_scope_local); fsa_cib_update(XML_CIB_TAG_STATUS, join_ack->xml, cib_scope_local | cib_quorum_override | cib_can_create, call_id, NULL); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, join_update_complete_callback); crm_debug("join-%d: Registered callback for LRM update %d", join_id, call_id); } gboolean finalize_join_for(gpointer key, gpointer value, gpointer user_data) { const char *join_to = NULL; const char *join_state = NULL; xmlNode *acknak = NULL; xmlNode *tmp1 = NULL; crm_node_t *join_node = NULL; if (key == NULL || value == NULL) { return TRUE; } join_to = (const char *)key; join_state = (const char *)value; /* make sure a node entry exists for the new node */ crm_trace("Creating node entry for %s", join_to); tmp1 = create_xml_node(NULL, XML_CIB_TAG_NODE); set_uuid(tmp1, XML_ATTR_UUID, join_to); crm_xml_add(tmp1, XML_ATTR_UNAME, join_to); fsa_cib_anon_update(XML_CIB_TAG_NODES, tmp1, cib_scope_local | cib_quorum_override | cib_can_create); free_xml(tmp1); join_node = crm_get_peer(0, join_to); if (crm_is_peer_active(join_node) == FALSE) { /* * NACK'ing nodes that the membership layer doesn't know about yet * simply creates more churn * * Better to leave them waiting and let the join restart when * the new membership event comes in * * All other NACKs (due to versions etc) should still be processed */ return TRUE; } /* send the ack/nack to the node */ acknak = create_request(CRM_OP_JOIN_ACKNAK, NULL, join_to, CRM_SYSTEM_CRMD, CRM_SYSTEM_DC, NULL); crm_xml_add_int(acknak, F_CRM_JOIN_ID, current_join_id); /* set the ack/nack */ if (safe_str_eq(join_state, CRMD_JOINSTATE_MEMBER)) { crm_debug("join-%d: ACK'ing join request from %s, state %s", current_join_id, join_to, join_state); crm_xml_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_TRUE); g_hash_table_insert(finalized_nodes, strdup(join_to), strdup(CRMD_JOINSTATE_MEMBER)); } else { crm_warn("join-%d: NACK'ing join request from %s, state %s", current_join_id, join_to, join_state); crm_xml_add(acknak, CRM_OP_JOIN_ACKNAK, XML_BOOLEAN_FALSE); } send_cluster_message(join_to, crm_msg_crmd, acknak, TRUE); free_xml(acknak); return TRUE; } void ghash_print_node(gpointer key, gpointer value, gpointer user_data); gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source) { crm_debug("Invoked by %s in state: %s", source, fsa_state2string(cur_state)); if (saved_ccm_membership_id != crm_peer_seq) { crm_debug("%s: Membership changed since join started: %llu -> %llu", source, saved_ccm_membership_id, crm_peer_seq); register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL); } else if (cur_state == S_INTEGRATION) { if (g_hash_table_size(welcomed_nodes) == 0) { crm_debug("join-%d: Integration of %d peers complete: %s", current_join_id, g_hash_table_size(integrated_nodes), source); register_fsa_input_before(C_FSA_INTERNAL, I_INTEGRATED, NULL); return TRUE; } } else if (cur_state == S_FINALIZE_JOIN) { if (is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { crm_debug("join-%d: Delaying I_FINALIZED until we have the CIB", current_join_id); return TRUE; } else if (g_hash_table_size(integrated_nodes) == 0 && g_hash_table_size(finalized_nodes) == 0) { crm_debug("join-%d complete: %s", current_join_id, source); register_fsa_input_later(C_FSA_INTERNAL, I_FINALIZED, NULL); } else if (g_hash_table_size(integrated_nodes) != 0 && g_hash_table_size(finalized_nodes) != 0) { char *msg = NULL; crm_err("join-%d: Waiting on %d integrated nodes" " AND %d finalized nodes", current_join_id, g_hash_table_size(integrated_nodes), g_hash_table_size(finalized_nodes)); msg = strdup("Integrated node"); g_hash_table_foreach(integrated_nodes, ghash_print_node, msg); free(msg); msg = strdup("Finalized node"); g_hash_table_foreach(finalized_nodes, ghash_print_node, msg); free(msg); } else if (g_hash_table_size(integrated_nodes) != 0) { crm_debug("join-%d: Still waiting on %d integrated nodes", current_join_id, g_hash_table_size(integrated_nodes)); } else if (g_hash_table_size(finalized_nodes) != 0) { crm_debug("join-%d: Still waiting on %d finalized nodes", current_join_id, g_hash_table_size(finalized_nodes)); } } return FALSE; } void do_dc_join_final(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { crm_debug("Ensuring DC, quorum and node attributes are up-to-date"); update_attrd(NULL, NULL, NULL, NULL); crm_update_quorum(crm_have_quorum, TRUE); } diff --git a/crmd/lrm.c b/crmd/lrm.c index 78523f28d4..3fd0e97eb9 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -1,2002 +1,1995 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #define START_DELAY_THRESHOLD 5 * 60 * 1000 typedef struct resource_history_s { char *id; lrmd_rsc_info_t rsc; lrmd_event_data_t *last; lrmd_event_data_t *failed; GList *recurring_op_list; /* Resources must be stopped using the same * parameters they were started with. This hashtable * holds the parameters that should be used for the next stop * cmd on this resource. */ GHashTable *stop_params; } rsc_history_t; struct recurring_op_s { char *rsc_id; char *op_type; char *op_key; int call_id; int interval; int last_rc; int last_op_status; gboolean remove; gboolean cancelled; }; struct pending_deletion_op_s { char *rsc; ha_msg_input_t *input; }; struct delete_event_s { int rc; const char *rsc; }; GHashTable *resource_history = NULL; GHashTable *pending_ops = NULL; GHashTable *deletion_ops = NULL; int num_lrm_register_fails = 0; int max_lrm_register_fails = 30; gboolean process_lrm_event(lrmd_event_data_t * op); gboolean is_rsc_active(const char *rsc_id); gboolean build_active_RAs(xmlNode * rsc_list); static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data); static int delete_rsc_status(const char *rsc_id, int call_options, const char *user_name); static lrmd_event_data_t *construct_op(xmlNode * rsc_op, const char *rsc_id, const char *operation); void do_lrm_rsc_op(lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg, xmlNode * request); void send_direct_ack(const char *to_host, const char *to_sys, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id); static void lrm_connection_destroy(void) { if (is_set(fsa_input_register, R_LRM_CONNECTED)) { crm_crit("LRM Connection failed"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); clear_bit(fsa_input_register, R_LRM_CONNECTED); } else { crm_info("LRM Connection disconnected"); } } static void free_deletion_op(gpointer value) { struct pending_deletion_op_s *op = value; free(op->rsc); delete_ha_msg_input(op->input); free(op); } static void free_recurring_op(gpointer value) { struct recurring_op_s *op = (struct recurring_op_s *)value; free(op->rsc_id); free(op->op_type); free(op->op_key); free(op); } static char * make_stop_id(const char *rsc, int call_id) { char *op_id = NULL; op_id = calloc(1, strlen(rsc) + 34); if (op_id != NULL) { snprintf(op_id, strlen(rsc) + 34, "%s:%d", rsc, call_id); } return op_id; } static void copy_instance_keys(gpointer key, gpointer value, gpointer user_data) { if (strstr(key, CRM_META "_") == NULL) { g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value)); } } static void copy_meta_keys(gpointer key, gpointer value, gpointer user_data) { if (strstr(key, CRM_META "_") != NULL) { g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value)); } } static void history_cache_destroy(gpointer data) { rsc_history_t *entry = data; if (entry->stop_params) { g_hash_table_destroy(entry->stop_params); } free(entry->rsc.type); free(entry->rsc.class); free(entry->rsc.provider); lrmd_free_event(entry->failed); lrmd_free_event(entry->last); free(entry->id); free(entry); } static void update_history_cache(lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) { int target_rc = 0; rsc_history_t *entry = NULL; if (op->rsc_deleted) { crm_debug("Purged history for '%s' after %s", op->rsc_id, op->op_type); delete_rsc_status(op->rsc_id, cib_quorum_override, NULL); return; } if (safe_str_eq(op->op_type, RSC_NOTIFY)) { return; } crm_debug("Updating history for '%s' with %s op", op->rsc_id, op->op_type); entry = g_hash_table_lookup(resource_history, op->rsc_id); if (entry == NULL && rsc) { entry = calloc(1, sizeof(rsc_history_t)); entry->id = strdup(op->rsc_id); g_hash_table_insert(resource_history, entry->id, entry); entry->rsc.id = entry->id; entry->rsc.type = strdup(rsc->type); entry->rsc.class = strdup(rsc->class); if (rsc->provider) { entry->rsc.provider = strdup(rsc->provider); } else { entry->rsc.provider = NULL; } } else if (entry == NULL) { crm_info("Resource %s no longer exists, not updating cache", op->rsc_id); return; } target_rc = rsc_op_expected_rc(op); if (op->op_status == PCMK_LRM_OP_CANCELLED) { if (op->interval > 0) { GList *gIter, *gIterNext; crm_trace("Removing cancelled recurring op: %s_%s_%d", op->rsc_id, op->op_type, op->interval); for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIterNext) { lrmd_event_data_t *existing = gIter->data; gIterNext = gIter->next; if (safe_str_eq(op->rsc_id, existing->rsc_id) && safe_str_eq(op->op_type, existing->op_type) && op->interval == existing->interval) { lrmd_free_event(existing); entry->recurring_op_list = g_list_delete_link(entry->recurring_op_list, gIter); } } return; } else { crm_trace("Skipping %s_%s_%d rc=%d, status=%d", op->rsc_id, op->op_type, op->interval, op->rc, op->op_status); } } else if (did_rsc_op_fail(op, target_rc)) { /* We must store failed monitors here * - otherwise the block below will cause them to be forgetten them when a stop happens */ if (entry->failed) { lrmd_free_event(entry->failed); } entry->failed = lrmd_copy_event(op); } else if (op->interval == 0) { if (entry->last) { lrmd_free_event(entry->last); } entry->last = lrmd_copy_event(op); if (op->params && (safe_str_eq(CRMD_ACTION_START, op->op_type) || safe_str_eq(CRMD_ACTION_STATUS, op->op_type))) { if (entry->stop_params) { g_hash_table_destroy(entry->stop_params); } entry->stop_params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_foreach(op->params, copy_instance_keys, entry->stop_params); } } if (op->interval > 0) { crm_trace("Adding recurring op: %s_%s_%d", op->rsc_id, op->op_type, op->interval); entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op)); } else if (entry->recurring_op_list && safe_str_eq(op->op_type, RSC_STATUS) == FALSE) { GList *gIter = entry->recurring_op_list; crm_trace("Dropping %d recurring ops because of: %s_%s_%d", g_list_length(gIter), op->rsc_id, op->op_type, op->interval); for (; gIter != NULL; gIter = gIter->next) { lrmd_free_event(gIter->data); } g_list_free(entry->recurring_op_list); entry->recurring_op_list = NULL; } } /* A_LRM_CONNECT */ void do_lrm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (fsa_lrm_conn == NULL) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } if (action & A_LRM_DISCONNECT) { if (verify_stopped(cur_state, LOG_INFO) == FALSE) { if(action == A_LRM_DISCONNECT) { crmd_fsa_stall(NULL); return; } } if (is_set(fsa_input_register, R_LRM_CONNECTED)) { clear_bit(fsa_input_register, R_LRM_CONNECTED); fsa_lrm_conn->cmds->disconnect(fsa_lrm_conn); crm_info("Disconnected from the LRM"); } g_hash_table_destroy(resource_history); resource_history = NULL; g_hash_table_destroy(deletion_ops); deletion_ops = NULL; g_hash_table_destroy(pending_ops); pending_ops = NULL; } if (action & A_LRM_CONNECT) { int ret = pcmk_ok; deletion_ops = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, free_deletion_op); pending_ops = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, free_recurring_op); resource_history = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, history_cache_destroy); crm_debug("Connecting to the LRM"); ret = fsa_lrm_conn->cmds->connect(fsa_lrm_conn, CRM_SYSTEM_CRMD, NULL); if (ret != pcmk_ok) { if (++num_lrm_register_fails < max_lrm_register_fails) { crm_warn("Failed to sign on to the LRM %d" " (%d max) times", num_lrm_register_fails, max_lrm_register_fails); crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return; } } if (ret == pcmk_ok) { crm_trace("LRM: set_lrm_callback..."); fsa_lrm_conn->cmds->set_callback(fsa_lrm_conn, lrm_op_callback); } if (ret != pcmk_ok) { crm_err("Failed to sign on to the LRM %d" " (max) times", num_lrm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } set_bit(fsa_input_register, R_LRM_CONNECTED); crm_debug("LRM connection established"); } if (action & ~(A_LRM_CONNECT | A_LRM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } static void ghash_print_pending(gpointer key, gpointer value, gpointer user_data) { const char *stop_id = key; int *log_level = user_data; struct recurring_op_s *pending = value; do_crm_log(*log_level, "Pending action: %s (%s)", stop_id, pending->op_key); } static void ghash_print_pending_for_rsc(gpointer key, gpointer value, gpointer user_data) { const char *stop_id = key; char *rsc = user_data; struct recurring_op_s *pending = value; if (safe_str_eq(rsc, pending->rsc_id)) { crm_notice("%sction %s (%s) incomplete at shutdown", pending->interval == 0 ? "A" : "Recurring a", stop_id, pending->op_key); } } static void ghash_count_pending(gpointer key, gpointer value, gpointer user_data) { int *counter = user_data; struct recurring_op_s *pending = value; if (pending->interval > 0) { /* Ignore recurring actions in the shutdown calculations */ return; } (*counter)++; } gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level) { int counter = 0; gboolean rc = TRUE; GHashTableIter gIter; rsc_history_t *entry = NULL; crm_debug("Checking for active resources before exit"); if (cur_state == S_TERMINATE) { log_level = LOG_ERR; } if (pending_ops) { if (is_set(fsa_input_register, R_LRM_CONNECTED)) { /* Only log/complain about non-recurring actions */ g_hash_table_foreach_remove(pending_ops, stop_recurring_actions, NULL); } g_hash_table_foreach(pending_ops, ghash_count_pending, &counter); } if (counter > 0) { rc = FALSE; do_crm_log(log_level, "%d pending LRM operations at shutdown%s", counter, cur_state == S_TERMINATE ? "" : "... waiting"); if (cur_state == S_TERMINATE || !is_set(fsa_input_register, R_SENT_RSC_STOP)) { g_hash_table_foreach(pending_ops, ghash_print_pending, &log_level); } goto bail; } if (resource_history == NULL) { goto bail; } g_hash_table_iter_init(&gIter, resource_history); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { if (is_rsc_active(entry->id) == FALSE) { continue; } crm_err("Resource %s was active at shutdown." " You may ignore this error if it is unmanaged.", entry->id); g_hash_table_foreach(pending_ops, ghash_print_pending_for_rsc, entry->id); } bail: set_bit(fsa_input_register, R_SENT_RSC_STOP); if (cur_state == S_TERMINATE) { rc = TRUE; } return rc; } static char * get_rsc_metadata(const char *type, const char *class, const char *provider) { char *metadata = NULL; CRM_CHECK(type != NULL, return NULL); CRM_CHECK(class != NULL, return NULL); if (provider == NULL) { provider = "heartbeat"; } crm_trace("Retreiving metadata for %s::%s:%s", type, class, provider); fsa_lrm_conn->cmds->get_metadata(fsa_lrm_conn, class, provider, type, &metadata, 0); if (metadata) { /* copy the metadata because the LRM likes using * g_alloc instead of cl_malloc */ char *m_copy = strdup(metadata); g_free(metadata); metadata = m_copy; } else { crm_warn("No metadata found for %s::%s:%s", type, class, provider); } return metadata; } typedef struct reload_data_s { char *key; char *metadata; time_t last_query; gboolean can_reload; GListPtr restart_list; } reload_data_t; static void g_hash_destroy_reload(gpointer data) { reload_data_t *reload = data; free(reload->key); free(reload->metadata); g_list_free_full(reload->restart_list, free); free(reload); } GHashTable *reload_hash = NULL; static GListPtr get_rsc_restart_list(lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) { int len = 0; char *key = NULL; char *copy = NULL; const char *value = NULL; const char *provider = NULL; xmlNode *param = NULL; xmlNode *params = NULL; xmlNode *actions = NULL; xmlNode *metadata = NULL; time_t now = time(NULL); reload_data_t *reload = NULL; if (reload_hash == NULL) { reload_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, g_hash_destroy_reload); } provider = rsc->provider; if (provider == NULL) { provider = "heartbeat"; } len = strlen(rsc->type) + strlen(rsc->class) + strlen(provider) + 4; key = malloc( len); snprintf(key, len, "%s::%s:%s", rsc->type, rsc->class, provider); reload = g_hash_table_lookup(reload_hash, key); if (reload && ((now - 9) > reload->last_query) && safe_str_eq(op->op_type, RSC_START)) { reload = NULL; /* re-query */ } if (reload == NULL) { xmlNode *action = NULL; reload = calloc(1, sizeof(reload_data_t)); g_hash_table_replace(reload_hash, key, reload); reload->last_query = now; reload->key = key; key = NULL; reload->metadata = get_rsc_metadata(rsc->type, rsc->class, provider); metadata = string2xml(reload->metadata); if (metadata == NULL) { crm_err("Metadata for %s::%s:%s is not valid XML", rsc->provider, rsc->class, rsc->type); goto cleanup; } actions = find_xml_node(metadata, "actions", TRUE); for (action = __xml_first_child(actions); action != NULL; action = __xml_next(action)) { if (crm_str_eq((const char *)action->name, "action", TRUE)) { value = crm_element_value(action, "name"); if (safe_str_eq("reload", value)) { reload->can_reload = TRUE; break; } } } if (reload->can_reload == FALSE) { goto cleanup; } params = find_xml_node(metadata, "parameters", TRUE); for (param = __xml_first_child(params); param != NULL; param = __xml_next(param)) { if (crm_str_eq((const char *)param->name, "parameter", TRUE)) { value = crm_element_value(param, "unique"); if (crm_is_true(value)) { value = crm_element_value(param, "name"); if (value == NULL) { crm_err("%s: NULL param", key); continue; } crm_debug("Attr %s is not reloadable", value); copy = strdup(value); CRM_CHECK(copy != NULL, continue); reload->restart_list = g_list_append(reload->restart_list, copy); } } } } cleanup: free(key); free_xml(metadata); return reload->restart_list; } static void append_restart_list(lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, xmlNode * update, const char *version) { int len = 0; char *list = NULL; char *digest = NULL; const char *value = NULL; gboolean non_empty = FALSE; xmlNode *restart = NULL; GListPtr restart_list = NULL; GListPtr lpc = NULL; if (op->interval > 0) { /* monitors are not reloadable */ return; } else if (op->params == NULL) { crm_debug("%s has no parameters", ID(update)); return; } else if (rsc == NULL) { return; } else if (crm_str_eq(CRMD_ACTION_STOP, op->op_type, TRUE)) { /* Stopped resources don't need to be reloaded */ return; } else if (compare_version("1.0.8", version) > 0) { /* Caller version does not support reloads */ return; } restart_list = get_rsc_restart_list(rsc, op); if (restart_list == NULL) { /* Resource does not support reloads */ return; } restart = create_xml_node(NULL, XML_TAG_PARAMS); for (lpc = restart_list; lpc != NULL; lpc = lpc->next) { const char *param = (const char *)lpc->data; int start = len; CRM_CHECK(param != NULL, continue); value = g_hash_table_lookup(op->params, param); if (value != NULL) { non_empty = TRUE; crm_xml_add(restart, param, value); } len += strlen(param) + 2; list = realloc(list, len + 1); sprintf(list + start, " %s ", param); } digest = calculate_operation_digest(restart, version); crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list); crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest); #if 0 crm_debug("%s: %s, %s", rsc->id, digest, list); if (non_empty) { crm_log_xml_debug(restart, "restart digest source"); } #endif free_xml(restart); free(digest); free(list); } static gboolean build_operation_update(xmlNode * parent, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *src) { int target_rc = 0; xmlNode *xml_op = NULL; const char *caller_version = CRM_FEATURE_SET; if (op == NULL) { return FALSE; } else if (AM_I_DC) { } else if (fsa_our_dc_version != NULL) { caller_version = fsa_our_dc_version; } else if (op->params == NULL) { caller_version = fsa_our_dc_version; } else { /* there is a small risk in formerly mixed clusters that * it will be sub-optimal. * however with our upgrade policy, the update we send * should still be completely supported anyway */ caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION); crm_debug("Falling back to operation originator version: %s", caller_version); } target_rc = rsc_op_expected_rc(op); xml_op = create_operation_update(parent, op, caller_version, target_rc, src, LOG_DEBUG); if (xml_op) { append_restart_list(rsc, op, xml_op, caller_version); } return TRUE; } gboolean is_rsc_active(const char *rsc_id) { rsc_history_t *entry = NULL; crm_trace("Processing lrmd_rsc_info_t entry %s", rsc_id); entry = g_hash_table_lookup(resource_history, rsc_id); if (entry == NULL || entry->last == NULL) { return FALSE; } if (entry->last->rc == PCMK_EXECRA_OK && safe_str_eq(entry->last->op_type, CRMD_ACTION_STOP)) { return FALSE; } else if (entry->last->rc == PCMK_EXECRA_OK && safe_str_eq(entry->last->op_type, CRMD_ACTION_MIGRATE)) { /* a stricter check is too complex... * leave that to the PE */ return FALSE; } else if (entry->last->rc == PCMK_EXECRA_NOT_RUNNING) { return FALSE; } else if (entry->last->interval == 0 && entry->last->rc == PCMK_EXECRA_NOT_CONFIGURED) { /* Badly configured resources can't be reliably stopped */ return FALSE; } return TRUE; } gboolean build_active_RAs(xmlNode * rsc_list) { GHashTableIter iter; rsc_history_t *entry = NULL; g_hash_table_iter_init(&iter, resource_history); while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) { GList *gIter = NULL; xmlNode *xml_rsc = create_xml_node(rsc_list, XML_LRM_TAG_RESOURCE); crm_xml_add(xml_rsc, XML_ATTR_ID, entry->id); crm_xml_add(xml_rsc, XML_ATTR_TYPE, entry->rsc.type); crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, entry->rsc.class); crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, entry->rsc.provider); build_operation_update(xml_rsc, &(entry->rsc), entry->last, __FUNCTION__); build_operation_update(xml_rsc, &(entry->rsc), entry->failed, __FUNCTION__); for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIter->next) { build_operation_update(xml_rsc, &(entry->rsc), gIter->data, __FUNCTION__); } } return FALSE; } xmlNode * do_lrm_query(gboolean is_replace) { - gboolean shut_down = FALSE; xmlNode *xml_result = NULL; xmlNode *xml_state = NULL; xmlNode *xml_data = NULL; xmlNode *rsc_list = NULL; - const char *exp_state = CRMD_STATE_ACTIVE; - - if (is_set(fsa_input_register, R_SHUTDOWN)) { - exp_state = CRMD_STATE_INACTIVE; - shut_down = TRUE; - } xml_state = create_node_state(fsa_our_uname, XML_BOOLEAN_TRUE, - ONLINESTATUS, CRMD_JOINSTATE_MEMBER, exp_state, - !shut_down, __FUNCTION__); + ONLINESTATUS, CRMD_JOINSTATE_MEMBER, CRMD_JOINSTATE_MEMBER, + is_not_set(fsa_input_register, R_SHUTDOWN), __FUNCTION__); xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM); crm_xml_add(xml_data, XML_ATTR_ID, fsa_our_uuid); rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES); /* Build a list of active (not always running) resources */ build_active_RAs(rsc_list); xml_result = create_cib_fragment(xml_state, XML_CIB_TAG_STATUS); crm_log_xml_trace(xml_state, "Current state of the LRM"); free_xml(xml_state); return xml_result; } static void notify_deleted(ha_msg_input_t * input, const char *rsc_id, int rc) { lrmd_event_data_t *op = NULL; const char *from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM); const char *from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); crm_info("Notifying %s on %s that %s was%s deleted", from_sys, from_host, rsc_id, rc == pcmk_ok ? "" : " not"); op = construct_op(input->xml, rsc_id, CRMD_ACTION_DELETE); CRM_ASSERT(op != NULL); if (rc == pcmk_ok) { op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_EXECRA_OK; } else { op->op_status = PCMK_LRM_OP_ERROR; op->rc = PCMK_EXECRA_UNKNOWN_ERROR; } send_direct_ack(from_host, from_sys, NULL, op, rsc_id); lrmd_free_event(op); if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { /* this isn't expected - trigger a new transition */ time_t now = time(NULL); char *now_s = crm_itoa(now); crm_debug("Triggering a refresh after %s deleted %s from the LRM", from_sys, rsc_id); update_attr_delegate( fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL, "last-lrm-refresh", now_s, FALSE, NULL); free(now_s); } } static gboolean lrm_remove_deleted_rsc(gpointer key, gpointer value, gpointer user_data) { struct delete_event_s *event = user_data; struct pending_deletion_op_s *op = value; if (safe_str_eq(event->rsc, op->rsc)) { notify_deleted(op->input, event->rsc, event->rc); return TRUE; } return FALSE; } static gboolean lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data) { const char *rsc = user_data; struct recurring_op_s *pending = value; if (safe_str_eq(rsc, pending->rsc_id)) { crm_info("Removing op %s:%d for deleted resource %s", pending->op_key, pending->call_id, rsc); return TRUE; } return FALSE; } /* * Remove the rsc from the CIB * * Avoids refreshing the entire LRM section of this host */ #define rsc_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']" static int delete_rsc_status(const char *rsc_id, int call_options, const char *user_name) { char *rsc_xpath = NULL; int max = 0; int rc = pcmk_ok; CRM_CHECK(rsc_id != NULL, return -ENXIO); max = strlen(rsc_template) + strlen(rsc_id) + strlen(fsa_our_uname) + 1; rsc_xpath = calloc(1, max); snprintf(rsc_xpath, max, rsc_template, fsa_our_uname, rsc_id); rc = cib_internal_op(fsa_cib_conn, CIB_OP_DELETE, NULL, rsc_xpath, NULL, NULL, call_options | cib_xpath, user_name); free(rsc_xpath); return rc; } static void delete_rsc_entry(ha_msg_input_t * input, const char *rsc_id, GHashTableIter *rsc_gIter, int rc, const char *user_name) { struct delete_event_s event; CRM_CHECK(rsc_id != NULL, return); if (rc == pcmk_ok) { char *rsc_id_copy = strdup(rsc_id); if (rsc_gIter) g_hash_table_iter_remove(rsc_gIter); else g_hash_table_remove(resource_history, rsc_id_copy); crm_debug("sync: Sending delete op for %s", rsc_id_copy); delete_rsc_status(rsc_id_copy, cib_quorum_override, user_name); g_hash_table_foreach_remove(pending_ops, lrm_remove_deleted_op, rsc_id_copy); free(rsc_id_copy); } if (input) { notify_deleted(input, rsc_id, rc); } event.rc = rc; event.rsc = rsc_id; g_hash_table_foreach_remove(deletion_ops, lrm_remove_deleted_rsc, &event); } /* * Remove the op from the CIB * * Avoids refreshing the entire LRM section of this host */ #define op_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']/"XML_LRM_TAG_RSC_OP"[@id='%s']" #define op_call_template "//"XML_CIB_TAG_STATE"[@uname='%s']//"XML_LRM_TAG_RESOURCE"[@id='%s']/"XML_LRM_TAG_RSC_OP"[@id='%s' and @"XML_LRM_ATTR_CALLID"='%d']" static void delete_op_entry(lrmd_event_data_t * op, const char *rsc_id, const char *key, int call_id) { xmlNode *xml_top = NULL; if (op != NULL) { xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP); crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id); crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data); if(op->interval > 0) { char *op_id = generate_op_key(op->rsc_id, op->op_type, op->interval); /* Avoid deleting last_failure too (if it was a result of this recurring op failing) */ crm_xml_add(xml_top, XML_ATTR_ID, op_id); free(op_id); } crm_debug("async: Sending delete op for %s_%s_%d (call=%d)", op->rsc_id, op->op_type, op->interval, op->call_id); fsa_cib_conn->cmds->delete(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top, cib_quorum_override); } else if (rsc_id != NULL && key != NULL) { int max = 0; char *op_xpath = NULL; if (call_id > 0) { max = strlen(op_call_template) + strlen(rsc_id) + strlen(fsa_our_uname) + strlen(key) + 10; op_xpath = calloc(1, max); snprintf(op_xpath, max, op_call_template, fsa_our_uname, rsc_id, key, call_id); } else { max = strlen(op_template) + strlen(rsc_id) + strlen(fsa_our_uname) + strlen(key) + 1; op_xpath = calloc(1, max); snprintf(op_xpath, max, op_template, fsa_our_uname, rsc_id, key); } crm_debug("sync: Sending delete op for %s (call=%d)", rsc_id, call_id); fsa_cib_conn->cmds->delete(fsa_cib_conn, op_xpath, NULL, cib_quorum_override | cib_xpath); free(op_xpath); } else { crm_err("Not enough information to delete op entry: rsc=%p key=%p", rsc_id, key); return; } crm_log_xml_trace(xml_top, "op:cancel"); free_xml(xml_top); } void lrm_clear_last_failure(const char *rsc_id) { char *attr = NULL; GHashTableIter iter; rsc_history_t *entry = NULL; attr = generate_op_key(rsc_id, "last_failure", 0); delete_op_entry(NULL, rsc_id, attr, 0); free(attr); if (!resource_history) { return; } g_hash_table_iter_init(&iter, resource_history); while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) { if (safe_str_eq(rsc_id, entry->id)) { lrmd_free_event(entry->failed); entry->failed = NULL; } } } static gboolean cancel_op(const char *rsc_id, const char *key, int op, gboolean remove) { int rc = pcmk_ok; struct recurring_op_s *pending = NULL; CRM_CHECK(op != 0, return FALSE); CRM_CHECK(rsc_id != NULL, return FALSE); if (key == NULL) { key = make_stop_id(rsc_id, op); } pending = g_hash_table_lookup(pending_ops, key); if (pending) { if (remove && pending->remove == FALSE) { pending->remove = TRUE; crm_debug("Scheduling %s for removal", key); } if (pending->cancelled) { crm_debug("Operation %s already cancelled", key); return TRUE; } pending->cancelled = TRUE; } else { crm_info("No pending op found for %s", key); return TRUE; } crm_debug("Cancelling op %d for %s (%s)", op, rsc_id, key); rc = fsa_lrm_conn->cmds->cancel(fsa_lrm_conn, pending->rsc_id, pending->op_type, pending->interval); if (rc == pcmk_ok) { crm_debug("Op %d for %s (%s): cancelled", op, rsc_id, key); } else { crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc_id, key); /* The caller needs to make sure the entry is * removed from the pending_ops list * * Usually by returning TRUE inside the worker function * supplied to g_hash_table_foreach_remove() * * Not removing the entry from pending_ops will block * the node from shutting down */ return FALSE; } return TRUE; } struct cancel_data { gboolean done; gboolean remove; const char *key; lrmd_rsc_info_t *rsc; }; static gboolean cancel_action_by_key(gpointer key, gpointer value, gpointer user_data) { struct cancel_data *data = user_data; struct recurring_op_s *op = (struct recurring_op_s *)value; if (safe_str_eq(op->op_key, data->key)) { data->done = TRUE; if (cancel_op(data->rsc->id, key, op->call_id, data->remove) == FALSE) { return TRUE; } } return FALSE; } static gboolean cancel_op_key(lrmd_rsc_info_t * rsc, const char *key, gboolean remove) { struct cancel_data data; CRM_CHECK(rsc != NULL, return FALSE); CRM_CHECK(key != NULL, return FALSE); data.key = key; data.rsc = rsc; data.done = FALSE; data.remove = remove; g_hash_table_foreach_remove(pending_ops, cancel_action_by_key, &data); return data.done; } static lrmd_rsc_info_t * get_lrm_resource(xmlNode * resource, xmlNode * op_msg, gboolean do_create) { lrmd_rsc_info_t *rsc = NULL; const char *id = ID(resource); const char *type = crm_element_value(resource, XML_ATTR_TYPE); const char *class = crm_element_value(resource, XML_AGENT_ATTR_CLASS); const char *provider = crm_element_value(resource, XML_AGENT_ATTR_PROVIDER); const char *long_id = crm_element_value(resource, XML_ATTR_ID_LONG); crm_trace("Retrieving %s from the LRM.", id); CRM_CHECK(id != NULL, return NULL); rsc = fsa_lrm_conn->cmds->get_rsc_info(fsa_lrm_conn, id, 0); if (!rsc && long_id) { rsc = fsa_lrm_conn->cmds->get_rsc_info(fsa_lrm_conn, long_id, 0); } if (!rsc && do_create) { CRM_CHECK(class != NULL, return NULL); CRM_CHECK(type != NULL, return NULL); crm_trace("Adding rsc %s before operation", id); fsa_lrm_conn->cmds->register_rsc(fsa_lrm_conn, id, class, provider, type, 0); rsc = fsa_lrm_conn->cmds->get_rsc_info(fsa_lrm_conn, id, 0); if (!rsc) { fsa_data_t *msg_data = NULL; crm_err("Could not add resource %s to LRM", id); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } } return rsc; } static void delete_resource(const char *id, lrmd_rsc_info_t * rsc, GHashTableIter *gIter, const char *sys, const char *host, const char *user, ha_msg_input_t * request) { int rc = pcmk_ok; crm_info("Removing resource %s for %s (%s) on %s", id, sys, user ? user : "internal", host); if (rsc) { rc = fsa_lrm_conn->cmds->unregister_rsc(fsa_lrm_conn, id, 0); } if (rc == pcmk_ok) { crm_trace("Resource '%s' deleted", id); } else if (rc == -EINPROGRESS) { crm_info("Deletion of resource '%s' pending", id); if (request) { struct pending_deletion_op_s *op = NULL; char *ref = crm_element_value_copy(request->msg, XML_ATTR_REFERENCE); op = calloc(1, sizeof(struct pending_deletion_op_s)); op->rsc = strdup(rsc->id); op->input = copy_ha_msg_input(request); g_hash_table_insert(deletion_ops, ref, op); } return; } else { crm_warn("Deletion of resource '%s' for %s (%s) on %s failed: %d", id, sys, user ? user : "internal", host, rc); } delete_rsc_entry(request, id, gIter, rc, user); } /* A_LRM_INVOKE */ void do_lrm_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean done = FALSE; gboolean create_rsc = TRUE; const char *crm_op = NULL; const char *from_sys = NULL; const char *from_host = NULL; const char *operation = NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); const char *user_name = NULL; #if ENABLE_ACL user_name = crm_element_value(input->msg, F_CRM_USER); crm_trace("LRM command from user '%s'", user_name); #endif crm_op = crm_element_value(input->msg, F_CRM_TASK); from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM); if (safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); } crm_trace("LRM command from: %s", from_sys); if (safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) { operation = CRMD_ACTION_DELETE; } else if (safe_str_eq(operation, CRM_OP_LRM_REFRESH)) { crm_op = CRM_OP_LRM_REFRESH; } else if (safe_str_eq(crm_op, CRM_OP_LRM_FAIL)) { rsc_history_t *entry = NULL; lrmd_event_data_t *op = NULL; lrmd_rsc_info_t *rsc = NULL; xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); /* The lrmd can not fail a resource, it does not understand the * concept of success or failure in relation to a resource, it simply * executes operations and reports the results. We determine what a failure is. * Becaues of this, if we want to fail a resource we have to fake what we * understand a failure to look like. * * To do this we create a fake lrmd operation event for the resource * we want to fail. We then pass that event to the lrmd client callback * so it will be processed as if it actually came from the lrmd. */ op = construct_op(input->xml, ID(xml_rsc), "asyncmon"); free((char *) op->user_data); op->user_data = NULL; entry = g_hash_table_lookup(resource_history, op->rsc_id); /* Make sure the call id is greater than the last successful operation, * otherwise the failure will not result in a possible recovery of the resource * as it could appear the failure occurred before the successful start */ if (entry && entry->last) { op->call_id = entry->last->call_id + 1; if (op->call_id < 0) { op->call_id = 1; } } op->interval = 0; op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_EXECRA_UNKNOWN_ERROR; CRM_ASSERT(op != NULL); # if ENABLE_ACL if (user_name && is_privileged(user_name) == FALSE) { crm_err("%s does not have permission to fail %s", user_name, ID(xml_rsc)); send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); return; } # endif rsc = get_lrm_resource(xml_rsc, input->xml, create_rsc); if (rsc) { crm_info("Failing resource %s...", rsc->id); process_lrm_event(op); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_EXECRA_OK; lrmd_free_rsc_info(rsc); } else { crm_info("Cannot find/create resource in order to fail it..."); crm_log_xml_warn(input->msg, "bad input"); } send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); return; } else if (input->xml != NULL) { operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK); } if (safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) { int rc = pcmk_ok; xmlNode *fragment = do_lrm_query(TRUE); crm_info("Forcing a local LRM refresh"); fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc, user_name); free_xml(fragment); } else if (safe_str_eq(crm_op, CRM_OP_LRM_QUERY)) { xmlNode *data = do_lrm_query(FALSE); xmlNode *reply = create_reply(input->msg, data); if (relay_message(reply, TRUE) == FALSE) { crm_err("Unable to route reply"); crm_log_xml_err(reply, "reply"); } free_xml(reply); free_xml(data); } else if (safe_str_eq(operation, CRM_OP_PROBED)) { update_attrd(NULL, CRM_OP_PROBED, XML_BOOLEAN_TRUE, user_name); } else if (safe_str_eq(crm_op, CRM_OP_REPROBE)) { GHashTableIter gIter; rsc_history_t *entry = NULL; crm_notice("Forcing the status of all resources to be redetected"); g_hash_table_iter_init(&gIter, resource_history); while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) { delete_resource(entry->id, &entry->rsc, &gIter, from_sys, from_host, user_name, NULL); } /* Now delete the copy in the CIB */ erase_status_tag(fsa_our_uname, XML_CIB_TAG_LRM, cib_scope_local); /* And finally, _delete_ the value in attrd * Setting it to FALSE results in the PE sending us back here again */ update_attrd(NULL, CRM_OP_PROBED, NULL, user_name); } else if (operation != NULL) { lrmd_rsc_info_t *rsc = NULL; xmlNode *params = NULL; xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); /* only the first 16 chars are used by the LRM */ params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE); if (safe_str_eq(operation, CRMD_ACTION_DELETE)) { create_rsc = FALSE; } rsc = get_lrm_resource(xml_rsc, input->xml, create_rsc); if (rsc == NULL && create_rsc) { crm_err("Invalid resource definition"); crm_log_xml_warn(input->msg, "bad input"); } else if (rsc == NULL) { lrmd_event_data_t *op = NULL; crm_notice("Not creating resource for a %s event: %s", operation, ID(input->xml)); delete_rsc_entry(input, ID(xml_rsc), NULL, pcmk_ok, user_name); op = construct_op(input->xml, ID(xml_rsc), operation); op->op_status = PCMK_LRM_OP_DONE; op->rc = PCMK_EXECRA_OK; CRM_ASSERT(op != NULL); send_direct_ack(from_host, from_sys, NULL, op, ID(xml_rsc)); lrmd_free_event(op); } else if (safe_str_eq(operation, CRMD_ACTION_CANCEL)) { lrmd_event_data_t *op = NULL; char *op_key = NULL; char *meta_key = NULL; int call = 0; const char *call_id = NULL; const char *op_task = NULL; const char *op_interval = NULL; CRM_CHECK(params != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); meta_key = crm_meta_name(XML_LRM_ATTR_INTERVAL); op_interval = crm_element_value(params, meta_key); free(meta_key); meta_key = crm_meta_name(XML_LRM_ATTR_TASK); op_task = crm_element_value(params, meta_key); free(meta_key); meta_key = crm_meta_name(XML_LRM_ATTR_CALLID); call_id = crm_element_value(params, meta_key); free(meta_key); CRM_CHECK(op_task != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); CRM_CHECK(op_interval != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); op = construct_op(input->xml, rsc->id, op_task); CRM_ASSERT(op != NULL); op_key = generate_op_key(rsc->id, op_task, crm_parse_int(op_interval, "0")); crm_debug("PE requested op %s (call=%s) be cancelled", op_key, call_id ? call_id : "NA"); call = crm_parse_int(call_id, "0"); if (call == 0) { /* the normal case when the PE cancels a recurring op */ done = cancel_op_key(rsc, op_key, TRUE); } else { /* the normal case when the PE cancels an orphan op */ done = cancel_op(rsc->id, NULL, call, TRUE); } if (done == FALSE) { crm_debug("Nothing known about operation %d for %s", call, op_key); delete_op_entry(NULL, rsc->id, op_key, call); /* needed?? surely not otherwise the cancel_op_(_key) wouldn't * have failed in the first place */ g_hash_table_remove(pending_ops, op_key); } op->rc = PCMK_EXECRA_OK; op->op_status = PCMK_LRM_OP_DONE; send_direct_ack(from_host, from_sys, rsc, op, rsc->id); free(op_key); lrmd_free_event(op); } else if (safe_str_eq(operation, CRMD_ACTION_DELETE)) { int cib_rc = pcmk_ok; CRM_ASSERT(rsc != NULL); cib_rc = delete_rsc_status(rsc->id, cib_dryrun | cib_sync_call, user_name); if (cib_rc != pcmk_ok) { lrmd_event_data_t *op = NULL; crm_err ("Attempt of deleting resource status '%s' from CIB for %s (user=%s) on %s failed: (rc=%d) %s", rsc->id, from_sys, user_name ? user_name : "unknown", from_host, cib_rc, pcmk_strerror(cib_rc)); op = construct_op(input->xml, rsc->id, operation); op->op_status = PCMK_LRM_OP_ERROR; if (cib_rc == -EACCES) { op->rc = PCMK_EXECRA_INSUFFICIENT_PRIV; } else { op->rc = PCMK_EXECRA_UNKNOWN_ERROR; } send_direct_ack(from_host, from_sys, NULL, op, rsc->id); lrmd_free_event(op); return; } delete_resource(rsc->id, rsc, NULL, from_sys, from_host, user_name, input); } else if (rsc != NULL) { do_lrm_rsc_op(rsc, operation, input->xml, input->msg); } lrmd_free_rsc_info(rsc); } else { crm_err("Operation was neither a lrm_query, nor a rsc op. %s", crm_str(crm_op)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } static lrmd_event_data_t * construct_op(xmlNode * rsc_op, const char *rsc_id, const char *operation) { lrmd_event_data_t *op = NULL; const char *op_delay = NULL; const char *op_timeout = NULL; const char *op_interval = NULL; GHashTable *params = NULL; const char *transition = NULL; CRM_LOG_ASSERT(rsc_id != NULL); op = calloc(1, sizeof(lrmd_event_data_t)); op->type = lrmd_event_exec_complete; op->op_type = strdup(operation); op->op_status = PCMK_LRM_OP_PENDING; op->rc = -1; op->rsc_id = strdup(rsc_id); op->interval = 0; op->timeout = 0; op->start_delay = 0; if (rsc_op == NULL) { CRM_LOG_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); op->user_data = NULL; /* the stop_all_resources() case * by definition there is no DC (or they'd be shutting * us down). * So we should put our version here. */ op->params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET)); crm_trace("Constructed %s op for %s", operation, rsc_id); return op; } params = xml2list(rsc_op); g_hash_table_remove(params, CRM_META "_op_target_rc"); op_delay = crm_meta_value(params, XML_OP_ATTR_START_DELAY); op_timeout = crm_meta_value(params, XML_ATTR_TIMEOUT); op_interval = crm_meta_value(params, XML_LRM_ATTR_INTERVAL); op->interval = crm_parse_int(op_interval, "0"); op->timeout = crm_parse_int(op_timeout, "0"); op->start_delay = crm_parse_int(op_delay, "0"); if (safe_str_neq(operation, RSC_STOP)) { op->params = params; } else { rsc_history_t *entry = g_hash_table_lookup(resource_history, rsc_id); /* If we do not have stop parameters cached, use * whatever we are given */ if (!entry || !entry->stop_params) { op->params = params; } else { /* Copy the cached parameter list so that we stop the resource * with the old attributes, not the new ones */ op->params = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_foreach(params, copy_meta_keys, op->params); g_hash_table_foreach(entry->stop_params, copy_instance_keys, op->params); g_hash_table_destroy(params); params = NULL; } } /* sanity */ if (op->interval < 0) { op->interval = 0; } if (op->timeout <= 0) { op->timeout = op->interval; } if (op->start_delay < 0) { op->start_delay = 0; } transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY); CRM_CHECK(transition != NULL, return op); op->user_data = strdup(transition); if (op->interval != 0) { if (safe_str_eq(operation, CRMD_ACTION_START) || safe_str_eq(operation, CRMD_ACTION_STOP)) { crm_err("Start and Stop actions cannot have an interval: %d", op->interval); op->interval = 0; } } crm_trace("Constructed %s op for %s: interval=%d", operation, rsc_id, op->interval); return op; } void send_direct_ack(const char *to_host, const char *to_sys, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op, const char *rsc_id) { xmlNode *reply = NULL; xmlNode *update, *iter; xmlNode *fragment; CRM_CHECK(op != NULL, return); if (op->rsc_id == NULL) { CRM_LOG_ASSERT(rsc_id != NULL); op->rsc_id = strdup(rsc_id); } if (to_sys == NULL) { to_sys = CRM_SYSTEM_TENGINE; } update = create_node_state(fsa_our_uname, NULL, NULL, NULL, NULL, FALSE, __FUNCTION__); iter = create_xml_node(update, XML_CIB_TAG_LRM); crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); build_operation_update(iter, rsc, op, __FUNCTION__); fragment = create_cib_fragment(update, XML_CIB_TAG_STATUS); reply = create_request(CRM_OP_INVOKE_LRM, fragment, to_host, to_sys, CRM_SYSTEM_LRMD, NULL); crm_log_xml_trace(update, "ACK Update"); crm_debug("ACK'ing resource op %s_%s_%d from %s: %s", op->rsc_id, op->op_type, op->interval, op->user_data, crm_element_value(reply, XML_ATTR_REFERENCE)); if (relay_message(reply, TRUE) == FALSE) { crm_log_xml_err(reply, "Unable to route reply"); } free_xml(fragment); free_xml(update); free_xml(reply); } static gboolean stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data) { lrmd_rsc_info_t *rsc = user_data; struct recurring_op_s *op = (struct recurring_op_s *)value; if (op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) { if (cancel_op(rsc->id, key, op->call_id, FALSE) == FALSE) { return TRUE; } } return FALSE; } static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data) { gboolean remove = FALSE; struct recurring_op_s *op = (struct recurring_op_s *)value; if (op->interval != 0) { remove = cancel_op(op->rsc_id, key, op->call_id, FALSE); } return remove; } void do_lrm_rsc_op(lrmd_rsc_info_t * rsc, const char *operation, xmlNode * msg, xmlNode * request) { int call_id = 0; char *op_id = NULL; lrmd_event_data_t *op = NULL; lrmd_key_value_t *params = NULL; fsa_data_t *msg_data = NULL; const char *transition = NULL; CRM_CHECK(rsc != NULL, return); if (msg != NULL) { transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY); if (transition == NULL) { crm_log_xml_err(msg, "Missing transition number"); } } op = construct_op(msg, rsc->id, operation); /* stop the monitor before stopping the resource */ if (crm_str_eq(operation, CRMD_ACTION_STOP, TRUE) || crm_str_eq(operation, CRMD_ACTION_DEMOTE, TRUE) || crm_str_eq(operation, CRMD_ACTION_PROMOTE, TRUE) || crm_str_eq(operation, CRMD_ACTION_MIGRATE, TRUE)) { g_hash_table_foreach_remove(pending_ops, stop_recurring_action_by_rsc, rsc); } /* now do the op */ crm_debug("Performing key=%s op=%s_%s_%d", transition, rsc->id, operation, op->interval); if (fsa_state != S_NOT_DC && fsa_state != S_POLICY_ENGINE && fsa_state != S_TRANSITION_ENGINE) { if (safe_str_neq(operation, "fail") && safe_str_neq(operation, CRMD_ACTION_STOP)) { crm_info("Discarding attempt to perform action %s on %s" " in state %s", operation, rsc->id, fsa_state2string(fsa_state)); op->rc = 99; op->op_status = PCMK_LRM_OP_ERROR; send_direct_ack(NULL, NULL, rsc, op, rsc->id); lrmd_free_event(op); free(op_id); return; } } op_id = generate_op_key(rsc->id, op->op_type, op->interval); if (op->interval > 0) { /* cancel it so we can then restart it without conflict */ cancel_op_key(rsc, op_id, FALSE); } if (op->params) { char *key = NULL; char *value = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, op->params); while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { params = lrmd_key_value_add(params, key, value); } } call_id = fsa_lrm_conn->cmds->exec(fsa_lrm_conn, rsc->id, op->op_type, op->user_data, op->interval, op->timeout, op->start_delay, 0, params); if (call_id <= 0) { crm_err("Operation %s on %s failed: %d", operation, rsc->id, call_id); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } else { /* record all operations so we can wait * for them to complete during shutdown */ char *call_id_s = make_stop_id(rsc->id, call_id); struct recurring_op_s *pending = NULL; pending = calloc(1, sizeof(struct recurring_op_s)); crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s); pending->call_id = call_id; pending->interval = op->interval; pending->op_type = strdup(operation); pending->op_key = strdup(op_id); pending->rsc_id = strdup(rsc->id); pending->last_rc = -1; /* All rc are positive, -1 indicates the last rc has not been set. */ pending->last_op_status = -2; g_hash_table_replace(pending_ops, call_id_s, pending); if (op->interval > 0 && op->start_delay > START_DELAY_THRESHOLD) { char *uuid = NULL; int dummy = 0, target_rc = 0; crm_info("Faking confirmation of %s: execution postponed for over 5 minutes", op_id); decode_transition_key(op->user_data, &uuid, &dummy, &dummy, &target_rc); free(uuid); op->rc = target_rc; op->op_status = PCMK_LRM_OP_DONE; send_direct_ack(NULL, NULL, rsc, op, rsc->id); } } free(op_id); lrmd_free_event(op); return; } static void cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { switch (rc) { case pcmk_ok: case -pcmk_err_diff_failed: case -pcmk_err_diff_resync: crm_trace("Resource update %d complete: rc=%d", call_id, rc); break; default: crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc)); } } static int do_update_resource(lrmd_rsc_info_t * rsc, lrmd_event_data_t * op) { /* */ int rc = pcmk_ok; xmlNode *update, *iter = NULL; int call_opt = cib_quorum_override; CRM_CHECK(op != NULL, return 0); if (fsa_state == S_ELECTION || fsa_state == S_PENDING) { crm_info("Sending update to local CIB in state: %s", fsa_state2string(fsa_state)); call_opt |= cib_scope_local; } iter = create_xml_node(iter, XML_CIB_TAG_STATUS); update = iter; iter = create_xml_node(iter, XML_CIB_TAG_STATE); set_uuid(iter, XML_ATTR_UUID, fsa_our_uname); crm_xml_add(iter, XML_ATTR_UNAME, fsa_our_uname); crm_xml_add(iter, XML_ATTR_ORIGIN, __FUNCTION__); iter = create_xml_node(iter, XML_CIB_TAG_LRM); crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); build_operation_update(iter, rsc, op, __FUNCTION__); if (rsc) { crm_xml_add(iter, XML_ATTR_TYPE, rsc->type); crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->class); crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER, rsc->provider); CRM_CHECK(rsc->type != NULL, crm_err("Resource %s has no value for type", op->rsc_id)); CRM_CHECK(rsc->class != NULL, crm_err("Resource %s has no value for class", op->rsc_id)); } else { crm_warn("Resource %s no longer exists in the lrmd", op->rsc_id); goto cleanup; } /* make it an asyncronous call and be done with it * * Best case: * the resource state will be discovered during * the next signup or election. * * Bad case: * we are shutting down and there is no DC at the time, * but then why were we shutting down then anyway? * (probably because of an internal error) * * Worst case: * we get shot for having resources "running" when the really weren't * * the alternative however means blocking here for too long, which * isnt acceptable */ fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, rc, NULL); /* the return code is a call number, not an error code */ crm_trace("Sent resource state update message: %d", rc); fsa_cib_conn->cmds->register_callback(fsa_cib_conn, rc, 60, FALSE, NULL, "cib_rsc_callback", cib_rsc_callback); cleanup: free_xml(update); return rc; } void do_lrm_event(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data) { CRM_CHECK(FALSE, return); } gboolean process_lrm_event(lrmd_event_data_t * op) { char *op_id = NULL; char *op_key = NULL; int update_id = 0; int log_level = LOG_ERR; gboolean removed = FALSE; lrmd_rsc_info_t *rsc = NULL; struct recurring_op_s *pending = NULL; CRM_CHECK(op != NULL, return FALSE); if (op->type == lrmd_event_disconnect) { lrm_connection_destroy(); return TRUE; } else if (op->type != lrmd_event_exec_complete) { return TRUE; } CRM_CHECK(op->rsc_id != NULL, return FALSE); op_id = make_stop_id(op->rsc_id, op->call_id); pending = g_hash_table_lookup(pending_ops, op_id); /* ignore recurring ops that have not changed. */ if (op->interval && pending && (pending->last_rc == op->rc) && (pending->last_op_status == op->op_status)) { free(op_id); return TRUE; } if (pending) { pending->last_rc = op->rc; pending->last_op_status = op->op_status; } op_key = generate_op_key(op->rsc_id, op->op_type, op->interval); rsc = fsa_lrm_conn->cmds->get_rsc_info(fsa_lrm_conn, op->rsc_id, 0); switch (op->op_status) { case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_PENDING: case PCMK_LRM_OP_NOTSUPPORTED: break; case PCMK_LRM_OP_CANCELLED: log_level = LOG_INFO; break; case PCMK_LRM_OP_DONE: log_level = LOG_NOTICE; break; case PCMK_LRM_OP_TIMEOUT: log_level = LOG_DEBUG_3; crm_err("LRM operation %s (%d) %s (timeout=%dms)", op_key, op->call_id, services_lrm_status_str(op->op_status), op->timeout); break; default: crm_err("Mapping unknown status (%d) to ERROR", op->op_status); op->op_status = PCMK_LRM_OP_ERROR; } if (op->op_status == PCMK_LRM_OP_ERROR && (op->rc == PCMK_EXECRA_RUNNING_MASTER || op->rc == PCMK_EXECRA_NOT_RUNNING)) { /* Leave it up to the TE/PE to decide if this is an error */ op->op_status = PCMK_LRM_OP_DONE; log_level = LOG_INFO; } if (op->op_status != PCMK_LRM_OP_CANCELLED) { if (safe_str_eq(op->op_type, RSC_NOTIFY)) { /* Keep notify ops out of the CIB */ send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } else { update_id = do_update_resource(rsc, op); } } else if (op->interval == 0) { /* This will occur when "crm resource cleanup" is called while actions are in-flight */ crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id); send_direct_ack(NULL, NULL, NULL, op, op->rsc_id); } else if (pending == NULL) { /* Operations that are cancelled may safely be removed * from the pending op list before the lrmd completion event * is received. Only report non-cancelled ops here. */ if (op->op_status != PCMK_LRM_OP_CANCELLED) { crm_err("Op %s (call=%d): No 'pending' entry", op_key, op->call_id); } } else if (op->user_data == NULL) { crm_err("Op %s (call=%d): No user data", op_key, op->call_id); } else if (pending->remove) { delete_op_entry(op, op->rsc_id, op_key, op->call_id); } else { /* Before a stop is called, no need to direct ack */ crm_trace("Op %s (call=%d): no delete event required", op_key, op->call_id); } if ((op->interval == 0) && g_hash_table_remove(pending_ops, op_id)) { removed = TRUE; crm_trace("Op %s (call=%d, stop-id=%s): Confirmed", op_key, op->call_id, op_id); } if (op->op_status == PCMK_LRM_OP_DONE) { do_crm_log(log_level, "LRM operation %s (call=%d, rc=%d, cib-update=%d, confirmed=%s) %s", op_key, op->call_id, op->rc, update_id, removed ? "true" : "false", lrmd_event_rc2str(op->rc)); } else { do_crm_log(log_level, "LRM operation %s (call=%d, status=%d, cib-update=%d, confirmed=%s) %s", op_key, op->call_id, op->op_status, update_id, removed ? "true" : "false", services_lrm_status_str(op->op_status)); } if (op->rc != 0 && op->output != NULL) { crm_info("Result: %s", op->output); } else if (op->output != NULL) { crm_debug("Result: %s", op->output); } if (op->rsc_deleted) { crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key); delete_rsc_entry(NULL, op->rsc_id, NULL, pcmk_ok, NULL); } /* If a shutdown was escalated while operations were pending, * then the FSA will be stalled right now... allow it to continue */ mainloop_set_trigger(fsa_source); update_history_cache(rsc, op); lrmd_free_rsc_info(rsc); free(op_key); free(op_id); return TRUE; } diff --git a/crmd/membership.c b/crmd/membership.c index 6331e6452f..96d08c2334 100644 --- a/crmd/membership.c +++ b/crmd/membership.c @@ -1,253 +1,306 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ /* put these first so that uuid_t is defined without conflicts */ #include #include #include #include #include #include #include #include #include #include #include #include gboolean membership_flux_hack = FALSE; void post_cache_update(int instance); -void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data); - int last_peer_update = 0; extern GHashTable *voted; struct update_data_s { - const char *state; const char *caller; - xmlNode *updates; - gboolean overwrite_join; + xmlNode *parent; + int flags; }; extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); static void check_dead_member(const char *uname, GHashTable * members) { CRM_CHECK(uname != NULL, return); if (members != NULL && g_hash_table_lookup(members, uname) != NULL) { crm_err("%s didnt really leave the membership!", uname); return; } erase_node_from_join(uname); if (voted != NULL) { g_hash_table_remove(voted, uname); } if (safe_str_eq(fsa_our_uname, uname)) { crm_err("We're not part of the cluster anymore"); register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL); } else if (AM_I_DC == FALSE && safe_str_eq(uname, fsa_our_dc)) { crm_warn("Our DC node (%s) left the cluster", uname); register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL); } else if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) { check_join_state(fsa_state, __FUNCTION__); } } static void reap_dead_nodes(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; if (crm_is_peer_active(node) == FALSE) { check_dead_member(node->uname, NULL); fail_incompletable_actions(transition_graph, node->uuid); } } gboolean ever_had_quorum = FALSE; void post_cache_update(int instance) { xmlNode *no_op = NULL; crm_peer_seq = instance; crm_debug("Updated cache after membership event %d.", instance); g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL); set_bit(fsa_input_register, R_MEMBERSHIP); if (AM_I_DC) { - populate_cib_nodes(TRUE); - do_update_cib_nodes(FALSE, __FUNCTION__); + populate_cib_nodes(node_update_quick|node_update_cluster|node_update_peer, __FUNCTION__); } /* * If we lost nodes, we should re-check the election status * Safe to call outside of an election */ register_fsa_action(A_ELECTION_CHECK); /* Membership changed, remind everyone we're here. * This will aid detection of duplicate DCs */ no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD, AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL); send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE); free_xml(no_op); } static void crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; last_peer_update = 0; if (rc == pcmk_ok) { crm_trace("Node update %d complete", call_id); } else { crm_err("Node update %d failed", call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } -void -ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) +static xmlNode * +do_update_node_cib(crm_node_t *node, int flags, xmlNode *parent, const char *source) { - xmlNode *tmp1 = NULL; - const char *join = NULL; - crm_node_t *node = value; - struct update_data_s *data = (struct update_data_s *)user_data; + const char *value = NULL; + xmlNode *node_state = create_xml_node(parent, XML_CIB_TAG_STATE); - data->state = XML_BOOLEAN_NO; - if (safe_str_eq(node->state, CRM_NODE_ACTIVE)) { - data->state = XML_BOOLEAN_YES; + set_uuid(node_state, XML_ATTR_UUID, node->uname); + crm_xml_add(node_state, XML_ATTR_UNAME, node->uname); + + if(flags & node_update_cluster) { + if (safe_str_eq(node->state, CRM_NODE_ACTIVE)) { + value = XML_BOOLEAN_YES; + } else if(node->state) { + value = XML_BOOLEAN_NO; + } else { + value = NULL; + } + crm_xml_add(node_state, XML_NODE_IN_CLUSTER, value); } - crm_debug("Updating %s: %s (overwrite=%s) hash_size=%d", - node->uname, data->state, data->overwrite_join ? "true" : "false", - g_hash_table_size(confirmed_nodes)); + if(flags & node_update_peer) { + value = OFFLINESTATUS; + if (node->processes & proc_flags) { + value = ONLINESTATUS; + } + crm_xml_add(node_state, XML_NODE_IS_PEER, value); + } - if (data->overwrite_join) { - if ((node->processes & proc_flags) == FALSE) { - join = CRMD_JOINSTATE_DOWN; + if(flags & node_update_join) { + const char *peer_member = g_hash_table_lookup(confirmed_nodes, node->uname); + if (peer_member != NULL) { + value = CRMD_JOINSTATE_MEMBER; } else { - const char *peer_member = g_hash_table_lookup(confirmed_nodes, node->uname); + value = CRMD_JOINSTATE_DOWN; + } + crm_xml_add(node_state, XML_NODE_JOIN_STATE, value); + } + + if(flags & node_update_expected) { + const char *peer_member = g_hash_table_lookup(confirmed_nodes, node->uname); - if (peer_member != NULL) { - join = CRMD_JOINSTATE_MEMBER; - } else { - join = CRMD_JOINSTATE_PENDING; - } + if (peer_member != NULL) { + value = CRMD_JOINSTATE_MEMBER; + } else { + value = CRMD_JOINSTATE_DOWN; } + crm_xml_add(node_state, XML_NODE_EXPECTED, value); } + + crm_xml_add(node_state, XML_ATTR_ORIGIN, source); + + return node_state; +} + +static void +ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data) +{ + crm_node_t *node = value; + struct update_data_s *data = (struct update_data_s *)user_data; + + do_update_node_cib(node, data->flags, data->parent, data->caller); +} - tmp1 = - create_node_state(node->uname, - data->state, - (node->processes & proc_flags) ? ONLINESTATUS : OFFLINESTATUS, join, - NULL, FALSE, data->caller); +static void +create_cib_node_definition(gpointer key, gpointer value, gpointer user_data) +{ + crm_node_t *node = value; + xmlNode *cib_nodes = user_data; + xmlNode *cib_new_node = NULL; - add_node_copy(data->updates, tmp1); - free_xml(tmp1); + crm_trace("Creating node entry for %s/%s", node->uname, node->uuid); + cib_new_node = create_xml_node(cib_nodes, XML_CIB_TAG_NODE); + crm_xml_add(cib_new_node, XML_ATTR_ID, node->uuid); + crm_xml_add(cib_new_node, XML_ATTR_UNAME, node->uname); } void -do_update_cib_nodes(gboolean overwrite, const char *caller) +populate_cib_nodes(enum node_update_flags flags, const char *source) { int call_id = 0; + gboolean from_hashtable = TRUE; int call_options = cib_scope_local | cib_quorum_override; - struct update_data_s update_data; - xmlNode *fragment = NULL; - - if (crm_peer_cache == NULL) { - /* We got a replace notification before being connected to - * the CCM. - * So there is no need to update the local CIB with our values - * - since we have none. - */ - return; + xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); - } else if (AM_I_DC == FALSE) { - return; +#if SUPPORT_HEARTBEAT + if (is_not_set(flags, node_update_quick) && is_heartbeat_cluster()) { + from_hashtable = heartbeat_initialize_nodelist(fsa_cluster_conn, FALSE, node_list); } +#endif - fragment = create_xml_node(NULL, XML_CIB_TAG_STATUS); +#if SUPPORT_COROSYNC + if (is_not_set(flags, node_update_quick) && is_corosync_cluster()) { + from_hashtable = corosync_initialize_nodelist(NULL, FALSE, node_list); + } +#endif + + if(from_hashtable) { + /* if(uname_is_uuid()) { */ + /* g_hash_table_foreach(crm_peer_id_cache, create_cib_node_definition, node_list); */ + /* } else { */ + g_hash_table_foreach(crm_peer_cache, create_cib_node_definition, node_list); + /* } */ + } - update_data.caller = caller; - update_data.updates = fragment; - update_data.overwrite_join = overwrite; + crm_trace("Populating section from %s", from_hashtable?"hashtable":"cluster"); - g_hash_table_foreach(crm_peer_cache, ghash_update_cib_node, &update_data); + fsa_cib_update(XML_CIB_TAG_NODES, node_list, call_options, call_id, NULL); + add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, default_cib_update_callback); - fsa_cib_update(XML_CIB_TAG_STATUS, fragment, call_options, call_id, NULL); - add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, crmd_node_update_complete); - last_peer_update = call_id; + free_xml(node_list); + + if (crm_peer_cache != NULL && AM_I_DC) { + /* + * There is no need to update the local CIB with our values if + * we've not seen valid membership data + */ + struct update_data_s update_data; + node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS); - free_xml(fragment); + update_data.caller = source; + update_data.parent = node_list; + update_data.flags = flags; + + g_hash_table_foreach(crm_peer_cache, ghash_update_cib_node, &update_data); + + fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL); + add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, crmd_node_update_complete); + last_peer_update = call_id; + + free_xml(node_list); + } } static void cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { fsa_data_t *msg_data = NULL; if (rc == pcmk_ok) { crm_trace("Quorum update %d complete", call_id); } else { crm_err("Quorum update %d failed", call_id); crm_log_xml_debug(msg, "failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void crm_update_quorum(gboolean quorum, gboolean force_update) { ever_had_quorum |= quorum; if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) { int call_id = 0; xmlNode *update = NULL; int call_options = cib_scope_local | cib_quorum_override; update = create_xml_node(NULL, XML_TAG_CIB); crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum); set_uuid(update, XML_ATTR_DC_UUID, fsa_our_uname); fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL); crm_debug("Updating quorum status to %s (call=%d)", quorum ? "true" : "false", call_id); add_cib_op_callback(fsa_cib_conn, call_id, FALSE, NULL, cib_quorum_update_complete); free_xml(update); } fsa_has_quorum = quorum; } diff --git a/crmd/membership.h b/crmd/membership.h index 40bebed7ba..a67748c0ee 100644 --- a/crmd/membership.h +++ b/crmd/membership.h @@ -1,7 +1,6 @@ -void ghash_update_cib_node(gpointer key, gpointer value, gpointer user_data); void reap_dead_ccm_nodes(gpointer key, gpointer value, gpointer user_data); void post_cache_update(int instance); extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source); #define proc_flags (crm_proc_crmd | crm_proc_cpg) diff --git a/crmd/messages.c b/crmd/messages.c index 035cd5ba4b..db492b6923 100644 --- a/crmd/messages.c +++ b/crmd/messages.c @@ -1,984 +1,975 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include GListPtr fsa_message_queue = NULL; extern void crm_shutdown(int nsig); void handle_response(xmlNode * stored_msg); enum crmd_fsa_input handle_request(xmlNode * stored_msg); enum crmd_fsa_input handle_shutdown_request(xmlNode * stored_msg); #ifdef MSG_LOG # define ROUTER_RESULT(x) crm_trace("Router result: %s", x); \ crm_log_xml_trace(msg, "router.log"); #else # define ROUTER_RESULT(x) crm_trace("Router result: %s", x) #endif /* debug only, can wrap all it likes */ int last_data_id = 0; void register_fsa_error_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input, fsa_data_t * cur_data, void *new_data, const char *raised_from) { /* save the current actions if any */ if (fsa_actions != A_NOTHING) { register_fsa_input_adv(cur_data ? cur_data->fsa_cause : C_FSA_INTERNAL, I_NULL, cur_data ? cur_data->data : NULL, fsa_actions, TRUE, __FUNCTION__); } /* reset the action list */ fsa_actions = A_NOTHING; /* register the error */ register_fsa_input_adv(cause, input, new_data, A_NOTHING, TRUE, raised_from); } int register_fsa_input_adv(enum crmd_fsa_cause cause, enum crmd_fsa_input input, void *data, long long with_actions, gboolean prepend, const char *raised_from) { unsigned old_len = g_list_length(fsa_message_queue); fsa_data_t *fsa_data = NULL; last_data_id++; CRM_CHECK(raised_from != NULL, raised_from = ""); crm_trace("%s %s FSA input %d (%s) (cause=%s) %s data", raised_from, prepend ? "prepended" : "appended", last_data_id, fsa_input2string(input), fsa_cause2string(cause), data ? "with" : "without"); if (input == I_WAIT_FOR_EVENT) { do_fsa_stall = TRUE; crm_debug("Stalling the FSA pending further input: cause=%s", fsa_cause2string(cause)); if (old_len > 0) { crm_warn("%s stalled the FSA with pending inputs", raised_from); fsa_dump_queue(LOG_DEBUG); } if (data == NULL) { set_bit(fsa_actions, with_actions); with_actions = A_NOTHING; return 0; } crm_err("%s stalled the FSA with data - this may be broken", raised_from); } if (input == I_NULL && with_actions == A_NOTHING /* && data == NULL */ ) { /* no point doing anything */ crm_err("Cannot add entry to queue: no input and no action"); return 0; } fsa_data = calloc(1, sizeof(fsa_data_t)); fsa_data->id = last_data_id; fsa_data->fsa_input = input; fsa_data->fsa_cause = cause; fsa_data->origin = raised_from; fsa_data->data = NULL; fsa_data->data_type = fsa_dt_none; fsa_data->actions = with_actions; if (with_actions != A_NOTHING) { crm_trace("Adding actions %.16llx to input", with_actions); } if (data != NULL) { switch (cause) { case C_FSA_INTERNAL: case C_CRMD_STATUS_CALLBACK: case C_IPC_MESSAGE: case C_HA_MESSAGE: crm_trace("Copying %s data from %s as a HA msg", fsa_cause2string(cause), raised_from); CRM_CHECK(((ha_msg_input_t *) data)->msg != NULL, crm_err("Bogus data from %s", raised_from)); fsa_data->data = copy_ha_msg_input(data); fsa_data->data_type = fsa_dt_ha_msg; break; case C_LRM_OP_CALLBACK: crm_trace("Copying %s data from %s as lrmd_event_data_t", fsa_cause2string(cause), raised_from); fsa_data->data = lrmd_copy_event((lrmd_event_data_t *) data); fsa_data->data_type = fsa_dt_lrm; break; case C_CCM_CALLBACK: case C_SUBSYSTEM_CONNECT: case C_LRM_MONITOR_CALLBACK: case C_TIMER_POPPED: case C_SHUTDOWN: case C_HEARTBEAT_FAILED: case C_HA_DISCONNECT: case C_ILLEGAL: case C_UNKNOWN: case C_STARTUP: crm_err("Copying %s data (from %s)" " not yet implemented", fsa_cause2string(cause), raised_from); exit(1); break; } crm_trace("%s data copied", fsa_cause2string(fsa_data->fsa_cause)); } /* make sure to free it properly later */ if (prepend) { crm_trace("Prepending input"); fsa_message_queue = g_list_prepend(fsa_message_queue, fsa_data); } else { fsa_message_queue = g_list_append(fsa_message_queue, fsa_data); } crm_trace("Queue len: %d", g_list_length(fsa_message_queue)); fsa_dump_queue(LOG_DEBUG_2); if (old_len == g_list_length(fsa_message_queue)) { crm_err("Couldnt add message to the queue"); } if (fsa_source) { crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); } return last_data_id; } void fsa_dump_queue(int log_level) { int offset = 0; GListPtr lpc = NULL; for (lpc = fsa_message_queue; lpc != NULL; lpc = lpc->next) { fsa_data_t *data = (fsa_data_t *) lpc->data; do_crm_log_unlikely(log_level, "queue[%d(%d)]: input %s raised by %s()\t(cause=%s)", offset++, data->id, fsa_input2string(data->fsa_input), data->origin, fsa_cause2string(data->fsa_cause)); } } ha_msg_input_t * copy_ha_msg_input(ha_msg_input_t * orig) { ha_msg_input_t *copy = NULL; xmlNodePtr data = NULL; if (orig != NULL) { crm_trace("Copy msg"); data = copy_xml(orig->msg); } else { crm_trace("No message to copy"); } copy = new_ha_msg_input(data); if (orig && orig->msg != NULL) { CRM_CHECK(copy->msg != NULL, crm_err("copy failed")); } return copy; } void delete_fsa_input(fsa_data_t * fsa_data) { lrmd_event_data_t *op = NULL; xmlNode *foo = NULL; if (fsa_data == NULL) { return; } crm_trace("About to free %s data", fsa_cause2string(fsa_data->fsa_cause)); if (fsa_data->data != NULL) { switch (fsa_data->data_type) { case fsa_dt_ha_msg: delete_ha_msg_input(fsa_data->data); break; case fsa_dt_xml: foo = fsa_data->data; free_xml(foo); break; case fsa_dt_lrm: op = (lrmd_event_data_t *) fsa_data->data; lrmd_free_event(op); break; case fsa_dt_none: if (fsa_data->data != NULL) { crm_err("Dont know how to free %s data from %s", fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); exit(1); } break; } crm_trace("%s data freed", fsa_cause2string(fsa_data->fsa_cause)); } free(fsa_data); } /* returns the next message */ fsa_data_t * get_message(void) { fsa_data_t *message = g_list_nth_data(fsa_message_queue, 0); fsa_message_queue = g_list_remove(fsa_message_queue, message); crm_trace("Processing input %d", message->id); return message; } /* returns the current head of the FIFO queue */ gboolean is_message(void) { return (g_list_length(fsa_message_queue) > 0); } void * fsa_typed_data_adv(fsa_data_t * fsa_data, enum fsa_data_type a_type, const char *caller) { void *ret_val = NULL; if (fsa_data == NULL) { crm_err("%s: No FSA data available", caller); } else if (fsa_data->data == NULL) { crm_err("%s: No message data available. Origin: %s", caller, fsa_data->origin); } else if (fsa_data->data_type != a_type) { crm_crit( "%s: Message data was the wrong type! %d vs. requested=%d." " Origin: %s", caller, fsa_data->data_type, a_type, fsa_data->origin); CRM_ASSERT(fsa_data->data_type == a_type); } else { ret_val = fsa_data->data; } return ret_val; } /* A_MSG_ROUTE */ void do_msg_route(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); route_message(msg_data->fsa_cause, input->msg); } void route_message(enum crmd_fsa_cause cause, xmlNode * input) { ha_msg_input_t fsa_input; enum crmd_fsa_input result = I_NULL; fsa_input.msg = input; CRM_CHECK(cause == C_IPC_MESSAGE || cause == C_HA_MESSAGE, return); /* try passing the buck first */ if (relay_message(input, cause == C_IPC_MESSAGE)) { return; } /* handle locally */ result = handle_message(input); /* done or process later? */ switch (result) { case I_NULL: case I_CIB_OP: case I_ROUTER: case I_NODE_JOIN: case I_JOIN_REQUEST: case I_JOIN_RESULT: break; default: /* Defering local processing of message */ register_fsa_input_later(cause, result, &fsa_input); return; } if (result != I_NULL) { /* add to the front of the queue */ register_fsa_input(cause, result, &fsa_input); } } gboolean relay_message(xmlNode * msg, gboolean originated_locally) { int dest = 1; int is_for_dc = 0; int is_for_dcib = 0; int is_for_te = 0; int is_for_crm = 0; int is_for_cib = 0; int is_local = 0; gboolean processing_complete = FALSE; const char *host_to = crm_element_value(msg, F_CRM_HOST_TO); const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); const char *type = crm_element_value(msg, F_TYPE); const char *msg_error = NULL; crm_trace("Routing message %s", crm_element_value(msg, XML_ATTR_REFERENCE)); if (msg == NULL) { msg_error = "Cannot route empty message"; } else if (safe_str_eq(CRM_OP_HELLO, crm_element_value(msg, F_CRM_TASK))) { /* quietly ignore */ processing_complete = TRUE; } else if (safe_str_neq(type, T_CRM)) { msg_error = "Bad message type"; } else if (sys_to == NULL) { msg_error = "Bad message destination: no subsystem"; } if (msg_error != NULL) { processing_complete = TRUE; crm_err("%s", msg_error); crm_log_xml_warn(msg, "bad msg"); } if (processing_complete) { return TRUE; } processing_complete = TRUE; is_for_dc = (strcasecmp(CRM_SYSTEM_DC, sys_to) == 0); is_for_dcib = (strcasecmp(CRM_SYSTEM_DCIB, sys_to) == 0); is_for_te = (strcasecmp(CRM_SYSTEM_TENGINE, sys_to) == 0); is_for_cib = (strcasecmp(CRM_SYSTEM_CIB, sys_to) == 0); is_for_crm = (strcasecmp(CRM_SYSTEM_CRMD, sys_to) == 0); is_local = 0; if (host_to == NULL || strlen(host_to) == 0) { if (is_for_dc || is_for_te) { is_local = 0; } else if (is_for_crm && originated_locally) { is_local = 0; } else { is_local = 1; } } else if (safe_str_eq(fsa_our_uname, host_to)) { is_local = 1; } if (is_for_dc || is_for_dcib || is_for_te) { if (AM_I_DC && is_for_te) { ROUTER_RESULT("Message result: Local relay"); send_msg_via_ipc(msg, sys_to); } else if (AM_I_DC) { ROUTER_RESULT("Message result: DC/CRMd process"); processing_complete = FALSE; /* more to be done by caller */ } else if (originated_locally && safe_str_neq(sys_from, CRM_SYSTEM_PENGINE) && safe_str_neq(sys_from, CRM_SYSTEM_TENGINE)) { /* Neither the TE or PE should be sending messages * to DC's on other nodes * * By definition, if we are no longer the DC, then * the PE or TE's data should be discarded */ #if SUPPORT_COROSYNC if (is_openais_cluster()) { dest = text2msg_type(sys_to); } #endif ROUTER_RESULT("Message result: External relay to DC"); send_cluster_message(host_to, dest, msg, TRUE); } else { /* discard */ ROUTER_RESULT("Message result: Discard, not DC"); } } else if (is_local && (is_for_crm || is_for_cib)) { ROUTER_RESULT("Message result: CRMd process"); processing_complete = FALSE; /* more to be done by caller */ } else if (is_local) { ROUTER_RESULT("Message result: Local relay"); send_msg_via_ipc(msg, sys_to); } else { #if SUPPORT_COROSYNC if (is_openais_cluster()) { dest = text2msg_type(sys_to); } #endif ROUTER_RESULT("Message result: External relay"); send_cluster_message(host_to, dest, msg, TRUE); } return processing_complete; } static gboolean process_hello_message(xmlNode * hello, char **uuid, char **client_name, char **major_version, char **minor_version) { const char *local_uuid; const char *local_client_name; const char *local_major_version; const char *local_minor_version; *uuid = NULL; *client_name = NULL; *major_version = NULL; *minor_version = NULL; if (hello == NULL) { return FALSE; } local_uuid = crm_element_value(hello, "client_uuid"); local_client_name = crm_element_value(hello, "client_name"); local_major_version = crm_element_value(hello, "major_version"); local_minor_version = crm_element_value(hello, "minor_version"); if (local_uuid == NULL || strlen(local_uuid) == 0) { crm_err("Hello message was not valid (field %s not found)", "uuid"); return FALSE; } else if (local_client_name == NULL || strlen(local_client_name) == 0) { crm_err("Hello message was not valid (field %s not found)", "client name"); return FALSE; } else if (local_major_version == NULL || strlen(local_major_version) == 0) { crm_err("Hello message was not valid (field %s not found)", "major version"); return FALSE; } else if (local_minor_version == NULL || strlen(local_minor_version) == 0) { crm_err("Hello message was not valid (field %s not found)", "minor version"); return FALSE; } *uuid = strdup(local_uuid); *client_name = strdup(local_client_name); *major_version = strdup(local_major_version); *minor_version = strdup(local_minor_version); crm_trace("Hello message ok"); return TRUE; } gboolean crmd_authorize_message(xmlNode * client_msg, crmd_client_t * curr_client) { /* check the best case first */ const char *sys_from = crm_element_value(client_msg, F_CRM_SYS_FROM); char *uuid = NULL; char *client_name = NULL; char *major_version = NULL; char *minor_version = NULL; const char *filtered_from; gpointer table_key = NULL; gboolean auth_result = FALSE; gboolean can_reply = FALSE; /* no-one has registered with this id */ xmlNode *xml = NULL; const char *op = crm_element_value(client_msg, F_CRM_TASK); if (safe_str_neq(CRM_OP_HELLO, op)) { if (sys_from == NULL) { crm_warn("Message [%s] was had no value for %s... discarding", crm_element_value(client_msg, XML_ATTR_REFERENCE), F_CRM_SYS_FROM); return FALSE; } filtered_from = sys_from; /* The CIB can have two names on the DC */ if (strcasecmp(sys_from, CRM_SYSTEM_DCIB) == 0) filtered_from = CRM_SYSTEM_CIB; if (g_hash_table_lookup(ipc_clients, filtered_from) != NULL) { can_reply = TRUE; /* reply can be routed */ } crm_trace("Message reply can%s be routed from %s.", can_reply ? "" : " not", sys_from); if (can_reply == FALSE) { crm_warn("Message [%s] not authorized", crm_element_value(client_msg, XML_ATTR_REFERENCE)); } return can_reply; } crm_trace("received client join msg"); crm_log_xml_trace(client_msg, "join"); xml = get_message_xml(client_msg, F_CRM_DATA); auth_result = process_hello_message(xml, &uuid, &client_name, &major_version, &minor_version); if (auth_result == TRUE) { if (client_name == NULL || uuid == NULL) { crm_err("Bad client details (client_name=%s, uuid=%s)", crm_str(client_name), crm_str(uuid)); auth_result = FALSE; } } if (auth_result == TRUE) { /* check version */ int mav = atoi(major_version); int miv = atoi(minor_version); crm_trace("Checking client version number"); if (mav < 0 || miv < 0) { crm_err("Client version (%d:%d) is not acceptable", mav, miv); auth_result = FALSE; } } table_key = (gpointer) generate_hash_key(client_name, uuid); if (auth_result == TRUE) { xmlNode *xml = NULL; crm_trace("Accepted client %s", crm_str(table_key)); curr_client->table_key = table_key; curr_client->sub_sys = strdup(client_name); curr_client->uuid = strdup(uuid); g_hash_table_insert(ipc_clients, table_key, curr_client->ipc); xml = create_hello_message("n/a", CRM_SYSTEM_CRMD, "0", "1"); crm_ipcs_send(curr_client->ipc, xml, FALSE); free_xml(xml); crm_trace("Updated client list with %s", crm_str(table_key)); crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); } else { free(table_key); crm_warn("Rejected client logon request"); qb_ipcs_disconnect(curr_client->ipc); } free(uuid); free(minor_version); free(major_version); free(client_name); /* hello messages should never be processed further */ return FALSE; } enum crmd_fsa_input handle_message(xmlNode * msg) { const char *type = NULL; CRM_CHECK(msg != NULL, return I_NULL); type = crm_element_value(msg, F_CRM_MSG_TYPE); if (crm_str_eq(type, XML_ATTR_REQUEST, TRUE)) { return handle_request(msg); } else if (crm_str_eq(type, XML_ATTR_RESPONSE, TRUE)) { handle_response(msg); return I_NULL; } crm_err("Unknown message type: %s", type); return I_NULL; } static enum crmd_fsa_input handle_failcount_op(xmlNode * stored_msg) { const char *rsc = NULL; xmlNode *xml_rsc = get_xpath_object("//" XML_CIB_TAG_RESOURCE, stored_msg, LOG_ERR); if (xml_rsc) { rsc = ID(xml_rsc); } if (rsc) { char *attr = NULL; crm_info("Removing failcount for %s", rsc); attr = crm_concat("fail-count", rsc, '-'); update_attrd(NULL, attr, NULL, NULL); free(attr); attr = crm_concat("last-failure", rsc, '-'); update_attrd(NULL, attr, NULL, NULL); free(attr); lrm_clear_last_failure(rsc); } else { crm_log_xml_warn(stored_msg, "invalid failcount op"); } return I_NULL; } enum crmd_fsa_input handle_request(xmlNode * stored_msg) { xmlNode *msg = NULL; const char *op = crm_element_value(stored_msg, F_CRM_TASK); /* Optimize this for the DC - it has the most to do */ if (op == NULL) { crm_log_xml_err(stored_msg, "Bad message"); return I_NULL; } /*========== DC-Only Actions ==========*/ if (AM_I_DC) { if (strcmp(op, CRM_OP_JOIN_ANNOUNCE) == 0) { return I_NODE_JOIN; } else if (strcmp(op, CRM_OP_JOIN_REQUEST) == 0) { return I_JOIN_REQUEST; } else if (strcmp(op, CRM_OP_JOIN_CONFIRM) == 0) { return I_JOIN_RESULT; } else if (strcmp(op, CRM_OP_SHUTDOWN) == 0) { const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); gboolean dc_match = safe_str_eq(host_from, fsa_our_dc); if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_info("Shutting ourselves down (DC)"); return I_STOP; } else if (dc_match) { crm_err("We didnt ask to be shut down, yet our" " TE is telling us too." " Better get out now!"); return I_TERMINATE; } else if (fsa_state != S_STOPPING) { crm_err("Another node is asking us to shutdown" " but we think we're ok."); return I_ELECTION; } } else if (strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0) { /* a slave wants to shut down */ /* create cib fragment and add to message */ return handle_shutdown_request(stored_msg); } } /*========== common actions ==========*/ if (strcmp(op, CRM_OP_NOVOTE) == 0) { ha_msg_input_t fsa_input; fsa_input.msg = stored_msg; register_fsa_input_adv(C_HA_MESSAGE, I_NULL, &fsa_input, A_ELECTION_COUNT | A_ELECTION_CHECK, FALSE, __FUNCTION__); } else if (strcmp(op, CRM_OP_CLEAR_FAILCOUNT) == 0) { return handle_failcount_op(stored_msg); } else if (strcmp(op, CRM_OP_VOTE) == 0) { /* count the vote and decide what to do after that */ ha_msg_input_t fsa_input; fsa_input.msg = stored_msg; register_fsa_input_adv(C_HA_MESSAGE, I_NULL, &fsa_input, A_ELECTION_COUNT | A_ELECTION_CHECK, FALSE, __FUNCTION__); /* Sometimes we _must_ go into S_ELECTION */ if (fsa_state == S_HALT) { crm_debug("Forcing an election from S_HALT"); return I_ELECTION; #if 0 } else if (AM_I_DC) { /* This is the old way of doing things but what is gained? */ return I_ELECTION; #endif } } else if (strcmp(op, CRM_OP_JOIN_OFFER) == 0) { crm_debug("Raising I_JOIN_OFFER: join-%s", crm_element_value(stored_msg, F_CRM_JOIN_ID)); return I_JOIN_OFFER; } else if (strcmp(op, CRM_OP_JOIN_ACKNAK) == 0) { crm_debug("Raising I_JOIN_RESULT: join-%s", crm_element_value(stored_msg, F_CRM_JOIN_ID)); return I_JOIN_RESULT; } else if (strcmp(op, CRM_OP_LRM_DELETE) == 0 || strcmp(op, CRM_OP_LRM_FAIL) == 0 || strcmp(op, CRM_OP_LRM_REFRESH) == 0 || strcmp(op, CRM_OP_REPROBE) == 0) { crm_xml_add(stored_msg, F_CRM_SYS_TO, CRM_SYSTEM_LRMD); return I_ROUTER; } else if (strcmp(op, CRM_OP_NOOP) == 0) { return I_NULL; } else if (strcmp(op, CRM_OP_LOCAL_SHUTDOWN) == 0) { crm_shutdown(SIGTERM); /*return I_SHUTDOWN; */ return I_NULL; /*========== (NOT_DC)-Only Actions ==========*/ } else if (AM_I_DC == FALSE && strcmp(op, CRM_OP_SHUTDOWN) == 0) { const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); gboolean dc_match = safe_str_eq(host_from, fsa_our_dc); if (dc_match || fsa_our_dc == NULL) { if (is_set(fsa_input_register, R_SHUTDOWN) == FALSE) { crm_err("We didn't ask to be shut down, yet our" " DC is telling us too."); set_bit(fsa_input_register, R_STAYDOWN); return I_STOP; } crm_info("Shutting down"); return I_STOP; } else { crm_warn("Discarding %s op from %s", op, host_from); } } else if (strcmp(op, CRM_OP_PING) == 0) { /* eventually do some stuff to figure out * if we /are/ ok */ const char *sys_to = crm_element_value(stored_msg, F_CRM_SYS_TO); xmlNode *ping = create_xml_node(NULL, XML_CRM_TAG_PING); crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); crm_xml_add(ping, XML_PING_ATTR_SYSFROM, sys_to); crm_xml_add(ping, "crmd_state", fsa_state2string(fsa_state)); /* Ok, so technically not so interesting, but CTS needs to see this */ crm_notice("Current ping state: %s", fsa_state2string(fsa_state)); msg = create_reply(stored_msg, ping); relay_message(msg, TRUE); free_xml(ping); free_xml(msg); } else if (strcmp(op, CRM_OP_RM_NODE_CACHE) == 0) { xmlNode *options = get_xpath_object("//"XML_TAG_OPTIONS, stored_msg, LOG_ERR); int id = 0; if (options) { crm_element_value_int(options, XML_ATTR_ID, &id); } if (id) { reap_crm_member(id); } } else { crm_err("Unexpected request (%s) sent to %s", op, AM_I_DC ? "the DC" : "non-DC node"); crm_log_xml_err(stored_msg, "Unexpected"); } return I_NULL; } void handle_response(xmlNode * stored_msg) { const char *op = crm_element_value(stored_msg, F_CRM_TASK); if (op == NULL) { crm_log_xml_err(stored_msg, "Bad message"); } else if (AM_I_DC && strcmp(op, CRM_OP_PECALC) == 0) { /* Check if the PE answer been superceeded by a subsequent request? */ const char *msg_ref = crm_element_value(stored_msg, XML_ATTR_REFERENCE); if (msg_ref == NULL) { crm_err("%s - Ignoring calculation with no reference", op); } else if (safe_str_eq(msg_ref, fsa_pe_ref)) { ha_msg_input_t fsa_input; fsa_input.msg = stored_msg; register_fsa_input_later(C_IPC_MESSAGE, I_PE_SUCCESS, &fsa_input); crm_trace("Completed: %s...", fsa_pe_ref); } else { crm_info("%s calculation %s is obsolete", op, msg_ref); } } else if (strcmp(op, CRM_OP_VOTE) == 0 || strcmp(op, CRM_OP_SHUTDOWN_REQ) == 0 || strcmp(op, CRM_OP_SHUTDOWN) == 0) { } else { const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); crm_err("Unexpected response (op=%s, src=%s) sent to the %s", op, host_from, AM_I_DC ? "DC" : "CRMd"); } } enum crmd_fsa_input handle_shutdown_request(xmlNode * stored_msg) { /* handle here to avoid potential version issues * where the shutdown message/proceedure may have * been changed in later versions. * * This way the DC is always in control of the shutdown */ char *now_s = NULL; time_t now = time(NULL); - xmlNode *node_state = NULL; const char *host_from = crm_element_value(stored_msg, F_CRM_HOST_FROM); if (host_from == NULL) { /* we're shutting down and the DC */ host_from = fsa_our_uname; } crm_info("Creating shutdown request for %s (state=%s)", host_from, fsa_state2string(fsa_state)); - crm_log_xml_trace(stored_msg, "message"); - node_state = create_node_state(host_from, NULL, NULL, NULL, - CRMD_STATE_INACTIVE, FALSE, __FUNCTION__); - - fsa_cib_anon_update(XML_CIB_TAG_STATUS, node_state, cib_quorum_override); - crm_log_xml_trace(node_state, "Shutdown update"); - free_xml(node_state); - now_s = crm_itoa(now); update_attrd(host_from, XML_CIB_ATTR_SHUTDOWN, now_s, NULL); free(now_s); /* will be picked up by the TE as long as its running */ return I_NULL; } /* msg is deleted by the time this returns */ extern gboolean process_te_message(xmlNode * msg, xmlNode * xml_data); gboolean send_msg_via_ipc(xmlNode * msg, const char *sys) { gboolean send_ok = TRUE; qb_ipcs_connection_t *client_channel; client_channel = (qb_ipcs_connection_t *) g_hash_table_lookup(ipc_clients, sys); if (crm_element_value(msg, F_CRM_HOST_FROM) == NULL) { crm_xml_add(msg, F_CRM_HOST_FROM, fsa_our_uname); } if (client_channel != NULL) { /* Transient clients such as crmadmin */ send_ok = crm_ipcs_send(client_channel, msg, TRUE); } else if (sys != NULL && strcmp(sys, CRM_SYSTEM_TENGINE) == 0) { xmlNode *data = get_message_xml(msg, F_CRM_DATA); process_te_message(msg, data); } else if (sys != NULL && strcmp(sys, CRM_SYSTEM_LRMD) == 0) { fsa_data_t fsa_data; ha_msg_input_t fsa_input; fsa_input.msg = msg; fsa_input.xml = get_message_xml(msg, F_CRM_DATA); fsa_data.id = 0; fsa_data.actions = 0; fsa_data.data = &fsa_input; fsa_data.fsa_input = I_MESSAGE; fsa_data.fsa_cause = C_IPC_MESSAGE; fsa_data.origin = __FUNCTION__; fsa_data.data_type = fsa_dt_ha_msg; #ifdef FSA_TRACE crm_trace("Invoking action A_LRM_INVOKE (%.16llx)", A_LRM_INVOKE); #endif do_lrm_invoke(A_LRM_INVOKE, C_IPC_MESSAGE, fsa_state, I_MESSAGE, &fsa_data); } else { crm_err("Unknown Sub-system (%s)... discarding message.", crm_str(sys)); send_ok = FALSE; } return send_ok; } ha_msg_input_t * new_ha_msg_input(xmlNode * orig) { ha_msg_input_t *input_copy = NULL; input_copy = calloc(1, sizeof(ha_msg_input_t)); input_copy->msg = orig; input_copy->xml = get_message_xml(input_copy->msg, F_CRM_DATA); return input_copy; } void delete_ha_msg_input(ha_msg_input_t * orig) { if (orig == NULL) { return; } free_xml(orig->msg); free(orig); } diff --git a/crmd/te_callbacks.c b/crmd/te_callbacks.c index 33c472418d..b2f19b5301 100644 --- a/crmd/te_callbacks.c +++ b/crmd/te_callbacks.c @@ -1,601 +1,601 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include /* For ONLINESTATUS etc */ void te_update_confirm(const char *event, xmlNode * msg); extern char *te_uuid; gboolean shuttingdown = FALSE; crm_graph_t *transition_graph; crm_trigger_t *transition_trigger = NULL; /* #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_CIB_TAG_STATE"[@uname='%s']"//"XML_LRM_TAG_RSC_OP"[@id='%s]" */ #define rsc_op_template "//"XML_TAG_DIFF_ADDED"//"XML_TAG_CIB"//"XML_LRM_TAG_RSC_OP"[@id='%s']" static const char * get_node_id(xmlNode * rsc_op) { xmlNode *node = rsc_op; while (node != NULL && safe_str_neq(XML_CIB_TAG_STATE, TYPE(node))) { node = node->parent; } CRM_CHECK(node != NULL, return NULL); return ID(node); } static void process_resource_updates(xmlXPathObject * xpathObj) { /* - + */ int lpc = 0, max = xpathObj->nodesetval->nodeNr; for (lpc = 0; lpc < max; lpc++) { xmlNode *rsc_op = getXpathResult(xpathObj, lpc); const char *node = get_node_id(rsc_op); process_graph_event(rsc_op, node); } } void te_update_diff(const char *event, xmlNode * msg) { int rc = -1; const char *op = NULL; xmlNode *diff = NULL; xmlXPathObject *xpathObj = NULL; int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; CRM_CHECK(msg != NULL, return); crm_element_value_int(msg, F_CIB_RC, &rc); if (transition_graph == NULL) { crm_trace("No graph"); return; } else if (rc < pcmk_ok) { crm_trace("Filter rc=%d (%s)", rc, pcmk_strerror(rc)); return; } else if (transition_graph->complete == TRUE && fsa_state != S_IDLE && fsa_state != S_TRANSITION_ENGINE && fsa_state != S_POLICY_ENGINE) { crm_trace("Filter state=%s, complete=%d", fsa_state2string(fsa_state), transition_graph->complete); return; } op = crm_element_value(msg, F_CIB_OPERATION); diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); cib_diff_version_details(diff, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); crm_debug("Processing diff (%s): %d.%d.%d -> %d.%d.%d (%s)", op, diff_del_admin_epoch, diff_del_epoch, diff_del_updates, diff_add_admin_epoch, diff_add_epoch, diff_add_updates, fsa_state2string(fsa_state)); log_cib_diff(LOG_DEBUG_2, diff, op); if (cib_config_changed(NULL, NULL, &diff)) { abort_transition(INFINITY, tg_restart, "Non-status change", diff); goto bail; /* configuration changed */ } /* Tickets Attributes - Added/Updated */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_TICKETS); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Ticket attribute: update", aborted); goto bail; } else if (xpathObj) { xmlXPathFreeObject(xpathObj); } /* Tickets Attributes - Removed */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_TICKETS); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Ticket attribute: removal", aborted); goto bail; } else if (xpathObj) { xmlXPathFreeObject(xpathObj); } /* Transient Attributes - Added/Updated */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_TRANSIENT_NODEATTRS "//" XML_CIB_TAG_NVPAIR); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { int lpc; for (lpc = 0; lpc < xpathObj->nodesetval->nodeNr; lpc++) { xmlNode *attr = getXpathResult(xpathObj, lpc); const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME); const char *value = NULL; if (safe_str_eq(CRM_OP_PROBED, name)) { value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE); } if (crm_is_true(value) == FALSE) { abort_transition(INFINITY, tg_restart, "Transient attribute: update", attr); crm_log_xml_trace(attr, "Abort"); goto bail; } } } else if (xpathObj) { xmlXPathFreeObject(xpathObj); } /* Transient Attributes - Removed */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_TRANSIENT_NODEATTRS); if (xpathObj && xpathObj->nodesetval->nodeNr > 0) { xmlNode *aborted = getXpathResult(xpathObj, 0); abort_transition(INFINITY, tg_restart, "Transient attribute: removal", aborted); goto bail; } else if (xpathObj) { xmlXPathFreeObject(xpathObj); } /* Check for node state updates... possibly from a shutdown we requested */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_STATE); if (xpathObj) { int lpc = 0, max = xpathObj->nodesetval->nodeNr; for (lpc = 0; lpc < max; lpc++) { xmlNode *node = getXpathResult(xpathObj, lpc); const char *event_node = crm_element_value(node, XML_ATTR_ID); const char *ccm_state = crm_element_value(node, XML_NODE_IN_CLUSTER); const char *shutdown_s = crm_element_value(node, XML_CIB_ATTR_SHUTDOWN); const char *crmd_state = crm_element_value(node, XML_NODE_IS_PEER); if (safe_str_eq(ccm_state, XML_BOOLEAN_FALSE) || safe_str_eq(crmd_state, CRMD_JOINSTATE_DOWN)) { crm_action_t *shutdown = match_down_event(0, event_node, NULL); if (shutdown != NULL) { const char *task = crm_element_value(shutdown->xml, XML_LRM_ATTR_TASK); if (safe_str_neq(task, CRM_OP_FENCE)) { /* Wait for stonithd to tell us it is complete via tengine_stonith_callback() */ crm_debug("Confirming %s op %d", task, shutdown->id); /* match->confirmed = TRUE; */ stop_te_timer(shutdown->timer); update_graph(transition_graph, shutdown); trigger_graph(); } } else { crm_info("Stonith/shutdown of %s not matched", event_node); abort_transition(INFINITY, tg_restart, "Node failure", node); } fail_incompletable_actions(transition_graph, event_node); } if (shutdown_s) { int shutdown = crm_parse_int(shutdown_s, NULL); if (shutdown > 0) { crm_info("Aborting on " XML_CIB_ATTR_SHUTDOWN " attribute for %s", event_node); abort_transition(INFINITY, tg_restart, "Shutdown request", node); } } } xmlXPathFreeObject(xpathObj); } /* * Check for and fast-track the processing of LRM refreshes * In large clusters this can result in _huge_ speedups * * Unfortunately we can only do so when there are no pending actions * Otherwise we could miss updates we're waiting for and stall * */ xpathObj = NULL; if (transition_graph->pending == 0) { xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RESOURCE); } if (xpathObj) { int updates = xpathObj->nodesetval->nodeNr; if (updates > 1) { /* Updates by, or in response to, TE actions will never contain updates * for more than one resource at a time */ crm_debug("Detected LRM refresh - %d resources updated: Skipping all resource events", updates); crm_log_xml_trace(diff, "lrm-refresh"); abort_transition(INFINITY, tg_restart, "LRM Refresh", NULL); goto bail; } xmlXPathFreeObject(xpathObj); } /* Process operation updates */ xpathObj = xpath_search(diff, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_LRM_TAG_RSC_OP); if (xpathObj) { process_resource_updates(xpathObj); xmlXPathFreeObject(xpathObj); } /* Detect deleted (as opposed to replaced or added) actions - eg. crm_resource -C */ xpathObj = xpath_search(diff, "//" XML_TAG_DIFF_REMOVED "//" XML_LRM_TAG_RSC_OP); if (xpathObj) { int lpc = 0, max = xpathObj->nodesetval->nodeNr; for (lpc = 0; lpc < max; lpc++) { int max = 0; const char *op_id = NULL; char *rsc_op_xpath = NULL; xmlXPathObject *op_match = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_CHECK(match != NULL, continue); op_id = ID(match); max = strlen(rsc_op_template) + strlen(op_id) + 1; rsc_op_xpath = calloc(1, max); snprintf(rsc_op_xpath, max, rsc_op_template, op_id); op_match = xpath_search(diff, rsc_op_xpath); if (op_match == NULL || op_match->nodesetval->nodeNr == 0) { /* Prevent false positives by matching cancelations too */ const char *node = get_node_id(match); crm_action_t *cancelled = get_cancel_action(op_id, node); if (cancelled == NULL) { crm_debug("No match for deleted action %s (%s on %s)", rsc_op_xpath, op_id, node); abort_transition(INFINITY, tg_restart, "Resource op removal", match); if (op_match) { xmlXPathFreeObject(op_match); } free(rsc_op_xpath); goto bail; } else { crm_debug("Deleted lrm_rsc_op %s on %s was for graph event %d", op_id, node, cancelled->id); } } if (op_match) { xmlXPathFreeObject(op_match); } free(rsc_op_xpath); } } bail: if (xpathObj) { xmlXPathFreeObject(xpathObj); } } gboolean process_te_message(xmlNode * msg, xmlNode * xml_data) { const char *from = crm_element_value(msg, F_ORIG); const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO); const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM); const char *ref = crm_element_value(msg, XML_ATTR_REFERENCE); const char *op = crm_element_value(msg, F_CRM_TASK); const char *type = crm_element_value(msg, F_CRM_MSG_TYPE); crm_trace("Processing %s (%s) message", op, ref); crm_log_xml_trace(msg, "ipc"); if (op == NULL) { /* error */ } else if (sys_to == NULL || strcasecmp(sys_to, CRM_SYSTEM_TENGINE) != 0) { crm_trace("Bad sys-to %s", crm_str(sys_to)); return FALSE; } else if (safe_str_eq(op, CRM_OP_INVOKE_LRM) && safe_str_eq(sys_from, CRM_SYSTEM_LRMD) /* && safe_str_eq(type, XML_ATTR_RESPONSE) */ ) { xmlXPathObject *xpathObj = NULL; crm_log_xml_trace(msg, "Processing (N)ACK"); crm_debug("Processing (N)ACK %s from %s", crm_element_value(msg, XML_ATTR_REFERENCE), from); xpathObj = xpath_search(xml_data, "//" XML_LRM_TAG_RSC_OP); if (xpathObj) { process_resource_updates(xpathObj); xmlXPathFreeObject(xpathObj); xpathObj = NULL; } else { crm_log_xml_err(msg, "Invalid (N)ACK"); return FALSE; } } else { crm_err("Unknown command: %s::%s from %s", type, op, sys_from); } crm_trace("finished processing message"); return TRUE; } GHashTable *stonith_failures = NULL; struct st_fail_rec { int count; }; gboolean too_many_st_failures(void) { GHashTableIter iter; const char *key = NULL; struct st_fail_rec *value = NULL; if(stonith_failures == NULL) { return FALSE; } g_hash_table_iter_init(&iter, stonith_failures); while (g_hash_table_iter_next(&iter, (gpointer *) & key, (gpointer *) & value)) { if(value->count > 10) { crm_notice("Too many failures to fence %s (%d), giving up", key, value->count); return TRUE; } } return FALSE; } void tengine_stonith_callback(stonith_t * stonith, const xmlNode * msg, int call_id, int rc, xmlNode * output, void *userdata) { char *uuid = NULL; int target_rc = -1; int stonith_id = -1; int transition_id = -1; crm_action_t *action = NULL; struct st_fail_rec *rec = NULL; CRM_CHECK(userdata != NULL, return); crm_log_xml_trace(output, "StonithOp"); crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata, pcmk_strerror(rc), rc); if (AM_I_DC == FALSE) { return; } /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */ /* op->call_id, op->optype, op->node_name, op->op_result, */ /* (char *)op->node_list, op->private_data); */ /* filter out old STONITH actions */ CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, &target_rc), crm_err("Invalid event detected"); goto bail; ); if (transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) || transition_graph->id != transition_id) { crm_info("Ignoring STONITH action initiated outside of the current transition"); goto bail; } /* this will mark the event complete if a match is found */ action = get_action(stonith_id, FALSE); if (action == NULL) { crm_err("Stonith action not matched"); goto bail; } stop_te_timer(action->timer); if(stonith_failures == NULL) { stonith_failures = g_hash_table_new_full( crm_str_hash, g_str_equal, g_hash_destroy_str, free); } if (rc == pcmk_ok) { const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); crm_debug("Stonith operation %d for %s passed", call_id, target); if (action->confirmed == FALSE) { action->confirmed = TRUE; send_stonith_update(action, target, uuid); } rec = g_hash_table_lookup(stonith_failures, target); if(rec) { rec->count = 0; } } else { const char *target = crm_element_value_const(action->xml, XML_LRM_ATTR_TARGET); const char *allow_fail = crm_meta_value(action->params, XML_ATTR_TE_ALLOWFAIL); action->failed = TRUE; if (crm_is_true(allow_fail) == FALSE) { crm_notice("Stonith operation %d for %s failed (%s): aborting transition.", call_id, target, pcmk_strerror(rc)); abort_transition(INFINITY, tg_restart, "Stonith failed", NULL); } rec = g_hash_table_lookup(stonith_failures, target); if(rec) { rec->count++; } else { rec = malloc(sizeof(struct st_fail_rec)); rec->count = 1; g_hash_table_insert(stonith_failures, strdup(target), rec); } } update_graph(transition_graph, action); trigger_graph(); bail: free(userdata); free(uuid); return; } void cib_fencing_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc < pcmk_ok) { crm_err("Fencing update %d for %s: failed - %s (%d)", call_id, (char *)user_data, pcmk_strerror(rc), rc); crm_log_xml_warn(msg, "Failed update"); abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL); } else { crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data); } free(user_data); } void cib_action_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc < pcmk_ok) { crm_err("Update %d FAILED: %s", call_id, pcmk_strerror(rc)); } } void cib_failcount_updated(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { if (rc < pcmk_ok) { crm_err("Update %d FAILED: %s", call_id, pcmk_strerror(rc)); } } gboolean action_timer_callback(gpointer data) { crm_action_timer_t *timer = NULL; CRM_CHECK(data != NULL, return FALSE); timer = (crm_action_timer_t *) data; stop_te_timer(timer); crm_warn("Timer popped (timeout=%d, abort_level=%d, complete=%s)", timer->timeout, transition_graph->abort_priority, transition_graph->complete ? "true" : "false"); CRM_CHECK(timer->action != NULL, return FALSE); if (transition_graph->complete) { crm_warn("Ignoring timeout while not in transition"); } else if (timer->reason == timeout_action_warn) { print_action(LOG_WARNING, "Action missed its timeout: ", timer->action); /* Don't check the FSA state * * We might also be in S_INTEGRATION or some other state waiting for this * action so we can close the transition and continue */ } else { /* fail the action */ gboolean send_update = TRUE; const char *task = crm_element_value(timer->action->xml, XML_LRM_ATTR_TASK); print_action(LOG_ERR, "Aborting transition, action lost: ", timer->action); timer->action->failed = TRUE; timer->action->confirmed = TRUE; abort_transition(INFINITY, tg_restart, "Action lost", NULL); update_graph(transition_graph, timer->action); trigger_graph(); if (timer->action->type != action_type_rsc) { send_update = FALSE; } else if (safe_str_eq(task, "cancel")) { /* we dont need to update the CIB with these */ send_update = FALSE; } if (send_update) { /* cib_action_update(timer->action, PCMK_LRM_OP_PENDING, PCMK_EXECRA_STATUS_UNKNOWN); */ cib_action_update(timer->action, PCMK_LRM_OP_TIMEOUT, PCMK_EXECRA_UNKNOWN_ERROR); } } return FALSE; } diff --git a/include/crm/cluster.h b/include/crm/cluster.h index 824858b196..a0dc16b45b 100644 --- a/include/crm/cluster.h +++ b/include/crm/cluster.h @@ -1,210 +1,209 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_CLUSTER__H # define CRM_COMMON_CLUSTER__H # include # include # if SUPPORT_HEARTBEAT # include # include # endif extern gboolean crm_have_quorum; extern GHashTable *crm_peer_cache; extern GHashTable *crm_peer_id_cache; extern unsigned long long crm_peer_seq; # ifndef CRM_SERVICE # define CRM_SERVICE PCMK_SERVICE_ID # endif /* *INDENT-OFF* */ #define CRM_NODE_LOST "lost" #define CRM_NODE_MEMBER "member" #define CRM_NODE_ACTIVE CRM_NODE_MEMBER -#define CRM_NODE_INACTIVE CRM_NODE_LOST #define CRM_NODE_EVICTED "evicted" enum crm_proc_flag { crm_proc_none = 0x00000001, /* 3 messaging types */ crm_proc_heartbeat = 0x01000000, crm_proc_plugin = 0x00000002, crm_proc_cpg = 0x04000000, crm_proc_lrmd = 0x00000010, crm_proc_cib = 0x00000100, crm_proc_crmd = 0x00000200, crm_proc_attrd = 0x00001000, crm_proc_pe = 0x00010000, crm_proc_te = 0x00020000, crm_proc_stonithd = 0x00002000, crm_proc_stonith_ng= 0x00100000, crm_proc_mgmtd = 0x00040000, }; /* *INDENT-ON* */ typedef struct crm_peer_node_s { uint32_t id; uint64_t born; uint64_t last_seen; int32_t votes; uint32_t processes; char *uname; char *state; char *uuid; char *addr; char *version; } crm_node_t; static inline const char * peer2text(enum crm_proc_flag proc) { const char *text = "unknown"; switch (proc) { case crm_proc_none: text = "unknown"; break; case crm_proc_plugin: text = "ais"; break; case crm_proc_heartbeat: text = "heartbeat"; break; case crm_proc_cib: text = "cib"; break; case crm_proc_crmd: text = "crmd"; break; case crm_proc_pe: text = "pengine"; break; case crm_proc_te: text = "tengine"; break; case crm_proc_lrmd: text = "lrmd"; break; case crm_proc_attrd: text = "attrd"; break; case crm_proc_stonithd: text = "stonithd"; break; case crm_proc_stonith_ng: text = "stonith-ng"; break; case crm_proc_mgmtd: text = "mgmtd"; break; case crm_proc_cpg: text = "corosync-cpg"; break; } return text; } void crm_peer_init(void); void crm_peer_destroy(void); char *get_corosync_uuid(uint32_t id, const char *uname); const char *get_node_uuid(uint32_t id, const char *uname); int get_corosync_id(int id, const char *uuid); gboolean crm_cluster_connect(char **our_uname, char **our_uuid, void *dispatch, void *destroy, # if SUPPORT_HEARTBEAT ll_cluster_t ** hb_conn # else void **unused # endif ); enum crm_ais_msg_types; gboolean send_cluster_message(const char *node, enum crm_ais_msg_types service, xmlNode * data, gboolean ordered); void destroy_crm_node(gpointer/* crm_node_t* */ data); crm_node_t *crm_get_peer(unsigned int id, const char *uname); guint crm_active_peers(void); gboolean crm_is_peer_active(const crm_node_t * node); guint reap_crm_member(uint32_t id); int crm_terminate_member(int nodeid, const char *uname, void* unused); int crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection); gboolean crm_get_cluster_name(char **cname); # if SUPPORT_HEARTBEAT gboolean crm_is_heartbeat_peer_active(const crm_node_t * node); # endif # if SUPPORT_COROSYNC extern int ais_fd_sync; gboolean crm_is_corosync_peer_active(const crm_node_t * node); gboolean send_ais_text(int class, const char *data, gboolean local, const char *node, enum crm_ais_msg_types dest); gboolean get_ais_nodeid(uint32_t * id, char **uname); # endif void empty_uuid_cache(void); const char *get_uuid(const char *uname); const char *get_uname(const char *uuid); void set_uuid(xmlNode * node, const char *attr, const char *uname); void unget_uuid(const char *uname); enum crm_status_type { crm_status_uname, crm_status_nstate, crm_status_processes, }; enum crm_ais_msg_types text2msg_type(const char *text); void crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *)); /* *INDENT-OFF* */ enum cluster_type_e { pcmk_cluster_unknown = 0x0001, pcmk_cluster_invalid = 0x0002, pcmk_cluster_heartbeat = 0x0004, pcmk_cluster_classic_ais = 0x0010, pcmk_cluster_corosync = 0x0020, pcmk_cluster_cman = 0x0040, }; /* *INDENT-ON* */ enum cluster_type_e get_cluster_type(void); const char *name_for_cluster_type(enum cluster_type_e type); gboolean is_corosync_cluster(void); gboolean is_cman_cluster(void); gboolean is_openais_cluster(void); gboolean is_classic_ais_cluster(void); gboolean is_heartbeat_cluster(void); #endif diff --git a/include/crm/compatibility.h b/include/crm/compatibility.h index 283f981bf0..c85dd01393 100644 --- a/include/crm/compatibility.h +++ b/include/crm/compatibility.h @@ -1,305 +1,308 @@ /* * Copyright (C) 2012 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMPATIBILITY__H # define CRM_COMPATIBILITY__H # define LOG_DEBUG_2 LOG_TRACE # define LOG_DEBUG_3 LOG_TRACE # define LOG_DEBUG_4 LOG_TRACE # define LOG_DEBUG_5 LOG_TRACE # define LOG_DEBUG_6 LOG_TRACE -# define XML_CIB_ATTR_HASTATE "ha" +# define XML_CIB_ATTR_HASTATE "ha" # define XML_CIB_ATTR_JOINSTATE XML_NODE_JOIN_STATE # define XML_CIB_ATTR_EXPSTATE XML_NODE_EXPECTED # define XML_CIB_ATTR_INCCM XML_NODE_IN_CLUSTER # define XML_CIB_ATTR_CRMDSTATE XML_NODE_IS_PEER +# define CRMD_STATE_ACTIVE CRMD_JOINSTATE_MEMBER +# define CRMD_STATE_INACTIVE CRMD_JOINSTATE_DOWN + enum cib_errors { cib_ok = pcmk_ok, cib_operation = -EINVAL, cib_create_msg = -EPROTO, cib_not_connected = -ENOTCONN, cib_not_authorized = -EACCES, cib_send_failed = -ECOMM, cib_reply_failed = -ENOMSG, cib_return_code = -EPROTO, cib_output_data = -ENOMSG, cib_connection = -ENOTCONN, cib_authentication = -EPROTO, cib_missing = -EINVAL, cib_variant = -EPROTONOSUPPORT, CIBRES_MISSING_FIELD = -EINVAL, cib_unknown = -EINVAL, cib_STALE = -ENOKEY, cib_EXISTS = -ENOTUNIQ, cib_NOTEXISTS = -ENXIO, cib_ACTIVATION = -ENODATA, cib_NOOBJECT = -EINVAL, cib_NOPARENT = -EINVAL, cib_NOTSUPPORTED = -EPROTONOSUPPORT, cib_registration_msg = -EPROTO, cib_callback_token = -EPROTO, cib_callback_register = -ECOMM, cib_client_gone = -ECONNRESET, cib_not_master = -EPERM, cib_missing_data = -EINVAL, cib_remote_timeout = -ETIME, cib_no_quorum = -pcmk_err_no_quorum, cib_diff_failed = -pcmk_err_diff_failed, cib_diff_resync = -pcmk_err_diff_resync, cib_old_data = -pcmk_err_old_data, cib_dtd_validation = -pcmk_err_dtd_validation, cib_bad_section = -EINVAL, cib_bad_permissions = -EACCES, cib_invalid_argument = -EINVAL, cib_transform_failed = -pcmk_err_transform_failed, cib_permission_denied = -EACCES, }; enum stonith_errors { stonith_ok = pcmk_ok, stonith_pending = -EINPROGRESS, st_err_generic = -pcmk_err_generic, st_err_internal = -EPROTO, st_err_not_supported = -EPROTONOSUPPORT, st_err_connection = -ENOTCONN, st_err_missing = -EINVAL, st_err_exists = -ENOTUNIQ, st_err_timeout = -ETIME, st_err_ipc = -ECOMM, st_err_peer = -ENOMSG, st_err_unknown_operation = -EOPNOTSUPP, st_err_unknown_device = -ENODEV, st_err_none_available = -EHOSTUNREACH, st_err_signal = -ECONNABORTED, st_err_agent_fork = -ECHILD, st_err_agent_args = -EREMOTEIO, st_err_agent = -ECONNABORTED, st_err_invalid_level = -EINVAL, }; enum lrmd_errors { lrmd_ok = pcmk_ok, lrmd_pending = -EINPROGRESS, lrmd_err_generic = -EPROTONOSUPPORT, lrmd_err_internal = -EPROTO, lrmd_err_connection = -ENOTCONN, lrmd_err_missing = -EINVAL, lrmd_err_ipc = -ECOMM, lrmd_err_peer = -ENOMSG, lrmd_err_unknown_operation = -EOPNOTSUPP, lrmd_err_unknown_rsc = -ENODEV, lrmd_err_no_metadata = -EIO, lrmd_err_stonith_connection = -EUNATCH, lrmd_err_provider_required = -EINVAL, }; #define stonith_error2string pcmk_strerror #define lrmd_error2string pcmk_strerror #define cib_error2string pcmk_strerror static inline void slist_basic_destroy(GListPtr list) { GListPtr gIter = NULL; for (gIter = list; gIter != NULL; gIter = gIter->next) { free(gIter->data); } g_list_free(list); } # define crm_strdup strdup # define set_bit_inplace set_bit # define clear_bit_inplace clear_bit # define crm_malloc0(malloc_obj, length) do { \ malloc_obj = malloc(length); \ if(malloc_obj == NULL) { \ crm_err("Failed allocation of %lu bytes", (unsigned long)length); \ CRM_ASSERT(malloc_obj != NULL); \ } \ memset(malloc_obj, 0, length); \ } while(0) # define crm_malloc(malloc_obj, length) do { \ malloc_obj = malloc(length); \ if(malloc_obj == NULL) { \ crm_err("Failed allocation of %lu bytes", (unsigned long)length); \ CRM_ASSERT(malloc_obj != NULL); \ } \ } while(0) # define crm_realloc(realloc_obj, length) do { \ realloc_obj = realloc(realloc_obj, length); \ CRM_ASSERT(realloc_obj != NULL); \ } while(0) #define crm_free(free_obj) do { free(free_obj); free_obj=NULL; } while(0) /* These two child iterator macros are no longer to be used * They exist for compatability reasons and will be removed in a * future release */ # define xml_child_iter(parent, child, code) do { \ if(parent != NULL) { \ xmlNode *child = NULL; \ xmlNode *__crm_xml_iter = parent->children; \ while(__crm_xml_iter != NULL) { \ child = __crm_xml_iter; \ __crm_xml_iter = __crm_xml_iter->next; \ if(child->type == XML_ELEMENT_NODE) { \ code; \ } \ } \ } \ } while(0) # define xml_child_iter_filter(parent, child, filter, code) do { \ if(parent != NULL) { \ xmlNode *child = NULL; \ xmlNode *__crm_xml_iter = parent->children; \ while(__crm_xml_iter != NULL) { \ child = __crm_xml_iter; \ __crm_xml_iter = __crm_xml_iter->next; \ if(child->type == XML_ELEMENT_NODE) { \ if(filter == NULL \ || crm_str_eq(filter, (const char *)child->name, TRUE)) { \ code; \ } \ } \ } \ } \ } while(0) # define xml_prop_iter(parent, prop_name, prop_value, code) do { \ if(parent != NULL) { \ xmlAttrPtr prop_iter = parent->properties; \ const char *prop_name = NULL; \ const char *prop_value = NULL; \ while(prop_iter != NULL) { \ prop_name = (const char *)prop_iter->name; \ prop_value = crm_element_value(parent, prop_name); \ prop_iter = prop_iter->next; \ if(prop_name) { \ code; \ } \ } \ } \ } while(0) # define xml_prop_name_iter(parent, prop_name, code) do { \ if(parent != NULL) { \ xmlAttrPtr prop_iter = parent->properties; \ const char *prop_name = NULL; \ while(prop_iter != NULL) { \ prop_name = (const char *)prop_iter->name; \ prop_iter = prop_iter->next; \ if(prop_name) { \ code; \ } \ } \ } \ } while(0) # define zap_xml_from_parent(parent, xml_obj) free_xml_from_parent(parent, xml_obj); xml_obj = NULL /* Use something like this instead of the next macro: GListPtr gIter = rsc->children; for(; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t*)gIter->data; ... } */ # define slist_destroy(child_type, child, parent, a) do { \ GListPtr __crm_iter_head = parent; \ child_type *child = NULL; \ while(__crm_iter_head != NULL) { \ child = (child_type *) __crm_iter_head->data; \ __crm_iter_head = __crm_iter_head->next; \ { a; } \ } \ g_list_free(parent); \ } while(0) # ifdef CRM_ATTRD__H static inline gboolean attrd_update(crm_ipc_t *cluster, char command, const char *host, const char *name, const char *value, const char *section, const char *set, const char *dampen) { return attrd_update_delegate(cluster, command, host, name, value, section, set, dampen, NULL); } static inline gboolean attrd_lazy_update(char command, const char *host, const char *name, const char *value, const char *section, const char *set, const char *dampen) { return attrd_update_delegate(NULL, command, host, name, value, section, set, dampen, NULL); } static inline gboolean attrd_update_no_mainloop(int *connection, char command, const char *host, const char *name, const char *value, const char *section, const char *set, const char *dampen) { return attrd_update_delegate(NULL, command, host, name, value, section, set, dampen, NULL); } # endif # ifdef CIB_UTIL__H static inline int update_attr(cib_t * the_cib, int call_options, const char *section, const char *node_uuid, const char *set_type, const char *set_name, const char *attr_id, const char *attr_name, const char *attr_value, gboolean to_console) { return update_attr_delegate(the_cib, call_options, section, node_uuid, set_type, set_name, attr_id, attr_name, attr_value, to_console, NULL); } static inline int find_nvpair_attr(cib_t * the_cib, const char *attr, const char *section, const char *node_uuid, const char *set_type, const char *set_name, const char *attr_id, const char *attr_name, gboolean to_console, char **value) { return find_nvpair_attr_delegate(the_cib, attr, section, node_uuid, set_type, set_name, attr_id, attr_name, to_console, value, NULL); } static inline int read_attr(cib_t * the_cib, const char *section, const char *node_uuid, const char *set_type, const char *set_name, const char *attr_id, const char *attr_name, char **attr_value, gboolean to_console) { return read_attr_delegate(the_cib, section, node_uuid, set_type, set_name, attr_id, attr_name, attr_value, to_console, NULL); } static inline int delete_attr(cib_t * the_cib, int options, const char *section, const char *node_uuid, const char *set_type, const char *set_name, const char *attr_id, const char *attr_name, const char *attr_value, gboolean to_console) { return delete_attr_delegate(the_cib, options, section, node_uuid, set_type, set_name, attr_id, attr_name, attr_value, to_console, NULL); } # endif #endif diff --git a/include/crm/crm.h b/include/crm/crm.h index c28e21b481..ec17da3c48 100644 --- a/include/crm/crm.h +++ b/include/crm/crm.h @@ -1,193 +1,190 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM__H # define CRM__H # include # include # include # include # undef MIN # undef MAX # include # include # define CRM_FEATURE_SET "3.0.6" # define MINIMUM_SCHEMA_VERSION "pacemaker-1.0" # define LATEST_SCHEMA_VERSION "pacemaker-"CRM_DTD_VERSION # define EOS '\0' # define DIMOF(a) ((int) (sizeof(a)/sizeof(a[0])) ) # ifndef MAX_NAME # define MAX_NAME 256 # endif # ifndef __GNUC__ # define __builtin_expect(expr, result) (expr) # endif /* Some handy macros used by the Linux kernel */ # define __likely(expr) __builtin_expect(expr, 1) # define __unlikely(expr) __builtin_expect(expr, 0) # define CRM_META "CRM_meta" extern const char *crm_system_name; /* *INDENT-OFF* */ /* Clean these up at some point, some probably should be runtime options */ # define SOCKET_LEN 1024 # define APPNAME_LEN 256 # define MAX_IPC_FAIL 5 # define MAX_IPC_DELAY 120 # define MSG_LOG 1 # define DOT_FSA_ACTIONS 1 # define DOT_ALL_FSA_INPUTS 1 /* #define FSA_TRACE 1 */ # define INFINITY_S "INFINITY" # define MINUS_INFINITY_S "-INFINITY" # define INFINITY 1000000 /* Sub-systems */ # define CRM_SYSTEM_DC "dc" # define CRM_SYSTEM_DCIB "dcib" /* The master CIB */ # define CRM_SYSTEM_CIB "cib" # define CRM_SYSTEM_CRMD "crmd" # define CRM_SYSTEM_LRMD "lrmd" # define CRM_SYSTEM_PENGINE "pengine" # define CRM_SYSTEM_TENGINE "tengine" # define CRM_SYSTEM_STONITHD "stonithd" # define CRM_SYSTEM_MCP "pacemakerd" /* Valid operations */ # define CRM_OP_NOOP "noop" # define CRM_OP_JOIN_ANNOUNCE "join_announce" # define CRM_OP_JOIN_OFFER "join_offer" # define CRM_OP_JOIN_REQUEST "join_request" # define CRM_OP_JOIN_ACKNAK "join_ack_nack" # define CRM_OP_JOIN_CONFIRM "join_confirm" # define CRM_OP_DIE "die_no_respawn" # define CRM_OP_RETRIVE_CIB "retrieve_cib" # define CRM_OP_PING "ping" # define CRM_OP_VOTE "vote" # define CRM_OP_NOVOTE "no-vote" # define CRM_OP_HELLO "hello" # define CRM_OP_HBEAT "dc_beat" # define CRM_OP_PECALC "pe_calc" # define CRM_OP_ABORT "abort" # define CRM_OP_QUIT "quit" # define CRM_OP_LOCAL_SHUTDOWN "start_shutdown" # define CRM_OP_SHUTDOWN_REQ "req_shutdown" # define CRM_OP_SHUTDOWN "do_shutdown" # define CRM_OP_FENCE "stonith" # define CRM_OP_EVENTCC "event_cc" # define CRM_OP_TEABORT "te_abort" # define CRM_OP_TEABORTED "te_abort_confirmed" /* we asked */ # define CRM_OP_TE_HALT "te_halt" # define CRM_OP_TECOMPLETE "te_complete" # define CRM_OP_TETIMEOUT "te_timeout" # define CRM_OP_TRANSITION "transition" # define CRM_OP_REGISTER "register" # define CRM_OP_DEBUG_UP "debug_inc" # define CRM_OP_DEBUG_DOWN "debug_dec" # define CRM_OP_INVOKE_LRM "lrm_invoke" # define CRM_OP_LRM_REFRESH "lrm_refresh" # define CRM_OP_LRM_QUERY "lrm_query" # define CRM_OP_LRM_DELETE "lrm_delete" # define CRM_OP_LRM_FAIL "lrm_fail" # define CRM_OP_PROBED "probe_complete" # define CRM_OP_REPROBE "probe_again" # define CRM_OP_CLEAR_FAILCOUNT "clear_failcount" # define CRM_OP_RELAXED_SET "one-or-more" # define CRM_OP_RM_NODE_CACHE "rm_node_cache" -# define CRMD_STATE_ACTIVE "member" -# define CRMD_STATE_INACTIVE "down" - -# define CRMD_JOINSTATE_DOWN CRMD_STATE_INACTIVE -# define CRMD_JOINSTATE_PENDING "pending" -# define CRMD_JOINSTATE_MEMBER CRMD_STATE_ACTIVE -# define CRMD_JOINSTATE_NACK "banned" +# define CRMD_JOINSTATE_DOWN "down" +# define CRMD_JOINSTATE_PENDING "pending" +# define CRMD_JOINSTATE_MEMBER "member" +# define CRMD_JOINSTATE_NACK "banned" # define CRMD_ACTION_DELETE "delete" # define CRMD_ACTION_CANCEL "cancel" # define CRMD_ACTION_MIGRATE "migrate_to" # define CRMD_ACTION_MIGRATED "migrate_from" # define CRMD_ACTION_START "start" # define CRMD_ACTION_STARTED "running" # define CRMD_ACTION_STOP "stop" # define CRMD_ACTION_STOPPED "stopped" # define CRMD_ACTION_PROMOTE "promote" # define CRMD_ACTION_PROMOTED "promoted" # define CRMD_ACTION_DEMOTE "demote" # define CRMD_ACTION_DEMOTED "demoted" # define CRMD_ACTION_NOTIFY "notify" # define CRMD_ACTION_NOTIFIED "notified" # define CRMD_ACTION_STATUS "monitor" /* short names */ # define RSC_DELETE CRMD_ACTION_DELETE # define RSC_CANCEL CRMD_ACTION_CANCEL # define RSC_MIGRATE CRMD_ACTION_MIGRATE # define RSC_MIGRATED CRMD_ACTION_MIGRATED # define RSC_START CRMD_ACTION_START # define RSC_STARTED CRMD_ACTION_STARTED # define RSC_STOP CRMD_ACTION_STOP # define RSC_STOPPED CRMD_ACTION_STOPPED # define RSC_PROMOTE CRMD_ACTION_PROMOTE # define RSC_PROMOTED CRMD_ACTION_PROMOTED # define RSC_DEMOTE CRMD_ACTION_DEMOTE # define RSC_DEMOTED CRMD_ACTION_DEMOTED # define RSC_NOTIFY CRMD_ACTION_NOTIFY # define RSC_NOTIFIED CRMD_ACTION_NOTIFIED # define RSC_STATUS CRMD_ACTION_STATUS /* *INDENT-ON* */ typedef GList *GListPtr; # include # include # include # define crm_str_hash g_str_hash_traditional guint g_str_hash_traditional(gconstpointer v); #endif diff --git a/lib/cluster/corosync.c b/lib/cluster/corosync.c index 5f0991a30e..c205ecef4c 100644 --- a/lib/cluster/corosync.c +++ b/lib/cluster/corosync.c @@ -1,1092 +1,1092 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include cpg_handle_t pcmk_cpg_handle = 0; struct cpg_name pcmk_cpg_group = { .length = 0, .value[0] = 0, }; quorum_handle_t pcmk_quorum_handle = 0; gboolean(*quorum_app_callback) (unsigned long long seq, gboolean quorate) = NULL; static char *pcmk_uname = NULL; static int pcmk_uname_len = 0; static uint32_t pcmk_nodeid = 0; #define cs_repeat(counter, max, code) do { \ code; \ if(rc == CS_ERR_TRY_AGAIN || rc == CS_ERR_QUEUE_FULL) { \ counter++; \ crm_debug("Retrying operation after %ds", counter); \ sleep(counter); \ } else { \ break; \ } \ } while(counter < max) #ifndef INTERFACE_MAX # define INTERFACE_MAX 2 /* from the private coroapi.h header */ #endif static gboolean corosync_name_is_valid(const char *key, const char *name) { int octet; if(name == NULL) { crm_trace("%s is empty", key); return FALSE; } else if(sscanf(name, "%d.%d.%d.%d", &octet, &octet, &octet, &octet) == 4) { crm_trace("%s contains an ipv4 address, ignoring: %s", key, name); return FALSE; } else if(strstr(name, ":") != NULL) { crm_trace("%s contains an ipv4 address, ignoring: %s", key, name); return FALSE; } crm_trace("%s is valid", key); return TRUE; } /* * CFG functionality stolen from node_name() in corosync-quorumtool.c * This resolves the first address assigned to a node and returns the name or IP address. */ static char *corosync_node_name(cmap_handle_t cmap_handle, uint32_t nodeid) { int lpc = 0; int rc = CS_OK; int retries = 0; char *name = NULL; cmap_handle_t local_handle = 0; corosync_cfg_handle_t cfg_handle = 0; static corosync_cfg_callbacks_t cfg_callbacks = {}; if(cmap_handle == 0 && local_handle == 0) { retries = 0; crm_trace("Initializing CMAP connection"); do { rc = cmap_initialize(&local_handle); if(rc != CS_OK) { retries++; crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc), retries); sleep(retries); } } while(retries < 5 && rc != CS_OK); if (rc != CS_OK) { crm_warn("Could not connect to Cluster Configuration Database API, error %d", cs_strerror(rc)); local_handle = 0; } } if(cmap_handle == 0) { cmap_handle = local_handle; } while(name == NULL && cmap_handle != 0) { uint32_t id = 0; char *key = NULL; key = g_strdup_printf("nodelist.node.%d.nodeid", lpc); rc = cmap_get_uint32(cmap_handle, key, &id); crm_trace("Checking %u vs %u from %s", nodeid, id, key); g_free(key); if(rc != CS_OK) { break; } if(nodeid == id) { crm_trace("Searching for node name for %u in nodelist.node.%d %s", nodeid, lpc, name); if(name == NULL) { key = g_strdup_printf("nodelist.node.%d.ring0_addr", lpc); rc = cmap_get_string(cmap_handle, key, &name); crm_trace("%s = %s", key, name); if(corosync_name_is_valid(key, name) == FALSE) { free(name); name = NULL; } g_free(key); } if(name == NULL) { key = g_strdup_printf("nodelist.node.%d.name", lpc); rc = cmap_get_string(cmap_handle, key, &name); crm_trace("%s = %s %d", key, name, rc); if(corosync_name_is_valid(key, name) == FALSE) { free(name); name = NULL; } g_free(key); } break; } lpc++; } if(name == NULL) { retries = 0; crm_trace("Initializing CFG connection"); do { rc = corosync_cfg_initialize(&cfg_handle, &cfg_callbacks); if(rc != CS_OK) { retries++; crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc), retries); sleep(retries); } } while(retries < 5 && rc != CS_OK); if (rc != CS_OK) { crm_warn("Could not connect to the Corosync CFG API, error %d", cs_strerror(rc)); cfg_handle = 0; } } if(name == NULL && cfg_handle != 0) { int numaddrs; char buf[INET6_ADDRSTRLEN]; socklen_t addrlen; struct sockaddr_storage *ss; corosync_cfg_node_address_t addrs[INTERFACE_MAX]; rc = corosync_cfg_get_node_addrs(cfg_handle, nodeid, INTERFACE_MAX, &numaddrs, addrs); if (rc == CS_OK) { ss = (struct sockaddr_storage *)addrs[0].address; if (ss->ss_family == AF_INET6) { addrlen = sizeof(struct sockaddr_in6); } else { addrlen = sizeof(struct sockaddr_in); } if (getnameinfo((struct sockaddr *)addrs[0].address, addrlen, buf, sizeof(buf), NULL, 0, 0) == 0) { crm_notice("Inferred node name '%s' for nodeid %u from DNS", buf, nodeid); if(corosync_name_is_valid("DNS", buf)) { name = strdup(buf); } } } else { crm_debug("Unable to get node address for nodeid %u: %s", nodeid, cs_strerror(rc)); } cmap_finalize(cfg_handle); } if(local_handle) { cmap_finalize(local_handle); } if(name == NULL) { crm_err("Unable to get node name for nodeid %u", nodeid); } return name; } enum crm_ais_msg_types text2msg_type(const char *text) { int type = crm_msg_none; CRM_CHECK(text != NULL, return type); if (safe_str_eq(text, "ais")) { type = crm_msg_ais; } else if (safe_str_eq(text, "crm_plugin")) { type = crm_msg_ais; } else if (safe_str_eq(text, CRM_SYSTEM_CIB)) { type = crm_msg_cib; } else if (safe_str_eq(text, CRM_SYSTEM_CRMD)) { type = crm_msg_crmd; } else if (safe_str_eq(text, CRM_SYSTEM_DC)) { type = crm_msg_crmd; } else if (safe_str_eq(text, CRM_SYSTEM_TENGINE)) { type = crm_msg_te; } else if (safe_str_eq(text, CRM_SYSTEM_PENGINE)) { type = crm_msg_pe; } else if (safe_str_eq(text, CRM_SYSTEM_LRMD)) { type = crm_msg_lrmd; } else if (safe_str_eq(text, CRM_SYSTEM_STONITHD)) { type = crm_msg_stonithd; } else if (safe_str_eq(text, "stonith-ng")) { type = crm_msg_stonith_ng; } else if (safe_str_eq(text, "attrd")) { type = crm_msg_attrd; } else { /* This will normally be a transient client rather than * a cluster daemon. Set the type to the pid of the client */ int scan_rc = sscanf(text, "%d", &type); if (scan_rc != 1) { /* Ensure its sane */ type = crm_msg_none; } } return type; } static char *ais_cluster_name = NULL; gboolean crm_get_cluster_name(char **cname) { CRM_CHECK(cname != NULL, return FALSE); if (ais_cluster_name) { *cname = strdup(ais_cluster_name); return TRUE; } return FALSE; } gboolean send_ais_text(int class, const char *data, gboolean local, const char *node, enum crm_ais_msg_types dest) { static int msg_id = 0; static int local_pid = 0; int retries = 0; int rc = CS_OK; int buf_len = sizeof(cs_ipc_header_response_t); char *buf = NULL; struct iovec iov; const char *transport = "pcmk"; AIS_Message *ais_msg = NULL; enum crm_ais_msg_types sender = text2msg_type(crm_system_name); /* There are only 6 handlers registered to crm_lib_service in plugin.c */ CRM_CHECK(class < 6, crm_err("Invalid message class: %d", class); return FALSE); if (data == NULL) { data = ""; } if (local_pid == 0) { local_pid = getpid(); } if (sender == crm_msg_none) { sender = local_pid; } ais_msg = calloc(1, sizeof(AIS_Message)); ais_msg->id = msg_id++; ais_msg->header.id = class; ais_msg->header.error = CS_OK; ais_msg->host.type = dest; ais_msg->host.local = local; if (node) { ais_msg->host.size = strlen(node); memset(ais_msg->host.uname, 0, MAX_NAME); memcpy(ais_msg->host.uname, node, ais_msg->host.size); ais_msg->host.id = 0; } else { ais_msg->host.size = 0; memset(ais_msg->host.uname, 0, MAX_NAME); ais_msg->host.id = 0; } ais_msg->sender.id = 0; ais_msg->sender.type = sender; ais_msg->sender.pid = local_pid; ais_msg->sender.size = pcmk_uname_len; memset(ais_msg->sender.uname, 0, MAX_NAME); memcpy(ais_msg->sender.uname, pcmk_uname, ais_msg->sender.size); ais_msg->size = 1 + strlen(data); if (ais_msg->size < CRM_BZ2_THRESHOLD) { failback: ais_msg = realloc(ais_msg, sizeof(AIS_Message) + ais_msg->size); memcpy(ais_msg->data, data, ais_msg->size); } else { char *compressed = NULL; char *uncompressed = strdup(data); unsigned int len = (ais_msg->size * 1.1) + 600; /* recomended size */ crm_trace("Compressing message payload"); compressed = malloc( len); rc = BZ2_bzBuffToBuffCompress(compressed, &len, uncompressed, ais_msg->size, CRM_BZ2_BLOCKS, 0, CRM_BZ2_WORK); free(uncompressed); if (rc != BZ_OK) { crm_err("Compression failed: %d", rc); free(compressed); goto failback; } ais_msg = realloc(ais_msg, sizeof(AIS_Message) + len + 1); memcpy(ais_msg->data, compressed, len); ais_msg->data[len] = 0; free(compressed); ais_msg->is_compressed = TRUE; ais_msg->compressed_size = len; crm_trace("Compression details: %d -> %d", ais_msg->size, ais_data_len(ais_msg)); } ais_msg->header.size = sizeof(AIS_Message) + ais_data_len(ais_msg); crm_trace("Sending%s message %d to %s.%s (data=%d, total=%d)", ais_msg->is_compressed ? " compressed" : "", ais_msg->id, ais_dest(&(ais_msg->host)), msg_type2text(dest), ais_data_len(ais_msg), ais_msg->header.size); iov.iov_base = ais_msg; iov.iov_len = ais_msg->header.size; buf = realloc(buf, buf_len); do { if (rc == CS_ERR_TRY_AGAIN || rc == CS_ERR_QUEUE_FULL) { retries++; crm_info("Peer overloaded or membership in flux:" " Re-sending message (Attempt %d of 20)", retries); sleep(retries); /* Proportional back off */ } errno = 0; transport = "cpg"; CRM_CHECK(dest != crm_msg_ais, rc = CS_ERR_MESSAGE_ERROR; goto bail); rc = cpg_mcast_joined(pcmk_cpg_handle, CPG_TYPE_AGREED, &iov, 1); if (rc == CS_ERR_TRY_AGAIN || rc == CS_ERR_QUEUE_FULL) { cpg_flow_control_state_t fc_state = CPG_FLOW_CONTROL_DISABLED; int rc2 = cpg_flow_control_state_get(pcmk_cpg_handle, &fc_state); if (rc2 == CS_OK && fc_state == CPG_FLOW_CONTROL_ENABLED) { crm_warn("Connection overloaded, cannot send messages"); goto bail; } else if (rc2 != CS_OK) { crm_warn("Could not determin the connection state: %s (%d)", ais_error2text(rc2), rc2); goto bail; } } } while ((rc == CS_ERR_TRY_AGAIN || rc == CS_ERR_QUEUE_FULL) && retries < 20); bail: if (rc != CS_OK) { crm_perror(LOG_ERR, "Sending message %d via %s: FAILED (rc=%d): %s", ais_msg->id, transport, rc, ais_error2text(rc)); } else { crm_trace("Message %d: sent", ais_msg->id); } free(buf); free(ais_msg); return (rc == CS_OK); } gboolean send_ais_message(xmlNode * msg, gboolean local, const char *node, enum crm_ais_msg_types dest) { gboolean rc = TRUE; char *data = dump_xml_unformatted(msg); rc = send_ais_text(crm_class_cluster, data, local, node, dest); free(data); return rc; } void terminate_ais_connection(void) { crm_notice("Disconnecting from Corosync"); if(pcmk_cpg_handle) { crm_trace("Disconnecting CPG"); cpg_leave(pcmk_cpg_handle, &pcmk_cpg_group); cpg_finalize(pcmk_cpg_handle); pcmk_cpg_handle = 0; } else { crm_info("No CPG connection"); } if(pcmk_quorum_handle) { crm_trace("Disconnecting quorum"); quorum_finalize(pcmk_quorum_handle); pcmk_quorum_handle = 0; } else { crm_info("No Quorum connection"); } } int ais_membership_timer = 0; gboolean ais_membership_force = FALSE; static gboolean ais_dispatch_message(AIS_Message * msg, gboolean(*dispatch) (AIS_Message *, char *, int)) { char *data = NULL; char *uncompressed = NULL; xmlNode *xml = NULL; CRM_ASSERT(msg != NULL); crm_trace("Got new%s message (size=%d, %d, %d)", msg->is_compressed ? " compressed" : "", ais_data_len(msg), msg->size, msg->compressed_size); data = msg->data; if (msg->is_compressed && msg->size > 0) { int rc = BZ_OK; unsigned int new_size = msg->size + 1; if (check_message_sanity(msg, NULL) == FALSE) { goto badmsg; } crm_trace("Decompressing message data"); uncompressed = calloc(1, new_size); rc = BZ2_bzBuffToBuffDecompress(uncompressed, &new_size, data, msg->compressed_size, 1, 0); if (rc != BZ_OK) { crm_err("Decompression failed: %d", rc); goto badmsg; } CRM_ASSERT(rc == BZ_OK); CRM_ASSERT(new_size == msg->size); data = uncompressed; } else if (check_message_sanity(msg, data) == FALSE) { goto badmsg; } else if (safe_str_eq("identify", data)) { int pid = getpid(); char *pid_s = crm_itoa(pid); send_ais_text(crm_class_cluster, pid_s, TRUE, NULL, crm_msg_ais); free(pid_s); goto done; } if (msg->header.id != crm_class_members) { crm_update_peer(__FUNCTION__, msg->sender.id, 0, 0, 0, 0, msg->sender.uname, msg->sender.uname, NULL, NULL); } if (msg->header.id == crm_class_rmpeer) { uint32_t id = crm_int_helper(data, NULL); crm_info("Removing peer %s/%u", data, id); reap_crm_member(id); goto done; } crm_trace("Payload: %s", data); if (dispatch != NULL) { dispatch(msg, data, 0); } done: free(uncompressed); free_xml(xml); return TRUE; badmsg: crm_err("Invalid message (id=%d, dest=%s:%s, from=%s:%s.%d):" " min=%d, total=%d, size=%d, bz2_size=%d", msg->id, ais_dest(&(msg->host)), msg_type2text(msg->host.type), ais_dest(&(msg->sender)), msg_type2text(msg->sender.type), msg->sender.pid, (int)sizeof(AIS_Message), msg->header.size, msg->size, msg->compressed_size); goto done; } gboolean(*pcmk_cpg_dispatch_fn) (AIS_Message *, char *, int) = NULL; static int pcmk_cpg_dispatch(gpointer user_data) { int rc = 0; pcmk_cpg_dispatch_fn = user_data; rc = cpg_dispatch(pcmk_cpg_handle, CS_DISPATCH_ALL); if (rc != CS_OK) { crm_err("Connection to the CPG API failed: %d", rc); return -1; } return 0; } static void pcmk_cpg_deliver(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { AIS_Message *ais_msg = (AIS_Message *) msg; if (ais_msg->sender.id > 0 && ais_msg->sender.id != nodeid) { crm_err("Nodeid mismatch from %d.%d: claimed nodeid=%u", nodeid, pid, ais_msg->sender.id); return; } else if (ais_msg->host.size != 0 && safe_str_neq(ais_msg->host.uname, pcmk_uname)) { /* Not for us */ return; } ais_msg->sender.id = nodeid; if (ais_msg->sender.size == 0) { crm_node_t *peer = crm_get_peer(nodeid, NULL); if (peer == NULL) { crm_err("Peer with nodeid=%u is unknown", nodeid); } else if (peer->uname == NULL) { crm_err("No uname for peer with nodeid=%u", nodeid); } else { crm_notice("Fixing uname for peer with nodeid=%u", nodeid); ais_msg->sender.size = strlen(peer->uname); memset(ais_msg->sender.uname, 0, MAX_NAME); memcpy(ais_msg->sender.uname, peer->uname, ais_msg->sender.size); } } ais_dispatch_message(ais_msg, pcmk_cpg_dispatch_fn); } static void pcmk_cpg_membership(cpg_handle_t handle, const struct cpg_name *groupName, const struct cpg_address *member_list, size_t member_list_entries, const struct cpg_address *left_list, size_t left_list_entries, const struct cpg_address *joined_list, size_t joined_list_entries) { int i; gboolean found = FALSE; static int counter = 0; for (i = 0; i < left_list_entries; i++) { crm_node_t *peer = crm_get_peer(left_list[i].nodeid, NULL); crm_info("Left[%d.%d] %s.%d ", counter, i, groupName->value, left_list[i].nodeid); crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, OFFLINESTATUS); } for (i = 0; i < joined_list_entries; i++) { crm_info("Joined[%d.%d] %s.%d ", counter, i, groupName->value, joined_list[i].nodeid); } for (i = 0; i < member_list_entries; i++) { crm_node_t *peer = crm_get_peer(member_list[i].nodeid, NULL); crm_info("Member[%d.%d] %s.%d ", counter, i, groupName->value, member_list[i].nodeid); crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, ONLINESTATUS); if(pcmk_nodeid == member_list[i].nodeid) { found = TRUE; } } if(!found) { crm_err("We're not part of CPG group %s anymore!", groupName->value); /* Possibly re-call cpg_join() */ } counter++; } cpg_callbacks_t cpg_callbacks = { .cpg_deliver_fn = pcmk_cpg_deliver, .cpg_confchg_fn = pcmk_cpg_membership, }; static gboolean init_cpg_connection(gboolean(*dispatch) (AIS_Message *, char *, int), void (*destroy) (gpointer), uint32_t * nodeid) { int rc = -1; int fd = 0; int retries = 0; crm_node_t *peer = NULL; struct mainloop_fd_callbacks cpg_fd_callbacks = { .dispatch = pcmk_cpg_dispatch, .destroy = destroy, }; strcpy(pcmk_cpg_group.value, crm_system_name); pcmk_cpg_group.length = strlen(crm_system_name) + 1; cs_repeat(retries, 30, rc = cpg_initialize(&pcmk_cpg_handle, &cpg_callbacks)); if (rc != CS_OK) { crm_err("Could not connect to the Cluster Process Group API: %d\n", rc); goto bail; } retries = 0; cs_repeat(retries, 30, rc = cpg_local_get(pcmk_cpg_handle, (unsigned int *)nodeid)); if (rc != CS_OK) { crm_err("Could not get local node id from the CPG API"); goto bail; } retries = 0; cs_repeat(retries, 30, rc = cpg_join(pcmk_cpg_handle, &pcmk_cpg_group)); if (rc != CS_OK) { crm_err("Could not join the CPG group '%s': %d", crm_system_name, rc); goto bail; } rc = cpg_fd_get(pcmk_cpg_handle, &fd); if (rc != CS_OK) { crm_err("Could not obtain the CPG API connection: %d\n", rc); goto bail; } mainloop_add_fd("corosync-cpg", fd, dispatch, &cpg_fd_callbacks); bail: if (rc != CS_OK) { cpg_finalize(pcmk_cpg_handle); return FALSE; } peer = crm_get_peer(pcmk_nodeid, pcmk_uname); crm_update_peer_proc(__FUNCTION__, peer, crm_proc_cpg, ONLINESTATUS); return TRUE; } static int pcmk_quorum_dispatch(gpointer user_data) { int rc = 0; rc = quorum_dispatch(pcmk_quorum_handle, CS_DISPATCH_ALL); if (rc < 0) { crm_err("Connection to the Quorum API failed: %d", rc); return -1; } return 0; } static void corosync_mark_unseen_peer_dead(gpointer key, gpointer value, gpointer user_data) { int *seq = user_data; crm_node_t *node = value; - if (node->last_seen != *seq && crm_str_eq(CRM_NODE_LOST, node->state, TRUE) == FALSE) { + if (node->last_seen != *seq && node->state && crm_str_eq(CRM_NODE_LOST, node->state, TRUE) == FALSE) { crm_notice("Node %d/%s was not seen in the previous transition", node->id, node->uname); crm_update_peer(__FUNCTION__, node->id, 0, 0, 0, 0, NULL, NULL, NULL, CRM_NODE_LOST); } } static void corosync_mark_node_unseen(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; node->last_seen = 0; } static void pcmk_quorum_notification(quorum_handle_t handle, uint32_t quorate, uint64_t ring_id, uint32_t view_list_entries, uint32_t * view_list) { int i; static gboolean init_phase = TRUE; if (quorate != crm_have_quorum) { crm_notice("Membership " U64T ": quorum %s (%lu)", ring_id, quorate ? "acquired" : "lost", (long unsigned int)view_list_entries); crm_have_quorum = quorate; } else { crm_info("Membership " U64T ": quorum %s (%lu)", ring_id, quorate ? "retained" : "still lost", (long unsigned int)view_list_entries); } if(view_list_entries == 0 && init_phase) { crm_info("Corosync membership is still forming, ignoring"); return; } init_phase = FALSE; g_hash_table_foreach(crm_peer_cache, corosync_mark_node_unseen, NULL); for (i = 0; i < view_list_entries; i++) { uint32_t id = view_list[i]; char *name = NULL; crm_node_t *node = NULL; char *uuid = get_corosync_uuid(id, NULL); crm_debug("Member[%d] %d ", i, id); node = crm_get_peer(id, NULL); if(node->uname == NULL) { crm_info("Obtaining name for new node %u", id); name = corosync_node_name(0, id); } crm_update_peer(__FUNCTION__, id, 0, ring_id, 0, 0, uuid, name, NULL, CRM_NODE_MEMBER); free(name); } crm_trace("Reaping unseen nodes..."); g_hash_table_foreach(crm_peer_cache, corosync_mark_unseen_peer_dead, &ring_id); if (quorum_app_callback) { quorum_app_callback(ring_id, quorate); } } quorum_callbacks_t quorum_callbacks = { .quorum_notify_fn = pcmk_quorum_notification, }; gboolean init_quorum_connection(gboolean(*dispatch) (unsigned long long, gboolean), void (*destroy) (gpointer)) { int rc = -1; int fd = 0; int quorate = 0; uint32_t quorum_type = 0; struct mainloop_fd_callbacks quorum_fd_callbacks; quorum_fd_callbacks.dispatch = pcmk_quorum_dispatch; quorum_fd_callbacks.destroy = destroy; crm_debug("Configuring Pacemaker to obtain quorum from Corosync"); rc = quorum_initialize(&pcmk_quorum_handle, &quorum_callbacks, &quorum_type); if (rc != CS_OK) { crm_err("Could not connect to the Quorum API: %d\n", rc); goto bail; } else if (quorum_type != QUORUM_SET) { crm_err("Corosync quorum is not configured\n"); goto bail; } rc = quorum_getquorate(pcmk_quorum_handle, &quorate); if (rc != CS_OK) { crm_err("Could not obtain the current Quorum API state: %d\n", rc); goto bail; } crm_notice("Quorum %s", quorate ? "acquired" : "lost"); quorum_app_callback = dispatch; crm_have_quorum = quorate; rc = quorum_trackstart(pcmk_quorum_handle, CS_TRACK_CHANGES | CS_TRACK_CURRENT); if (rc != CS_OK) { crm_err("Could not setup Quorum API notifications: %d\n", rc); goto bail; } rc = quorum_fd_get(pcmk_quorum_handle, &fd); if (rc != CS_OK) { crm_err("Could not obtain the Quorum API connection: %d\n", rc); goto bail; } mainloop_add_fd("quorum", fd, dispatch, &quorum_fd_callbacks); corosync_initialize_nodelist(NULL, FALSE, NULL); bail: if (rc != CS_OK) { quorum_finalize(pcmk_quorum_handle); return FALSE; } return TRUE; } gboolean init_ais_connection(gboolean(*dispatch) (AIS_Message *, char *, int), void (*destroy) (gpointer), char **our_uuid, char **our_uname, int *nodeid) { int retries = 0; while (retries++ < 30) { int rc = init_ais_connection_once(dispatch, destroy, our_uuid, our_uname, nodeid); switch (rc) { case CS_OK: return TRUE; break; case CS_ERR_TRY_AGAIN: case CS_ERR_QUEUE_FULL: break; default: return FALSE; } } crm_err("Retry count exceeded: %d", retries); return FALSE; } gboolean init_ais_connection_once(gboolean(*dispatch) (AIS_Message *, char *, int), void (*destroy) (gpointer), char **our_uuid, char **our_uname, int *nodeid) { struct utsname res; enum cluster_type_e stack = get_cluster_type(); crm_peer_init(); /* Here we just initialize comms */ if(stack != pcmk_cluster_corosync) { crm_err("Invalid cluster type: %s (%d)", name_for_cluster_type(stack), stack); return FALSE; } if (init_cpg_connection(dispatch, destroy, &pcmk_nodeid) == FALSE) { return FALSE; } else if (uname(&res) < 0) { crm_perror(LOG_ERR, "Could not determin the current host"); exit(100); } else { pcmk_uname = strdup(res.nodename); } crm_info("Connection to '%s': established", name_for_cluster_type(stack)); CRM_ASSERT(pcmk_uname != NULL); pcmk_uname_len = strlen(pcmk_uname); if (pcmk_nodeid != 0) { /* Ensure the local node always exists */ crm_update_peer(__FUNCTION__, pcmk_nodeid, 0, 0, 0, 0, pcmk_uname, pcmk_uname, NULL, NULL); } if (our_uuid != NULL) { *our_uuid = get_corosync_uuid(pcmk_nodeid, pcmk_uname); } if (our_uname != NULL) { *our_uname = strdup(pcmk_uname); } if (nodeid != NULL) { *nodeid = pcmk_nodeid; } return TRUE; } gboolean check_message_sanity(const AIS_Message * msg, const char *data) { gboolean sane = TRUE; int dest = msg->host.type; int tmp_size = msg->header.size - sizeof(AIS_Message); if (sane && msg->header.size == 0) { crm_warn("Message with no size"); sane = FALSE; } if (sane && msg->header.error != CS_OK) { crm_warn("Message header contains an error: %d", msg->header.error); sane = FALSE; } if (sane && ais_data_len(msg) != tmp_size) { crm_warn("Message payload size is incorrect: expected %d, got %d", ais_data_len(msg), tmp_size); sane = TRUE; } if (sane && ais_data_len(msg) == 0) { crm_warn("Message with no payload"); sane = FALSE; } if (sane && data && msg->is_compressed == FALSE) { int str_size = strlen(data) + 1; if (ais_data_len(msg) != str_size) { int lpc = 0; crm_warn("Message payload is corrupted: expected %d bytes, got %d", ais_data_len(msg), str_size); sane = FALSE; for (lpc = (str_size - 10); lpc < msg->size; lpc++) { if (lpc < 0) { lpc = 0; } crm_debug("bad_data[%d]: %d / '%c'", lpc, data[lpc], data[lpc]); } } } if (sane == FALSE) { crm_err("Invalid message %d: (dest=%s:%s, from=%s:%s.%d, compressed=%d, size=%d, total=%d)", msg->id, ais_dest(&(msg->host)), msg_type2text(dest), ais_dest(&(msg->sender)), msg_type2text(msg->sender.type), msg->sender.pid, msg->is_compressed, ais_data_len(msg), msg->header.size); } else { crm_trace ("Verfied message %d: (dest=%s:%s, from=%s:%s.%d, compressed=%d, size=%d, total=%d)", msg->id, ais_dest(&(msg->host)), msg_type2text(dest), ais_dest(&(msg->sender)), msg_type2text(msg->sender.type), msg->sender.pid, msg->is_compressed, ais_data_len(msg), msg->header.size); } return sane; } enum cluster_type_e find_corosync_variant(void) { int rc = CS_OK; cmap_handle_t handle; /* There can be only one (possibility if confdb isn't around) */ rc = cmap_initialize(&handle); if (rc != CS_OK) { crm_info("Failed to initialize the cmap API. Error %d", rc); return pcmk_cluster_unknown; } cmap_finalize(handle); return pcmk_cluster_corosync; } gboolean crm_is_corosync_peer_active(const crm_node_t * node) { if (node == NULL) { crm_trace("NULL"); return FALSE; } else if(safe_str_neq(node->state, CRM_NODE_MEMBER)) { crm_trace("%s: state=%s", node->uname, node->state); return FALSE; } else if((node->processes & crm_proc_cpg) == 0) { crm_trace("%s: processes=%.16x", node->uname, node->processes); return FALSE; } return TRUE; } gboolean corosync_initialize_nodelist(void *cluster, gboolean force_member, xmlNode *xml_parent) { int lpc = 0; int rc = CS_OK; int retries = 0; gboolean any = FALSE; cmap_handle_t cmap_handle; do { rc = cmap_initialize(&cmap_handle); if(rc != CS_OK) { retries++; crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc), retries); sleep(retries); } } while(retries < 5 && rc != CS_OK); if (rc != CS_OK) { crm_warn("Could not connect to Cluster Configuration Database API, error %d", rc); return FALSE; } crm_trace("Initializing corosync nodelist"); for(lpc = 0; ; lpc++) { uint32_t nodeid = 0; char *name = NULL; char *key = NULL; key = g_strdup_printf("nodelist.node.%d.nodeid", lpc); rc = cmap_get_uint32(cmap_handle, key, &nodeid); g_free(key); if(rc != CS_OK) { break; } name = corosync_node_name(cmap_handle, nodeid); if(nodeid > 0 || name != NULL) { crm_trace("Initializing node[%d] %u = %s", lpc, nodeid, name); crm_get_peer(nodeid, name); } if(nodeid > 0 && name != NULL) { any = TRUE; if(xml_parent) { xmlNode *node = create_xml_node(xml_parent, XML_CIB_TAG_NODE); crm_xml_add_int(node, XML_ATTR_ID, nodeid); crm_xml_add(node, XML_ATTR_UNAME, name); if(force_member) { crm_xml_add(node, XML_ATTR_TYPE, CRM_NODE_MEMBER); } } } free(name); } cmap_finalize(cmap_handle); return any; } diff --git a/lib/cluster/membership.c b/lib/cluster/membership.c index 50b8bd3786..94658b85ad 100644 --- a/lib/cluster/membership.c +++ b/lib/cluster/membership.c @@ -1,436 +1,436 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #ifndef _GNU_SOURCE # define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include #include GHashTable *crm_peer_id_cache = NULL; GHashTable *crm_peer_cache = NULL; unsigned long long crm_peer_seq = 0; gboolean crm_have_quorum = FALSE; gboolean crm_is_peer_active(const crm_node_t * node) { #if SUPPORT_COROSYNC if(is_openais_cluster()) { return crm_is_corosync_peer_active(node); } #endif #if SUPPORT_HEARTBEAT if(is_heartbeat_cluster()) { return crm_is_heartbeat_peer_active(node); } #endif crm_err("Unhandled cluster type: %s", name_for_cluster_type(get_cluster_type())); return FALSE; } static gboolean crm_reap_dead_member(gpointer key, gpointer value, gpointer user_data) { crm_node_t *node = value; crm_node_t *search = user_data; if (search != NULL && node->id != search->id) { return FALSE; } else if (crm_is_peer_active(value) == FALSE) { crm_notice("Removing %s/%u from the membership list", node->uname, node->id); return TRUE; } return FALSE; } guint reap_crm_member(uint32_t id) { int matches = 0; crm_node_t *node = g_hash_table_lookup(crm_peer_id_cache, GUINT_TO_POINTER(id)); if (node == NULL) { crm_info("Peer %u is unknown", id); } else if (crm_is_peer_active(node)) { crm_warn("Peer %u/%s is still active", id, node->uname); } else { if (g_hash_table_remove(crm_peer_id_cache, GUINT_TO_POINTER(id))) { crm_notice("Removed dead peer %u from the uuid cache", id); } else { crm_warn("Peer %u/%s was not removed", id, node->uname); } matches = g_hash_table_foreach_remove(crm_peer_cache, crm_reap_dead_member, node); crm_notice("Removed %d dead peers with id=%u from the membership list", matches, id); } return matches; } static void crm_count_peer(gpointer key, gpointer value, gpointer user_data) { guint *count = user_data; crm_node_t *node = value; if (crm_is_peer_active(node)) { *count = *count + 1; } } guint crm_active_peers(void) { guint count = 0; g_hash_table_foreach(crm_peer_cache, crm_count_peer, &count); return count; } void destroy_crm_node(gpointer data) { crm_node_t *node = data; crm_trace("Destroying entry for node %u", node->id); free(node->addr); free(node->uname); free(node->state); free(node->uuid); free(node); } void crm_peer_init(void) { static gboolean initialized = FALSE; if (initialized) { return; } initialized = TRUE; crm_peer_destroy(); if (crm_peer_cache == NULL) { crm_peer_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, destroy_crm_node); } if (crm_peer_id_cache == NULL) { crm_peer_id_cache = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, NULL); } } void crm_peer_destroy(void) { if (crm_peer_cache != NULL) { g_hash_table_destroy(crm_peer_cache); crm_peer_cache = NULL; } if (crm_peer_id_cache != NULL) { g_hash_table_destroy(crm_peer_id_cache); crm_peer_id_cache = NULL; } } void (*crm_status_callback) (enum crm_status_type, crm_node_t *, const void *) = NULL; void crm_set_status_callback(void (*dispatch) (enum crm_status_type, crm_node_t *, const void *)) { crm_status_callback = dispatch; } static crm_node_t * crm_new_peer(unsigned int id, const char *uname) { crm_node_t *node = NULL; CRM_CHECK(uname != NULL || id > 0, return NULL); crm_peer_init(); crm_debug("Creating entry for node %s/%u", uname, id); node = calloc(1, sizeof(crm_node_t)); - node->state = strdup("unknown"); + node->state = NULL; if (id > 0) { node->id = id; crm_info("Node %s now has id: %u", crm_str(uname), id); g_hash_table_replace(crm_peer_id_cache, GUINT_TO_POINTER(node->id), node); } if (uname) { node->uname = strdup(uname); CRM_ASSERT(node->uname != NULL); crm_info("Node %u is now known as %s", id, node->uname); g_hash_table_replace(crm_peer_cache, node->uname, node); if (node->uuid == NULL) { const char *uuid = get_node_uuid(id, node->uname); if (node->uuid) { crm_info("Node %u has uuid %s", id, node->uuid); } else { node->uuid = strdup(uuid); } } if (crm_status_callback) { crm_status_callback(crm_status_uname, node, NULL); } } return node; } crm_node_t * crm_get_peer(unsigned int id, const char *uname) { crm_node_t *node = NULL; CRM_ASSERT(id > 0 || uname != NULL); crm_peer_init(); if (uname != NULL) { node = g_hash_table_lookup(crm_peer_cache, uname); } if (node == NULL && id > 0) { node = g_hash_table_lookup(crm_peer_id_cache, GUINT_TO_POINTER(id)); if (node && node->uname && uname) { crm_crit("Node %s and %s share the same cluster node id '%u'!", node->uname, uname, id); /* NOTE: Calling crm_new_peer() means the entry in * crm_peer_id_cache will point to the new entity */ /* TODO: Replace the old uname instead? */ node = crm_new_peer(id, uname); CRM_ASSERT(node->uname != NULL); } } if (node == NULL) { node = crm_new_peer(id, uname); if(uname) { const char *uuid = get_uuid(uname); crm_update_peer(__FUNCTION__, 0, 0, 0, -1, 0, uuid, uname, NULL, NULL); } } CRM_ASSERT(node); if (node && uname && node->uname == NULL) { node->uname = strdup(uname); crm_info("Node %u is now known as %s", id, uname); g_hash_table_insert(crm_peer_cache, node->uname, node); if (crm_status_callback) { crm_status_callback(crm_status_uname, node, NULL); } } if (node && node->uname && node->uuid == NULL) { const char *uuid = get_node_uuid(id, node->uname); if (node->uuid) { crm_info("Node %u has uuid %s", id, node->uuid); } else if (uuid) { node->uuid = strdup(uuid); } } if (node && id > 0 && id != node->id) { g_hash_table_remove(crm_peer_id_cache, GUINT_TO_POINTER(node->id)); g_hash_table_insert(crm_peer_id_cache, GUINT_TO_POINTER(id), node); node->id = id; crm_info("Node %s now has id: %u", crm_str(uname), id); } return node; } crm_node_t * crm_update_peer(const char *source, unsigned int id, uint64_t born, uint64_t seen, int32_t votes, uint32_t children, const char *uuid, const char *uname, const char *addr, const char *state) { gboolean addr_changed = FALSE; gboolean state_changed = FALSE; gboolean procs_changed = FALSE; gboolean votes_changed = FALSE; crm_node_t *node = NULL; id = get_corosync_id(id, uuid); CRM_CHECK(uname != NULL || id > 0, return NULL); CRM_ASSERT(crm_peer_cache != NULL); CRM_ASSERT(crm_peer_id_cache != NULL); node = crm_get_peer(id, uname); if (node == NULL) { crm_trace("No node found for %d/%s", id, uname); node = crm_new_peer(id, uname); CRM_LOG_ASSERT(node != NULL); if (node == NULL) { crm_err("Insufficient information to create node %d/%s", id, uname); return NULL; } /* do it now so we don't get '(new)' everywhere */ node->votes = votes; node->processes = children; if (addr) { node->addr = strdup(addr); } } if (votes > 0 && node->votes != votes) { votes_changed = TRUE; node->votes = votes; } if (node->uuid == NULL) { if (is_openais_cluster()) { /* Yes, overrule whatever was passed in */ node->uuid = get_corosync_uuid(id, uname); } else if (uuid != NULL) { node->uuid = strdup(uuid); } } if (children > 0 && children != node->processes) { uint32_t last = node->processes; node->processes = children; procs_changed = TRUE; if (crm_status_callback) { crm_status_callback(crm_status_processes, node, &last); } } if (born != 0) { node->born = born; } if (state != NULL && safe_str_neq(node->state, state)) { char *last = node->state; node->state = strdup(state); state_changed = TRUE; if (crm_status_callback) { crm_status_callback(crm_status_nstate, node, last); } free(last); } if (seen != 0 && safe_str_eq(node->state, CRM_NODE_MEMBER)) { node->last_seen = seen; } if (addr != NULL) { if (node->addr == NULL || crm_str_eq(node->addr, addr, FALSE) == FALSE) { addr_changed = TRUE; free(node->addr); node->addr = strdup(addr); } } if (state_changed || addr_changed || votes_changed) { do_crm_log_unlikely(state_changed?LOG_NOTICE:LOG_INFO, "%s: Node %s: id=%u state=%s%s addr=%s%s votes=%d%s born=" U64T " seen=" U64T " proc=%.32x%s", source, node->uname, node->id, node->state, state_changed ? " (new)" : "", node->addr, addr_changed ? " (new)" : "", node->votes, votes_changed ? " (new)" : "", node->born, node->last_seen, node->processes, procs_changed ? " (new)" : ""); } else if (procs_changed) { crm_debug("%s: Node %s: id=%u seen=" U64T " proc=%.32x (new)", source, node->uname, node->id, node->last_seen, node->processes); } return node; } void crm_update_peer_proc(const char *source, crm_node_t *node, uint32_t flag, const char *status) { uint32_t last = 0; gboolean changed = FALSE; CRM_CHECK(node != NULL, crm_err("%s: Could not set %s to %s for NULL", source, peer2text(flag), status); return); last = node->processes; if (safe_str_eq(status, ONLINESTATUS)) { if ((node->processes & flag) == 0) { set_bit(node->processes, flag); changed = TRUE; } } else if (node->processes & flag) { clear_bit(node->processes, flag); changed = TRUE; } if (changed) { crm_info("%s: %s.%d.%s is now %s", source, node->uname, node->id, peer2text(flag), status); if (crm_status_callback) { crm_status_callback(crm_status_processes, node, &last); } } else { crm_trace("%s: %s.%d.%s is unchanged %s", source, node->uname, node->id, peer2text(flag), status); } } int crm_terminate_member(int nodeid, const char *uname, void * unused) { /* Always use the synchronous, non-mainloop version */ return stonith_api_kick(nodeid, uname, 120, TRUE); } int crm_terminate_member_no_mainloop(int nodeid, const char *uname, int *connection) { return stonith_api_kick(nodeid, uname, 120, TRUE); } diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index df23dcad5b..7ee74957b2 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -1,2287 +1,2289 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include CRM_TRACE_INIT_DATA(pe_status); #define set_config_flag(data_set, option, flag) do { \ const char *tmp = pe_pref(data_set->config_hash, option); \ if(tmp) { \ if(crm_is_true(tmp)) { \ set_bit(data_set->flags, flag); \ } else { \ clear_bit(data_set->flags, flag); \ } \ } \ } while(0) gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, GListPtr next, enum action_fail_response *failed, pe_working_set_t * data_set); static void pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) { CRM_CHECK(node, return); if (node->details->unclean == FALSE) { if (is_set(data_set->flags, pe_flag_stonith_enabled)) { crm_warn("Node %s will be fenced %s", node->details->uname, reason); } else { crm_warn("Node %s is unclean %s", node->details->uname, reason); } } node->details->unclean = TRUE; } gboolean unpack_config(xmlNode * config, pe_working_set_t * data_set) { const char *value = NULL; GHashTable *config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); data_set->config_hash = config_hash; unpack_instance_attributes(data_set->input, config, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, data_set->now); verify_pe_options(data_set->config_hash); set_config_flag(data_set, "enable-startup-probes", pe_flag_startup_probes); crm_info("Startup probes: %s", is_set(data_set->flags, pe_flag_startup_probes) ? "enabled" : "disabled (dangerous)"); value = pe_pref(data_set->config_hash, "stonith-timeout"); data_set->stonith_timeout = crm_get_msec(value); crm_debug("STONITH timeout: %d", data_set->stonith_timeout); set_config_flag(data_set, "stonith-enabled", pe_flag_stonith_enabled); crm_debug("STONITH of failed nodes is %s", is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled"); data_set->stonith_action = pe_pref(data_set->config_hash, "stonith-action"); crm_trace("STONITH will %s nodes", data_set->stonith_action); set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything); crm_debug("Stop all active resources: %s", is_set(data_set->flags, pe_flag_stop_everything) ? "true" : "false"); set_config_flag(data_set, "symmetric-cluster", pe_flag_symmetric_cluster); if (is_set(data_set->flags, pe_flag_symmetric_cluster)) { crm_debug("Cluster is symmetric" " - resources can run anywhere by default"); } value = pe_pref(data_set->config_hash, "default-resource-stickiness"); data_set->default_resource_stickiness = char2score(value); crm_debug("Default stickiness: %d", data_set->default_resource_stickiness); value = pe_pref(data_set->config_hash, "no-quorum-policy"); if (safe_str_eq(value, "ignore")) { data_set->no_quorum_policy = no_quorum_ignore; } else if (safe_str_eq(value, "freeze")) { data_set->no_quorum_policy = no_quorum_freeze; } else if (safe_str_eq(value, "suicide")) { gboolean do_panic = FALSE; crm_element_value_int(data_set->input, XML_ATTR_QUORUM_PANIC, &do_panic); if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) { crm_config_err ("Setting no-quorum-policy=suicide makes no sense if stonith-enabled=false"); } if (do_panic && is_set(data_set->flags, pe_flag_stonith_enabled)) { data_set->no_quorum_policy = no_quorum_suicide; } else if (is_set(data_set->flags, pe_flag_have_quorum) == FALSE && do_panic == FALSE) { crm_notice("Resetting no-quorum-policy to 'stop': The cluster has never had quorum"); data_set->no_quorum_policy = no_quorum_stop; } } else { data_set->no_quorum_policy = no_quorum_stop; } switch (data_set->no_quorum_policy) { case no_quorum_freeze: crm_debug("On loss of CCM Quorum: Freeze resources"); break; case no_quorum_stop: crm_debug("On loss of CCM Quorum: Stop ALL resources"); break; case no_quorum_suicide: crm_notice("On loss of CCM Quorum: Fence all remaining nodes"); break; case no_quorum_ignore: crm_notice("On loss of CCM Quorum: Ignore"); break; } set_config_flag(data_set, "stop-orphan-resources", pe_flag_stop_rsc_orphans); crm_trace("Orphan resources are %s", is_set(data_set->flags, pe_flag_stop_rsc_orphans) ? "stopped" : "ignored"); set_config_flag(data_set, "stop-orphan-actions", pe_flag_stop_action_orphans); crm_trace("Orphan resource actions are %s", is_set(data_set->flags, pe_flag_stop_action_orphans) ? "stopped" : "ignored"); set_config_flag(data_set, "remove-after-stop", pe_flag_remove_after_stop); crm_trace("Stopped resources are removed from the status section: %s", is_set(data_set->flags, pe_flag_remove_after_stop) ? "true" : "false"); set_config_flag(data_set, "maintenance-mode", pe_flag_maintenance_mode); crm_trace("Maintenance mode: %s", is_set(data_set->flags, pe_flag_maintenance_mode) ? "true" : "false"); if (is_set(data_set->flags, pe_flag_maintenance_mode)) { clear_bit(data_set->flags, pe_flag_is_managed_default); } else { set_config_flag(data_set, "is-managed-default", pe_flag_is_managed_default); } crm_trace("By default resources are %smanaged", is_set(data_set->flags, pe_flag_is_managed_default) ? "" : "not "); set_config_flag(data_set, "start-failure-is-fatal", pe_flag_start_failure_fatal); crm_trace("Start failures are %s", is_set(data_set->flags, pe_flag_start_failure_fatal) ? "always fatal" : "handled by failcount"); node_score_red = char2score(pe_pref(data_set->config_hash, "node-health-red")); node_score_green = char2score(pe_pref(data_set->config_hash, "node-health-green")); node_score_yellow = char2score(pe_pref(data_set->config_hash, "node-health-yellow")); crm_info("Node scores: 'red' = %s, 'yellow' = %s, 'green' = %s", pe_pref(data_set->config_hash, "node-health-red"), pe_pref(data_set->config_hash, "node-health-yellow"), pe_pref(data_set->config_hash, "node-health-green")); data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy"); crm_trace("Placement strategy: %s", data_set->placement_strategy); return TRUE; } gboolean unpack_nodes(xmlNode * xml_nodes, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; node_t *new_node = NULL; const char *id = NULL; const char *uname = NULL; const char *type = NULL; const char *score = NULL; gboolean unseen_are_unclean = TRUE; const char *blind_faith = pe_pref(data_set->config_hash, "startup-fencing"); if (crm_is_true(blind_faith) == FALSE) { unseen_are_unclean = FALSE; crm_warn("Blind faith: not fencing unseen nodes"); } for (xml_obj = __xml_first_child(xml_nodes); xml_obj != NULL; xml_obj = __xml_next(xml_obj)) { if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_NODE, TRUE)) { new_node = NULL; id = crm_element_value(xml_obj, XML_ATTR_ID); uname = crm_element_value(xml_obj, XML_ATTR_UNAME); type = crm_element_value(xml_obj, XML_ATTR_TYPE); score = crm_element_value(xml_obj, XML_RULE_ATTR_SCORE); crm_trace("Processing node %s/%s", uname, id); if (id == NULL) { crm_config_err("Must specify id tag in "); continue; } if (pe_find_node(data_set->nodes, uname) != NULL) { crm_config_warn("Detected multiple node entries with uname=%s" " - this is rarely intended", uname); } new_node = calloc(1, sizeof(node_t)); if (new_node == NULL) { return FALSE; } new_node->weight = char2score(score); new_node->fixed = FALSE; new_node->details = calloc(1, sizeof(struct node_shared_s)); if (new_node->details == NULL) { free(new_node); return FALSE; } crm_trace("Creaing node for entry %s/%s", uname, id); new_node->details->id = id; new_node->details->uname = uname; new_node->details->type = node_ping; new_node->details->online = FALSE; new_node->details->shutdown = FALSE; new_node->details->running_rsc = NULL; new_node->details->attrs = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); new_node->details->utilization = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); /* if(data_set->have_quorum == FALSE */ /* && data_set->no_quorum_policy == no_quorum_stop) { */ /* /\* start shutting resources down *\/ */ /* new_node->weight = -INFINITY; */ /* } */ if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE || unseen_are_unclean == FALSE) { /* blind faith... */ new_node->details->unclean = FALSE; } else { /* all nodes are unclean until we've seen their * status entry */ new_node->details->unclean = TRUE; } if (type == NULL || safe_str_eq(type, "member") || safe_str_eq(type, NORMALNODE)) { new_node->details->type = node_member; } add_node_attrs(xml_obj, new_node, FALSE, data_set); unpack_instance_attributes(data_set->input, xml_obj, XML_TAG_UTILIZATION, NULL, new_node->details->utilization, NULL, FALSE, data_set->now); data_set->nodes = g_list_append(data_set->nodes, new_node); crm_trace("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME)); } } return TRUE; } static void g_hash_destroy_node_list(gpointer data) { GListPtr domain = data; g_list_free_full(domain, free); } gboolean unpack_domains(xmlNode * xml_domains, pe_working_set_t * data_set) { const char *id = NULL; GListPtr domain = NULL; xmlNode *xml_node = NULL; xmlNode *xml_domain = NULL; crm_info("Unpacking domains"); data_set->domains = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_node_list); for (xml_domain = __xml_first_child(xml_domains); xml_domain != NULL; xml_domain = __xml_next(xml_domain)) { if (crm_str_eq((const char *)xml_domain->name, XML_CIB_TAG_DOMAIN, TRUE)) { domain = NULL; id = crm_element_value(xml_domain, XML_ATTR_ID); for (xml_node = __xml_first_child(xml_domain); xml_node != NULL; xml_node = __xml_next(xml_node)) { if (crm_str_eq((const char *)xml_node->name, XML_CIB_TAG_NODE, TRUE)) { node_t *copy = NULL; node_t *node = NULL; const char *uname = crm_element_value(xml_node, "name"); const char *score = crm_element_value(xml_node, XML_RULE_ATTR_SCORE); if (uname == NULL) { crm_config_err("Invalid domain %s: Must specify id tag in ", id); continue; } node = pe_find_node(data_set->nodes, uname); if (node == NULL) { node = pe_find_node_id(data_set->nodes, uname); } if (node == NULL) { crm_config_warn("Invalid domain %s: Node %s does not exist", id, uname); continue; } copy = node_copy(node); copy->weight = char2score(score); crm_debug("Adding %s to domain %s with score %s", node->details->uname, id, score); domain = g_list_prepend(domain, copy); } } if (domain) { crm_debug("Created domain %s with %d members", id, g_list_length(domain)); g_hash_table_replace(data_set->domains, strdup(id), domain); } } } return TRUE; } static void destroy_template_rsc_set(gpointer data) { xmlNode *rsc_set = data; free_xml(rsc_set); } gboolean unpack_resources(xmlNode * xml_resources, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; data_set->template_rsc_sets = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, destroy_template_rsc_set); for (xml_obj = __xml_first_child(xml_resources); xml_obj != NULL; xml_obj = __xml_next(xml_obj)) { resource_t *new_rsc = NULL; if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RSC_TEMPLATE, TRUE)) { const char *template_id = ID(xml_obj); if (template_id && g_hash_table_lookup_extended(data_set->template_rsc_sets, template_id, NULL, NULL) == FALSE) { /* Record the template's ID for the knowledge of its existence anyway. */ g_hash_table_insert(data_set->template_rsc_sets, strdup(template_id), NULL); } continue; } crm_trace("Beginning unpack... <%s id=%s... >", crm_element_name(xml_obj), ID(xml_obj)); if (common_unpack(xml_obj, &new_rsc, NULL, data_set)) { data_set->resources = g_list_append(data_set->resources, new_rsc); print_resource(LOG_DEBUG_3, "Added", new_rsc, FALSE); } else { crm_config_err("Failed unpacking %s %s", crm_element_name(xml_obj), crm_element_value(xml_obj, XML_ATTR_ID)); if (new_rsc != NULL && new_rsc->fns != NULL) { new_rsc->fns->free(new_rsc); } } } data_set->resources = g_list_sort(data_set->resources, sort_rsc_priority); if (is_set(data_set->flags, pe_flag_stonith_enabled) && is_set(data_set->flags, pe_flag_have_stonith_resource) == FALSE) { crm_config_err("Resource start-up disabled since no STONITH resources have been defined"); crm_config_err("Either configure some or disable STONITH with the stonith-enabled option"); crm_config_err("NOTE: Clusters with shared data need STONITH to ensure data integrity"); } return TRUE; } /* The ticket state section: * "/cib/status/tickets/ticket_state" */ static gboolean unpack_ticket_state(xmlNode * xml_ticket, pe_working_set_t * data_set) { const char *ticket_id = NULL; const char *granted = NULL; const char *last_granted = NULL; const char *standby = NULL; xmlAttrPtr xIter = NULL; ticket_t *ticket = NULL; ticket_id = ID(xml_ticket); if (ticket_id == NULL || strlen(ticket_id) == 0) { return FALSE; } crm_trace("Processing ticket state for %s", ticket_id); ticket = g_hash_table_lookup(data_set->tickets, ticket_id); if (ticket == NULL) { ticket = ticket_new(ticket_id, data_set); if (ticket == NULL) { return FALSE; } } for (xIter = xml_ticket->properties; xIter; xIter = xIter->next) { const char *prop_name = (const char *)xIter->name; const char *prop_value = crm_element_value(xml_ticket, prop_name); if(crm_str_eq(prop_name, XML_ATTR_ID, TRUE)) { continue; } g_hash_table_replace(ticket->state, strdup(prop_name), strdup(prop_value)); } granted = g_hash_table_lookup(ticket->state, "granted"); if (granted && crm_is_true(granted)) { ticket->granted = TRUE; crm_info("We have ticket '%s'", ticket->id); } else { ticket->granted = FALSE; crm_info("We do not have ticket '%s'", ticket->id); } last_granted = g_hash_table_lookup(ticket->state, "last-granted"); if (last_granted) { ticket->last_granted = crm_parse_int(last_granted, 0); } standby = g_hash_table_lookup(ticket->state, "standby"); if (standby && crm_is_true(standby)) { ticket->standby = TRUE; if (ticket->granted) { crm_info("Granted ticket '%s' is in standby-mode", ticket->id); } } else { ticket->standby = FALSE; } crm_trace("Done with ticket state for %s", ticket_id); return TRUE; } static gboolean unpack_tickets_state(xmlNode * xml_tickets, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; for (xml_obj = __xml_first_child(xml_tickets); xml_obj != NULL; xml_obj = __xml_next(xml_obj)) { if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_TICKET_STATE, TRUE) == FALSE) { continue; } unpack_ticket_state(xml_obj, data_set); } return TRUE; } /* Compatibility with the deprecated ticket state section: * "/cib/status/tickets/instance_attributes" */ static void get_ticket_state_legacy(gpointer key, gpointer value, gpointer user_data) { const char *long_key = key; char *state_key = NULL; const char *granted_prefix = "granted-ticket-"; const char *last_granted_prefix = "last-granted-"; static int granted_prefix_strlen = 0; static int last_granted_prefix_strlen = 0; const char *ticket_id = NULL; const char *is_granted = NULL; const char *last_granted = NULL; const char *sep = NULL; ticket_t *ticket = NULL; pe_working_set_t *data_set = user_data; if (granted_prefix_strlen == 0) { granted_prefix_strlen = strlen(granted_prefix); } if (last_granted_prefix_strlen == 0) { last_granted_prefix_strlen = strlen(last_granted_prefix); } if (strstr(long_key, granted_prefix) == long_key) { ticket_id = long_key + granted_prefix_strlen; if (strlen(ticket_id)) { state_key = strdup("granted"); is_granted = value; } } else if (strstr(long_key, last_granted_prefix) == long_key) { ticket_id = long_key + last_granted_prefix_strlen; if (strlen(ticket_id)) { state_key = strdup("last-granted"); last_granted = value; } } else if ((sep = strrchr(long_key, '-'))) { ticket_id = sep + 1; state_key = strndup(long_key, strlen(long_key) - strlen(sep)); } if (ticket_id == NULL || strlen(ticket_id) == 0) { free(state_key); return; } if (state_key == NULL || strlen(state_key) == 0) { free(state_key); return; } ticket = g_hash_table_lookup(data_set->tickets, ticket_id); if (ticket == NULL) { ticket = ticket_new(ticket_id, data_set); if (ticket == NULL) { free(state_key); return; } } g_hash_table_replace(ticket->state, state_key, strdup(value)); if (is_granted) { if (crm_is_true(is_granted)) { ticket->granted = TRUE; crm_info("We have ticket '%s'", ticket->id); } else { ticket->granted = FALSE; crm_info("We do not have ticket '%s'", ticket->id); } } else if (last_granted) { ticket->last_granted = crm_parse_int(last_granted, 0); } } /* remove nodes that are down, stopping */ /* create +ve rsc_to_node constraints between resources and the nodes they are running on */ /* anything else? */ gboolean unpack_status(xmlNode * status, pe_working_set_t * data_set) { const char *id = NULL; xmlNode *state = NULL; xmlNode *lrm_rsc = NULL; node_t *this_node = NULL; crm_trace("Beginning unpack"); if (data_set->tickets == NULL) { data_set->tickets = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, destroy_ticket); } for (state = __xml_first_child(status); state != NULL; state = __xml_next(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_TICKETS, TRUE)) { xmlNode *xml_tickets = state; GHashTable *state_hash = NULL; /* Compatibility with the deprecated ticket state section: * Unpack the attributes in the deprecated "/cib/status/tickets/instance_attributes" if it exists. */ state_hash = g_hash_table_new_full( crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes(data_set->input, xml_tickets, XML_TAG_ATTR_SETS, NULL, state_hash, NULL, TRUE, data_set->now); g_hash_table_foreach(state_hash, get_ticket_state_legacy, data_set); if (state_hash) { g_hash_table_destroy(state_hash); } /* Unpack the new "/cib/status/tickets/ticket_state"s */ unpack_tickets_state(xml_tickets, data_set); } if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE)) { const char *uname = crm_element_value(state, XML_ATTR_UNAME); xmlNode *attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); id = crm_element_value(state, XML_ATTR_ID); crm_trace("Processing node id=%s, uname=%s", id, uname); this_node = pe_find_node_id(data_set->nodes, id); if (uname == NULL) { /* error */ continue; } else if (this_node == NULL) { crm_config_warn("Node %s in status section no longer exists", uname); continue; } /* Mark the node as provisionally clean * - at least we have seen it in the current cluster's lifetime */ this_node->details->unclean = FALSE; add_node_attrs(attrs, this_node, TRUE, data_set); if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "standby"))) { crm_info("Node %s is in standby-mode", this_node->details->uname); this_node->details->standby = TRUE; } crm_trace("determining node state"); determine_online_status(state, this_node, data_set); if (this_node->details->online && data_set->no_quorum_policy == no_quorum_suicide) { /* Everything else should flow from this automatically * At least until the PE becomes able to migrate off healthy resources */ pe_fence_node(data_set, this_node, "because the cluster does not have quorum"); } } } /* Now that we know all node states, we can safely handle migration ops * But, for now, only process healthy nodes * - this is necessary for the logic in bug lf#2508 to function correctly */ for (state = __xml_first_child(status); state != NULL; state = __xml_next(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { continue; } id = crm_element_value(state, XML_ATTR_ID); this_node = pe_find_node_id(data_set->nodes, id); if (this_node == NULL) { crm_info("Node %s is unknown", id); continue; } else if (this_node->details->online) { crm_trace("Processing lrm resource entries on healthy node: %s", this_node->details->uname); lrm_rsc = find_xml_node(state, XML_CIB_TAG_LRM, FALSE); lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE); unpack_lrm_resources(this_node, lrm_rsc, data_set); } } /* Now handle failed nodes - but only if stonith is enabled * * By definition, offline nodes run no resources so there is nothing to do. * Only when stonith is enabled do we need to know what is on the node to * ensure rsc start events happen after the stonith */ for (state = __xml_first_child(status); state != NULL && is_set(data_set->flags, pe_flag_stonith_enabled); state = __xml_next(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { continue; } id = crm_element_value(state, XML_ATTR_ID); this_node = pe_find_node_id(data_set->nodes, id); if (this_node == NULL || this_node->details->online) { continue; } else { crm_trace("Processing lrm resource entries on unhealthy node: %s", this_node->details->uname); lrm_rsc = find_xml_node(state, XML_CIB_TAG_LRM, FALSE); lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE); unpack_lrm_resources(this_node, lrm_rsc, data_set); } } return TRUE; } static gboolean determine_online_status_no_fencing(pe_working_set_t * data_set, xmlNode * node_state, node_t * this_node) { gboolean online = FALSE; const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE); const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER); const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER); const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); if (!crm_is_true(in_cluster)) { crm_trace("Node is down: in_cluster=%s", crm_str(in_cluster)); } else if (safe_str_eq(is_peer, ONLINESTATUS)) { if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) { online = TRUE; } else { crm_debug("Node is not ready to run resources: %s", join); } } else if (this_node->details->expected_up == FALSE) { crm_trace("CRMd is down: in_cluster=%s", crm_str(in_cluster)); crm_trace("\tis_peer=%s, join=%s, expected=%s", crm_str(is_peer), crm_str(join), crm_str(exp_state)); } else { /* mark it unclean */ pe_fence_node(data_set, this_node, "because it is partially and/or un-expectedly down"); crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s", crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state)); } return online; } static gboolean determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_state, node_t * this_node) { gboolean online = FALSE; gboolean do_terminate = FALSE; const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE); const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER); const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER); const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); const char *terminate = g_hash_table_lookup(this_node->details->attrs, "terminate"); /* - XML_NODE_IN_CLUSTER ::= true|false - XML_NODE_IS_PEER ::= true|false|online|offline - XML_NODE_JOIN_STATE ::= member|down|pending|banned - XML_NODE_EXPECTED ::= member|down */ if (crm_is_true(terminate)) { do_terminate = TRUE; } else if (terminate != NULL && strlen(terminate) > 0) { /* could be a time() value */ char t = terminate[0]; if (t != '0' && isdigit(t)) { do_terminate = TRUE; } } crm_trace("%s: in_cluster=%s, is_peer=%s, join=%s, expected=%s, term=%d", this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state), do_terminate); online = crm_is_true(in_cluster); if(safe_str_eq(is_peer, ONLINESTATUS)) { is_peer = XML_BOOLEAN_YES; } if(exp_state == NULL) { exp_state = CRMD_JOINSTATE_DOWN; } if(this_node->details->shutdown) { crm_debug("%s is shutting down", this_node->details->uname); online = crm_is_true(is_peer); /* Slightly different criteria since we cant shut down a dead peer */ } else if(do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) { - /* TO-DO: Have the crmd no longer pre-set CRMD_JOINSTATE_DOWN for shutdown */ - if(crm_is_true(in_cluster) || crm_is_true(is_peer)) { + if(in_cluster == NULL) { + pe_fence_node(data_set, this_node, "because the peer has not been seen by the cluster"); + + } else if(crm_is_true(in_cluster) || crm_is_true(is_peer)) { crm_info("- Node %s is not ready to run resources", this_node->details->uname); this_node->details->standby = TRUE; this_node->details->pending = TRUE; } else { crm_trace("%s is down or still coming up", this_node->details->uname); } } else if(crm_is_true(in_cluster) == FALSE) { pe_fence_node(data_set, this_node, "because the node was not part of the cluster"); } else if(crm_is_true(is_peer) == FALSE) { pe_fence_node(data_set, this_node, "because our peer process was not available"); /* Everything is running at this point, now check join state */ } else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) { pe_fence_node(data_set, this_node, "because it failed the pacemaker membership criteria"); } else if (do_terminate) { pe_fence_node(data_set, this_node, "because termination was requested"); } else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) { crm_info("Node %s is active", this_node->details->uname); } else if (safe_str_eq(join, CRMD_JOINSTATE_PENDING)) { crm_info("+ Node %s is not ready to run resources", this_node->details->uname); this_node->details->standby = TRUE; this_node->details->pending = TRUE; } else { pe_fence_node(data_set, this_node, "because the peer was in an unknown state"); crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d", this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state), do_terminate, this_node->details->shutdown); } return online; } gboolean determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set) { gboolean online = FALSE; const char *shutdown = NULL; const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); if (this_node == NULL) { crm_config_err("No node to check"); return online; } this_node->details->shutdown = FALSE; this_node->details->expected_up = FALSE; shutdown = g_hash_table_lookup(this_node->details->attrs, XML_CIB_ATTR_SHUTDOWN); crm_trace("Shutdown: %s", shutdown); if (shutdown != NULL && safe_str_neq("0", shutdown)) { this_node->details->shutdown = TRUE; } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) { this_node->details->expected_up = TRUE; } if(this_node->details->type != node_member) { this_node->details->unclean = FALSE; online = FALSE; /* As far as resource management is concerned, * the node is safely offline. * Anyone caught abusing this logic will be shot */ } if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) { online = determine_online_status_no_fencing(data_set, node_state, this_node); } else { online = determine_online_status_fencing(data_set, node_state, this_node); } if (online) { this_node->details->online = TRUE; } else { /* remove node from contention */ this_node->fixed = TRUE; this_node->weight = -INFINITY; } if (online && this_node->details->shutdown) { /* dont run resources here */ this_node->fixed = TRUE; this_node->weight = -INFINITY; } if (this_node->details->unclean) { pe_proc_warn("Node %s is unclean", this_node->details->uname); } else if (this_node->details->online) { crm_info("Node %s is %s", this_node->details->uname, this_node->details->shutdown ? "shutting down" : this_node->details->pending ? "pending" : this_node->details->standby ? "standby" : "online"); } else { crm_trace("Node %s is offline", this_node->details->uname); } return online; } char * clone_strip(const char *last_rsc_id) { int lpc = 0; char *zero = NULL; CRM_CHECK(last_rsc_id != NULL, return NULL); if (last_rsc_id != NULL) { lpc = strlen(last_rsc_id); } while (--lpc > 0) { switch (last_rsc_id[lpc]) { case 0: return NULL; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case ':': zero = calloc(1, lpc + 1); memcpy(zero, last_rsc_id, lpc); zero[lpc] = 0; return zero; default: zero = strdup(last_rsc_id); return zero; } } return NULL; } char * clone_zero(const char *last_rsc_id) { int lpc = 0; char *zero = NULL; CRM_CHECK(last_rsc_id != NULL, return NULL); if (last_rsc_id != NULL) { lpc = strlen(last_rsc_id); } while (--lpc > 0) { switch (last_rsc_id[lpc]) { case 0: return NULL; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case ':': zero = calloc(1, lpc + 3); memcpy(zero, last_rsc_id, lpc); zero[lpc] = ':'; zero[lpc + 1] = '0'; zero[lpc + 2] = 0; return zero; default: lpc = strlen(last_rsc_id); zero = calloc(1, lpc + 3); memcpy(zero, last_rsc_id, lpc); zero[lpc] = ':'; zero[lpc + 1] = '0'; zero[lpc + 2] = 0; crm_trace("%s -> %s", last_rsc_id, zero); return zero; } } return NULL; } static resource_t * create_fake_resource(const char *rsc_id, xmlNode * rsc_entry, pe_working_set_t * data_set) { resource_t *rsc = NULL; xmlNode *xml_rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE); copy_in_properties(xml_rsc, rsc_entry); crm_xml_add(xml_rsc, XML_ATTR_ID, rsc_id); crm_log_xml_debug(xml_rsc, "Orphan resource"); if (!common_unpack(xml_rsc, &rsc, NULL, data_set)) { return NULL; } set_bit(rsc->flags, pe_rsc_orphan); data_set->resources = g_list_append(data_set->resources, rsc); return rsc; } extern resource_t *create_child_clone(resource_t * rsc, int sub_id, pe_working_set_t * data_set); static resource_t * find_anonymous_clone(pe_working_set_t * data_set, node_t * node, resource_t * parent, const char *rsc_id) { GListPtr rIter = NULL; resource_t *rsc = NULL; gboolean skip_inactive = FALSE; CRM_ASSERT(parent != NULL); CRM_ASSERT(parent->variant == pe_clone || parent->variant == pe_master); CRM_ASSERT(is_not_set(parent->flags, pe_rsc_unique)); /* Find an instance active (or partially active for grouped clones) on the specified node */ crm_trace("Looking for %s on %s in %s", rsc_id, node->details->uname, parent->id); for(rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) { GListPtr nIter = NULL; GListPtr locations = NULL; resource_t *child = rIter->data; child->fns->location(child, &locations, TRUE); if (locations == NULL) { crm_trace("Resource %s, skip inactive", child->id); continue; } for(nIter = locations; nIter && rsc == NULL; nIter = nIter->next) { node_t *childnode = nIter->data; if(childnode->details == node->details) { /* ->find_rsc() because we might be a cloned group */ rsc = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone); crm_trace("Resource %s, active", rsc->id); } /* Keep this block, it means we'll do the right thing if * anyone toggles the unique flag to 'off' */ if(rsc && rsc->running_on) { crm_notice("/Anonymous/ clone %s is already running on %s", parent->id, node->details->uname); skip_inactive = TRUE; rsc = NULL; } } g_list_free(locations); } /* Find an inactive instance */ if(skip_inactive == FALSE) { crm_trace("Looking for %s anywhere", rsc_id); for(rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) { GListPtr locations = NULL; resource_t *child = rIter->data; child->fns->location(child, &locations, TRUE); if (locations == NULL) { /* ->find_rsc() because we might be a cloned group */ rsc = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone); crm_trace("Resource %s, empty slot", rsc->id); } g_list_free(locations); } } if (rsc == NULL) { /* Create an extra orphan */ resource_t *top = create_child_clone(parent, -1, data_set); /* ->find_rsc() because we might be a cloned group */ rsc = top->fns->find_rsc(top, rsc_id, NULL, pe_find_clone); CRM_ASSERT(rsc != NULL); crm_debug("Created orphan %s for %s: %s on %s", top->id, parent->id, rsc_id, node->details->uname); } if (safe_str_neq(rsc_id, rsc->id)) { crm_info("Internally renamed %s on %s to %s%s", rsc_id, node->details->uname, rsc->id, is_set(rsc->flags, pe_rsc_orphan) ? " (ORPHAN)" : ""); } return rsc; } static resource_t * unpack_find_resource(pe_working_set_t * data_set, node_t * node, const char *rsc_id, xmlNode * rsc_entry) { resource_t *rsc = NULL; resource_t *parent = NULL; char *alt_rsc_id = strdup(rsc_id); crm_trace("looking for %s", rsc_id); rsc = pe_find_resource(data_set->resources, alt_rsc_id); /* no match */ if (rsc == NULL) { /* Even when clone-max=0, we still create a single :0 orphan to match against */ char *tmp = clone_zero(alt_rsc_id); resource_t *clone0 = pe_find_resource(data_set->resources, tmp); parent = uber_parent(clone0); free(tmp); crm_trace("%s not found: %s", alt_rsc_id, parent ? parent->id : "orphan"); } else { parent = uber_parent(rsc); } if (parent && parent->variant > pe_group) { if(is_not_set(parent->flags, pe_rsc_unique)) { char *base = clone_strip(rsc_id); rsc = find_anonymous_clone(data_set, node, parent, base); CRM_ASSERT(rsc != NULL); free(base); } if (safe_str_neq(rsc_id, rsc->id)) { free(rsc->clone_name); rsc->clone_name = strdup(rsc_id); } } free(alt_rsc_id); return rsc; } static resource_t * process_orphan_resource(xmlNode * rsc_entry, node_t * node, pe_working_set_t * data_set) { resource_t *rsc = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); crm_debug("Detected orphan resource %s on %s", rsc_id, node->details->uname); rsc = create_fake_resource(rsc_id, rsc_entry, data_set); if (is_set(data_set->flags, pe_flag_stop_rsc_orphans) == FALSE) { clear_bit(rsc->flags, pe_rsc_managed); } else { GListPtr gIter = NULL; print_resource(LOG_DEBUG_3, "Added orphan", rsc, FALSE); CRM_CHECK(rsc != NULL, return NULL); resource_location(rsc, NULL, -INFINITY, "__orphan_dont_run__", data_set); for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { node_t *node = (node_t *) gIter->data; if (node->details->online && get_failcount(node, rsc, NULL, data_set)) { action_t *clear_op = NULL; action_t *ready = get_pseudo_op(CRM_OP_PROBED, data_set); clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'), CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); crm_info("Clearing failcount (%d) for orphaned resource %s on %s (%s)", get_failcount(node, rsc, NULL, data_set), rsc->id, node->details->uname, clear_op->uuid); order_actions(clear_op, ready, pe_order_optional); } } } return rsc; } static void process_rsc_state(resource_t * rsc, node_t * node, enum action_fail_response on_fail, xmlNode * migrate_op, pe_working_set_t * data_set) { crm_trace("Resource %s is %s on %s: on_fail=%s", rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail)); /* process current state */ if (rsc->role != RSC_ROLE_UNKNOWN) { resource_t *iter = rsc; while (iter) { if (g_hash_table_lookup(iter->known_on, node->details->id) == NULL) { node_t *n = node_copy(node); crm_trace("%s (aka. %s) known on %s", rsc->id, rsc->clone_name, n->details->uname); g_hash_table_insert(iter->known_on, (gpointer) n->details->id, n); } if (is_set(iter->flags, pe_rsc_unique)) { break; } iter = iter->parent; } } if (node->details->unclean) { /* No extra processing needed * Also allows resources to be started again after a node is shot */ on_fail = action_fail_ignore; } switch (on_fail) { case action_fail_ignore: /* nothing to do */ break; case action_fail_fence: /* treat it as if it is still running * but also mark the node as unclean */ pe_fence_node(data_set, node, "to recover from resource failure(s)"); break; case action_fail_standby: node->details->standby = TRUE; node->details->standby_onfail = TRUE; break; case action_fail_block: /* is_managed == FALSE will prevent any * actions being sent for the resource */ clear_bit(rsc->flags, pe_rsc_managed); set_bit(rsc->flags, pe_rsc_block); break; case action_fail_migrate: /* make sure it comes up somewhere else * or not at all */ resource_location(rsc, node, -INFINITY, "__action_migration_auto__", data_set); break; case action_fail_stop: rsc->next_role = RSC_ROLE_STOPPED; break; case action_fail_recover: if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { set_bit(rsc->flags, pe_rsc_failed); stop_action(rsc, node, FALSE); } break; } if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { if (is_set(rsc->flags, pe_rsc_orphan)) { if (is_set(rsc->flags, pe_rsc_managed)) { crm_config_warn("Detected active orphan %s running on %s", rsc->id, node->details->uname); } else { crm_config_warn("Cluster configured not to stop active orphans." " %s must be stopped manually on %s", rsc->id, node->details->uname); } } native_add_running(rsc, node, data_set); if (on_fail != action_fail_ignore) { set_bit(rsc->flags, pe_rsc_failed); } } else if (rsc->clone_name) { crm_trace("Resetting clone_name %s for %s (stopped)", rsc->clone_name, rsc->id); free(rsc->clone_name); rsc->clone_name = NULL; } else { char *key = stop_key(rsc); GListPtr possible_matches = find_actions(rsc->actions, key, node); GListPtr gIter = possible_matches; for (; gIter != NULL; gIter = gIter->next) { action_t *stop = (action_t *) gIter->data; stop->flags |= pe_action_optional; } free(key); } } /* create active recurring operations as optional */ static void process_recurring(node_t * node, resource_t * rsc, int start_index, int stop_index, GListPtr sorted_op_list, pe_working_set_t * data_set) { int counter = -1; const char *task = NULL; const char *status = NULL; GListPtr gIter = sorted_op_list; crm_trace("%s: Start index %d, stop index = %d", rsc->id, start_index, stop_index); for (; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; int interval = 0; char *key = NULL; const char *id = ID(rsc_op); const char *interval_s = NULL; counter++; if (node->details->online == FALSE) { crm_trace("Skipping %s/%s: node is offline", rsc->id, node->details->uname); break; /* Need to check if there's a monitor for role="Stopped" */ } else if (start_index < stop_index && counter <= stop_index) { crm_trace("Skipping %s/%s: resource is not active", id, node->details->uname); continue; } else if (counter < start_index) { crm_trace("Skipping %s/%s: old %d", id, node->details->uname, counter); continue; } interval_s = crm_element_value(rsc_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); if (interval == 0) { crm_trace("Skipping %s/%s: non-recurring", id, node->details->uname); continue; } status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS); if (safe_str_eq(status, "-1")) { crm_trace("Skipping %s/%s: status", id, node->details->uname); continue; } task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); /* create the action */ key = generate_op_key(rsc->id, task, interval); crm_trace("Creating %s/%s", key, node->details->uname); custom_action(rsc, key, task, node, TRUE, TRUE, data_set); } } void calculate_active_ops(GListPtr sorted_op_list, int *start_index, int *stop_index) { int counter = -1; int implied_monitor_start = -1; int implied_master_start = -1; const char *task = NULL; const char *status = NULL; GListPtr gIter = sorted_op_list; *stop_index = -1; *start_index = -1; for (; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; counter++; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS); if (safe_str_eq(task, CRMD_ACTION_STOP) && safe_str_eq(status, "0")) { *stop_index = counter; } else if (safe_str_eq(task, CRMD_ACTION_START) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { *start_index = counter; } else if ((implied_monitor_start <= *stop_index) && safe_str_eq(task, CRMD_ACTION_STATUS)) { const char *rc = crm_element_value(rsc_op, XML_LRM_ATTR_RC); if (safe_str_eq(rc, "0") || safe_str_eq(rc, "8")) { implied_monitor_start = counter; } } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE) || safe_str_eq(task, CRMD_ACTION_DEMOTE)) { implied_master_start = counter; } } if (*start_index == -1) { if (implied_master_start != -1) { *start_index = implied_master_start; } else if (implied_monitor_start != -1) { *start_index = implied_monitor_start; } } } static void unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data_set) { GListPtr gIter = NULL; int stop_index = -1; int start_index = -1; enum rsc_role_e req_role = RSC_ROLE_UNKNOWN; const char *task = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); resource_t *rsc = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; xmlNode *migrate_op = NULL; xmlNode *rsc_op = NULL; enum action_fail_response on_fail = FALSE; enum rsc_role_e saved_role = RSC_ROLE_UNKNOWN; crm_trace("[%s] Processing %s on %s", crm_element_name(rsc_entry), rsc_id, node->details->uname); /* extract operations */ op_list = NULL; sorted_op_list = NULL; for (rsc_op = __xml_first_child(rsc_entry); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) { op_list = g_list_prepend(op_list, rsc_op); } } if (op_list == NULL) { /* if there are no operations, there is nothing to do */ return; } /* find the resource */ rsc = unpack_find_resource(data_set, node, rsc_id, rsc_entry); if (rsc == NULL) { rsc = process_orphan_resource(rsc_entry, node, data_set); } CRM_ASSERT(rsc != NULL); /* process operations */ saved_role = rsc->role; on_fail = action_fail_ignore; rsc->role = RSC_ROLE_UNKNOWN; sorted_op_list = g_list_sort(op_list, sort_op_by_callid); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { migrate_op = rsc_op; } unpack_rsc_op(rsc, node, rsc_op, gIter->next, &on_fail, data_set); } /* create active recurring operations as optional */ calculate_active_ops(sorted_op_list, &start_index, &stop_index); process_recurring(node, rsc, start_index, stop_index, sorted_op_list, data_set); /* no need to free the contents */ g_list_free(sorted_op_list); process_rsc_state(rsc, node, on_fail, migrate_op, data_set); if (get_target_role(rsc, &req_role)) { if (rsc->next_role == RSC_ROLE_UNKNOWN || req_role < rsc->next_role) { crm_debug("%s: Overwriting calculated next role %s" " with requested next role %s", rsc->id, role2text(rsc->next_role), role2text(req_role)); rsc->next_role = req_role; } else if (req_role > rsc->next_role) { crm_info("%s: Not overwriting calculated next role %s" " with requested next role %s", rsc->id, role2text(rsc->next_role), role2text(req_role)); } } if (saved_role > rsc->role) { rsc->role = saved_role; } } gboolean unpack_lrm_resources(node_t * node, xmlNode * lrm_rsc_list, pe_working_set_t * data_set) { xmlNode *rsc_entry = NULL; CRM_CHECK(node != NULL, return FALSE); crm_trace("Unpacking resources on %s", node->details->uname); for (rsc_entry = __xml_first_child(lrm_rsc_list); rsc_entry != NULL; rsc_entry = __xml_next(rsc_entry)) { if (crm_str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, TRUE)) { unpack_lrm_rsc_state(node, rsc_entry, data_set); } } return TRUE; } static void set_active(resource_t * rsc) { resource_t *top = uber_parent(rsc); if (top && top->variant == pe_master) { rsc->role = RSC_ROLE_SLAVE; } else { rsc->role = RSC_ROLE_STARTED; } } static void set_node_score(gpointer key, gpointer value, gpointer user_data) { node_t *node = value; int *score = user_data; node->weight = *score; } #define STATUS_PATH_MAX 1024 static xmlNode * find_lrm_op(const char *resource, const char *op, const char *node, const char *source, pe_working_set_t * data_set) { int offset = 0; char xpath[STATUS_PATH_MAX]; offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//node_state[@uname='%s']", node); offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//" XML_LRM_TAG_RESOURCE "[@id='%s']", resource); /* Need to check against transition_magic too? */ if (source && safe_str_eq(op, CRMD_ACTION_MIGRATE)) { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_target='%s']", op, source); } else if (source && safe_str_eq(op, CRMD_ACTION_MIGRATED)) { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_source='%s']", op, source); } else { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s']", op); } return get_xpath_object(xpath, data_set->input, LOG_DEBUG); } gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, GListPtr next, enum action_fail_response * on_fail, pe_working_set_t * data_set) { int task_id = 0; const char *id = NULL; const char *key = NULL; const char *task = NULL; const char *task_key = NULL; const char *magic = NULL; const char *actual_rc = NULL; /* const char *target_rc = NULL; */ const char *task_status = NULL; const char *interval_s = NULL; const char *op_version = NULL; int interval = 0; int task_status_i = -2; int actual_rc_i = 0; int target_rc = -1; int last_failure = 0; action_t *action = NULL; node_t *effective_node = NULL; resource_t *failed = NULL; gboolean expired = FALSE; gboolean is_probe = FALSE; gboolean clear_past_failure = FALSE; CRM_CHECK(rsc != NULL, return FALSE); CRM_CHECK(node != NULL, return FALSE); CRM_CHECK(xml_op != NULL, return FALSE); id = ID(xml_op); task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); task_key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); task_status = crm_element_value(xml_op, XML_LRM_ATTR_OPSTATUS); op_version = crm_element_value(xml_op, XML_ATTR_CRM_VERSION); magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY); crm_element_value_int(xml_op, XML_LRM_ATTR_CALLID, &task_id); CRM_CHECK(id != NULL, return FALSE); CRM_CHECK(task != NULL, return FALSE); CRM_CHECK(task_status != NULL, return FALSE); task_status_i = crm_parse_int(task_status, NULL); CRM_CHECK(task_status_i <= PCMK_LRM_OP_ERROR, return FALSE); CRM_CHECK(task_status_i >= PCMK_LRM_OP_PENDING, return FALSE); if (safe_str_eq(task, CRMD_ACTION_NOTIFY)) { /* safe to ignore these */ return TRUE; } if (rsc->failure_timeout > 0) { int last_run = 0; if (crm_element_value_int(xml_op, "last-rc-change", &last_run) == 0) { time_t now = get_timet_now(data_set); if (now > (last_run + rsc->failure_timeout)) { expired = TRUE; } } } crm_trace("Unpacking task %s/%s (call_id=%d, status=%s) on %s (role=%s)", id, task, task_id, task_status, node->details->uname, role2text(rsc->role)); interval_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); if (interval == 0 && safe_str_eq(task, CRMD_ACTION_STATUS)) { is_probe = TRUE; } if (node->details->unclean) { crm_trace("Node %s (where %s is running) is unclean." " Further action depends on the value of the stop's on-fail attribue", node->details->uname, rsc->id); } actual_rc = crm_element_value(xml_op, XML_LRM_ATTR_RC); CRM_CHECK(actual_rc != NULL, return FALSE); actual_rc_i = crm_parse_int(actual_rc, NULL); if (key) { int dummy = 0; char *dummy_string = NULL; decode_transition_key(key, &dummy_string, &dummy, &dummy, &target_rc); free(dummy_string); } if (task_status_i == PCMK_LRM_OP_DONE && target_rc >= 0) { if (target_rc == actual_rc_i) { task_status_i = PCMK_LRM_OP_DONE; } else { task_status_i = PCMK_LRM_OP_ERROR; crm_debug("%s on %s returned %d (%s) instead of the expected value: %d (%s)", id, node->details->uname, actual_rc_i, lrmd_event_rc2str(actual_rc_i), target_rc, lrmd_event_rc2str(target_rc)); } } else if (task_status_i == PCMK_LRM_OP_ERROR) { /* let us decide that */ task_status_i = PCMK_LRM_OP_DONE; } if (task_status_i == PCMK_LRM_OP_NOTSUPPORTED) { actual_rc_i = PCMK_EXECRA_UNIMPLEMENT_FEATURE; } if (task_status_i != actual_rc_i && rsc->failure_timeout > 0 && get_failcount(node, rsc, &last_failure, data_set) == 0) { if (last_failure > 0) { action_t *clear_op = NULL; clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'), CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); crm_notice("Clearing expired failcount for %s on %s", rsc->id, node->details->uname); } } if (expired && actual_rc_i != PCMK_EXECRA_NOT_RUNNING && actual_rc_i != PCMK_EXECRA_RUNNING_MASTER && actual_rc_i != PCMK_EXECRA_OK) { crm_notice("Ignoring expired failure %s (rc=%d, magic=%s) on %s", id, actual_rc_i, magic, node->details->uname); goto done; } /* we could clean this up significantly except for old LRMs and CRMs that * didnt include target_rc and liked to remap status */ switch (actual_rc_i) { case PCMK_EXECRA_NOT_RUNNING: if (is_probe || target_rc == actual_rc_i) { task_status_i = PCMK_LRM_OP_DONE; rsc->role = RSC_ROLE_STOPPED; /* clear any previous failure actions */ *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; } else if (safe_str_neq(task, CRMD_ACTION_STOP)) { task_status_i = PCMK_LRM_OP_ERROR; } break; case PCMK_EXECRA_RUNNING_MASTER: if (is_probe) { task_status_i = PCMK_LRM_OP_DONE; crm_notice("Operation %s found resource %s active in master mode on %s", task, rsc->id, node->details->uname); } else if (target_rc == actual_rc_i) { /* nothing to do */ } else if (target_rc >= 0) { task_status_i = PCMK_LRM_OP_ERROR; /* legacy code for pre-0.6.5 operations */ } else if (safe_str_neq(task, CRMD_ACTION_STATUS) || rsc->role != RSC_ROLE_MASTER) { task_status_i = PCMK_LRM_OP_ERROR; if (rsc->role != RSC_ROLE_MASTER) { crm_err("%s reported %s in master mode on %s", id, rsc->id, node->details->uname); } } rsc->role = RSC_ROLE_MASTER; break; case PCMK_EXECRA_FAILED_MASTER: rsc->role = RSC_ROLE_MASTER; task_status_i = PCMK_LRM_OP_ERROR; break; case PCMK_EXECRA_UNIMPLEMENT_FEATURE: if (interval > 0) { task_status_i = PCMK_LRM_OP_NOTSUPPORTED; break; } /* else: fall through */ case PCMK_EXECRA_INSUFFICIENT_PRIV: case PCMK_EXECRA_NOT_INSTALLED: case PCMK_EXECRA_INVALID_PARAM: effective_node = node; /* fall through */ case PCMK_EXECRA_NOT_CONFIGURED: failed = rsc; if (is_not_set(rsc->flags, pe_rsc_unique)) { failed = uber_parent(failed); } do_crm_log(actual_rc_i == PCMK_EXECRA_NOT_INSTALLED ? LOG_NOTICE : LOG_ERR, "Preventing %s from re-starting %s %s: operation %s failed '%s' (rc=%d)", failed->id, effective_node ? "on" : "anywhere in the cluster", effective_node ? effective_node->details->uname : "", task, lrmd_event_rc2str(actual_rc_i), actual_rc_i); resource_location(failed, effective_node, -INFINITY, "hard-error", data_set); if (is_probe) { /* treat these like stops */ task = CRMD_ACTION_STOP; task_status_i = PCMK_LRM_OP_DONE; crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); if (actual_rc_i != PCMK_EXECRA_NOT_INSTALLED || is_set(data_set->flags, pe_flag_symmetric_cluster)) { if ((node->details->shutdown == FALSE) || (node->details->online == TRUE)) { add_node_copy(data_set->failed, xml_op); } } } break; case PCMK_EXECRA_OK: if (is_probe && target_rc == 7) { task_status_i = PCMK_LRM_OP_DONE; crm_info("Operation %s found resource %s active on %s", task, rsc->id, node->details->uname); /* legacy code for pre-0.6.5 operations */ } else if (target_rc < 0 && interval > 0 && rsc->role == RSC_ROLE_MASTER) { /* catch status ops that return 0 instead of 8 while they * are supposed to be in master mode */ task_status_i = PCMK_LRM_OP_ERROR; } break; default: if (task_status_i == PCMK_LRM_OP_DONE) { crm_info("Remapping %s (rc=%d) on %s to an ERROR", id, actual_rc_i, node->details->uname); task_status_i = PCMK_LRM_OP_ERROR; } } if (task_status_i == PCMK_LRM_OP_ERROR || task_status_i == PCMK_LRM_OP_TIMEOUT || task_status_i == PCMK_LRM_OP_NOTSUPPORTED) { const char *action_key = task_key ? task_key : id; action = custom_action(rsc, strdup(action_key), task, NULL, TRUE, FALSE, data_set); if (expired) { crm_notice("Ignoring expired failure (calculated) %s (rc=%d, magic=%s) on %s", id, actual_rc_i, magic, node->details->uname); goto done; } else if (action->on_fail == action_fail_ignore) { crm_warn("Remapping %s (rc=%d) on %s to DONE: ignore", id, actual_rc_i, node->details->uname); task_status_i = PCMK_LRM_OP_DONE; set_bit(rsc->flags, pe_rsc_failure_ignored); crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); if ((node->details->shutdown == FALSE) || (node->details->online == TRUE)) { add_node_copy(data_set->failed, xml_op); } } } switch (task_status_i) { case PCMK_LRM_OP_PENDING: if (safe_str_eq(task, CRMD_ACTION_START)) { set_bit(rsc->flags, pe_rsc_start_pending); set_active(rsc); } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; } /* * Intentionally ignoring pending migrate ops here; * haven't decided if we need to do anything special * with them yet... */ break; case PCMK_LRM_OP_DONE: crm_trace("%s/%s completed on %s", rsc->id, task, node->details->uname); if (actual_rc_i == PCMK_EXECRA_NOT_RUNNING) { clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_START)) { rsc->role = RSC_ROLE_STARTED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { rsc->role = RSC_ROLE_STOPPED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { /* Demote from Master does not clear an error */ rsc->role = RSC_ROLE_SLAVE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { rsc->role = RSC_ROLE_STARTED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) { /* * The normal sequence is (now): migrate_to(Src) -> migrate_from(Tgt) -> stop(Src) * * So if a migrate_to is followed by a stop, then we dont need to care what * happended on the target node * * Without the stop, we need to look for a successful migrate_from. * This would also imply we're no longer running on the source * * Without the stop, and without a migrate_from op we make sure the resource * gets stopped on both source and target (assuming the target is up) * */ int stop_id = 0; xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, node->details->id, NULL, data_set); if (stop_op) { crm_element_value_int(stop_op, XML_LRM_ATTR_CALLID, &stop_id); } if (stop_op == NULL || stop_id < task_id) { int from_rc = 0, from_status = 0; const char *migrate_source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); node_t *target = pe_find_node(data_set->nodes, migrate_target); node_t *source = pe_find_node(data_set->nodes, migrate_source); xmlNode *migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, migrate_target, migrate_source, data_set); rsc->role = RSC_ROLE_STARTED; /* can be master? */ if (migrate_from) { crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc); crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS, &from_status); crm_trace("%s op on %s exited with status=%d, rc=%d", ID(migrate_from), migrate_target, from_status, from_rc); } if (migrate_from && from_rc == PCMK_EXECRA_OK && from_status == PCMK_LRM_OP_DONE) { crm_trace("Detected dangling migration op: %s on %s", ID(xml_op), migrate_source); /* all good * just need to arrange for the stop action to get sent * but _without_ affecting the target somehow */ rsc->role = RSC_ROLE_STOPPED; rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); } else if (migrate_from) { /* Failed */ crm_trace("Marking active on %s %p %d", migrate_target, target, target->details->online); if (target && target->details->online) { native_add_running(rsc, target, data_set); } } else { /* Pending or complete but erased */ node_t *target = pe_find_node_id(data_set->nodes, migrate_target); crm_trace("Marking active on %s %p %d", migrate_target, target, target->details->online); if (target && target->details->online) { native_add_running(rsc, target, data_set); if (source && source->details->online) { /* If we make it here we have a partial migration. The migrate_to * has completed but the migrate_from on the target has not. Hold on * to the target and source on the resource. Later on if we detect that * the resource is still going to run on that target, we may continue * the migration */ rsc->partial_migration_target = target; rsc->partial_migration_source = source; } } else { /* Consider it failed here - forces a restart, prevents migration */ set_bit(rsc->flags, pe_rsc_failed); } } } } else if (rsc->role < RSC_ROLE_STARTED) { /* start, migrate_to and migrate_from will land here */ crm_trace("%s active on %s", rsc->id, node->details->uname); set_active(rsc); } /* clear any previous failure actions */ if (clear_past_failure) { switch (*on_fail) { case action_fail_block: case action_fail_stop: case action_fail_fence: case action_fail_migrate: case action_fail_standby: crm_trace("%s.%s is not cleared by a completed stop", rsc->id, fail2text(*on_fail)); break; case action_fail_ignore: case action_fail_recover: *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; } } break; case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_TIMEOUT: case PCMK_LRM_OP_NOTSUPPORTED: crm_warn("Processing failed op %s on %s: %s (%d)", id, node->details->uname, lrmd_event_rc2str(actual_rc_i), actual_rc_i); crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); if ((node->details->shutdown == FALSE) || (node->details->online == TRUE)) { add_node_copy(data_set->failed, xml_op); } if (*on_fail < action->on_fail) { *on_fail = action->on_fail; } if (safe_str_eq(task, CRMD_ACTION_STOP)) { resource_location(rsc, node, -INFINITY, "__stop_fail__", data_set); } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { int stop_id = 0; int migrate_id = 0; const char *migrate_source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, migrate_source, NULL, data_set); xmlNode *migrate_op = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATE, migrate_source, migrate_target, data_set); if (stop_op) { crm_element_value_int(stop_op, XML_LRM_ATTR_CALLID, &stop_id); } if (migrate_op) { crm_element_value_int(migrate_op, XML_LRM_ATTR_CALLID, &migrate_id); } /* Get our state right */ rsc->role = RSC_ROLE_STARTED; /* can be master? */ if (stop_op == NULL || stop_id < migrate_id) { node_t *source = pe_find_node(data_set->nodes, migrate_source); if (source && source->details->online) { native_add_running(rsc, source, data_set); } } } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) { int stop_id = 0; int migrate_id = 0; const char *migrate_source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, migrate_target, NULL, data_set); xmlNode *migrate_op = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, migrate_target, migrate_source, data_set); if (stop_op) { crm_element_value_int(stop_op, XML_LRM_ATTR_CALLID, &stop_id); } if (migrate_op) { crm_element_value_int(migrate_op, XML_LRM_ATTR_CALLID, &migrate_id); } /* Get our state right */ rsc->role = RSC_ROLE_STARTED; /* can be master? */ if (stop_op == NULL || stop_id < migrate_id) { node_t *target = pe_find_node(data_set->nodes, migrate_target); crm_trace("Stop: %p %d, Migrated: %p %d", stop_op, stop_id, migrate_op, migrate_id); if (target && target->details->online) { native_add_running(rsc, target, data_set); } } else if (migrate_op == NULL) { /* Make sure it gets cleaned up, the stop may pre-date the migrate_from */ rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); } } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { /* * staying in role=master ends up putting the PE/TE into a loop * setting role=slave is not dangerous because no master will be * promoted until the failed resource has been fully stopped */ crm_warn("Forcing %s to stop after a failed demote action", rsc->id); rsc->next_role = RSC_ROLE_STOPPED; rsc->role = RSC_ROLE_SLAVE; } else if (compare_version("2.0", op_version) > 0 && safe_str_eq(task, CRMD_ACTION_START)) { crm_warn("Compatibility handling for failed op %s on %s", id, node->details->uname); resource_location(rsc, node, -INFINITY, "__legacy_start__", data_set); } if (rsc->role < RSC_ROLE_STARTED) { set_active(rsc); } crm_trace("Resource %s: role=%s, unclean=%s, on_fail=%s, fail_role=%s", rsc->id, role2text(rsc->role), node->details->unclean ? "true" : "false", fail2text(action->on_fail), role2text(action->fail_role)); if (action->fail_role != RSC_ROLE_STARTED && rsc->next_role < action->fail_role) { rsc->next_role = action->fail_role; } if (action->fail_role == RSC_ROLE_STOPPED) { int score = -INFINITY; crm_err("Making sure %s doesn't come up again", rsc->id); /* make sure it doesnt come up again */ g_hash_table_destroy(rsc->allowed_nodes); rsc->allowed_nodes = node_hash_from_list(data_set->nodes); g_hash_table_foreach(rsc->allowed_nodes, set_node_score, &score); } pe_free_action(action); action = NULL; break; case PCMK_LRM_OP_CANCELLED: /* do nothing?? */ pe_err("Dont know what to do for cancelled ops yet"); break; } done: crm_trace("Resource %s after %s: role=%s", rsc->id, task, role2text(rsc->role)); pe_free_action(action); return TRUE; } gboolean add_node_attrs(xmlNode * xml_obj, node_t * node, gboolean overwrite, pe_working_set_t * data_set) { g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_UNAME), strdup(node->details->uname)); g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_ID), strdup(node->details->id)); if (safe_str_eq(node->details->id, data_set->dc_uuid)) { data_set->dc_node = node; node->details->is_dc = TRUE; g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_DC), strdup(XML_BOOLEAN_TRUE)); } else { g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_DC), strdup(XML_BOOLEAN_FALSE)); } unpack_instance_attributes(data_set->input, xml_obj, XML_TAG_ATTR_SETS, NULL, node->details->attrs, NULL, overwrite, data_set->now); return TRUE; } static GListPtr extract_operations(const char *node, const char *rsc, xmlNode * rsc_entry, gboolean active_filter) { int counter = -1; int stop_index = -1; int start_index = -1; xmlNode *rsc_op = NULL; GListPtr gIter = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; /* extract operations */ op_list = NULL; sorted_op_list = NULL; for (rsc_op = __xml_first_child(rsc_entry); rsc_op != NULL; rsc_op = __xml_next(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) { crm_xml_add(rsc_op, "resource", rsc); crm_xml_add(rsc_op, XML_ATTR_UNAME, node); op_list = g_list_prepend(op_list, rsc_op); } } if (op_list == NULL) { /* if there are no operations, there is nothing to do */ return NULL; } sorted_op_list = g_list_sort(op_list, sort_op_by_callid); /* create active recurring operations as optional */ if (active_filter == FALSE) { return sorted_op_list; } op_list = NULL; calculate_active_ops(sorted_op_list, &start_index, &stop_index); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; counter++; if (start_index < stop_index) { crm_trace("Skipping %s: not active", ID(rsc_entry)); break; } else if (counter < start_index) { crm_trace("Skipping %s: old", ID(rsc_op)); continue; } op_list = g_list_append(op_list, rsc_op); } g_list_free(sorted_op_list); return op_list; } GListPtr find_operations(const char *rsc, const char *node, gboolean active_filter, pe_working_set_t * data_set) { GListPtr output = NULL; GListPtr intermediate = NULL; xmlNode *tmp = NULL; xmlNode *status = find_xml_node(data_set->input, XML_CIB_TAG_STATUS, TRUE); node_t *this_node = NULL; xmlNode *node_state = NULL; for (node_state = __xml_first_child(status); node_state != NULL; node_state = __xml_next(node_state)) { if (crm_str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, TRUE)) { const char *uname = crm_element_value(node_state, XML_ATTR_UNAME); if (node != NULL && safe_str_neq(uname, node)) { continue; } this_node = pe_find_node(data_set->nodes, uname); CRM_CHECK(this_node != NULL, continue); determine_online_status(node_state, this_node, data_set); if (this_node->details->online || is_set(data_set->flags, pe_flag_stonith_enabled)) { /* offline nodes run no resources... * unless stonith is enabled in which case we need to * make sure rsc start events happen after the stonith */ xmlNode *lrm_rsc = NULL; tmp = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); tmp = find_xml_node(tmp, XML_LRM_TAG_RESOURCES, FALSE); for (lrm_rsc = __xml_first_child(tmp); lrm_rsc != NULL; lrm_rsc = __xml_next(lrm_rsc)) { if (crm_str_eq((const char *)lrm_rsc->name, XML_LRM_TAG_RESOURCE, TRUE)) { const char *rsc_id = crm_element_value(lrm_rsc, XML_ATTR_ID); if (rsc != NULL && safe_str_neq(rsc_id, rsc)) { continue; } intermediate = extract_operations(uname, rsc_id, lrm_rsc, active_filter); output = g_list_concat(output, intermediate); } } } } } return output; }