diff --git a/crm/cib/main.c b/crm/cib/main.c index 0d41590e67..774ed77140 100644 --- a/crm/cib/main.c +++ b/crm/cib/main.c @@ -1,621 +1,621 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* #include */ #include /* #include */ #include #include #include #include #include #include #include #include #include #if HAVE_LIBXML2 # include #endif #ifdef HAVE_GETOPT_H # include #endif extern int init_remote_listener(int port); extern gboolean ccm_connect(void); gboolean cib_shutdown_flag = FALSE; gboolean stand_alone = FALSE; gboolean per_action_cib = FALSE; enum cib_errors cib_status = cib_ok; extern char *ccm_transition_id; extern void oc_ev_special(const oc_ev_t *, oc_ev_class_t , int ); GMainLoop* mainloop = NULL; const char* crm_system_name = CRM_SYSTEM_CIB; const char* cib_root = WORKING_DIR; char *cib_our_uname = NULL; oc_ev_t *cib_ev_token; gboolean preserve_status = FALSE; gboolean cib_writes_enabled = TRUE; void usage(const char* cmd, int exit_status); int cib_init(void); gboolean cib_register_ha(ll_cluster_t *hb_cluster, const char *client_name); gboolean cib_shutdown(int nsig, gpointer unused); void cib_ha_connection_destroy(gpointer 
user_data); gboolean startCib(const char *filename); extern gboolean cib_msg_timeout(gpointer data); extern int write_cib_contents(gpointer p); GHashTable *client_list = NULL; GHashTable *ccm_membership = NULL; GHashTable *peer_hash = NULL; ll_cluster_t *hb_conn = NULL; GTRIGSource *cib_writer = NULL; char *channel1 = NULL; char *channel2 = NULL; char *channel3 = NULL; char *channel4 = NULL; char *channel5 = NULL; #define OPTARGS "aswr:V?" void cib_cleanup(void); static void cib_diskwrite_complete(gpointer userdata, int status, int signo, int exitcode) { if(exitcode != LSB_EXIT_OK || signo != 0 || status != 0) { crm_err("Disk write failed: status=%d, signo=%d, exitcode=%d", status, signo, exitcode); if(cib_writes_enabled) { crm_err("Disabling disk writes after write failure"); cib_writes_enabled = FALSE; } } else { crm_debug_2("Disk write passed"); } } int main(int argc, char ** argv) { int flag; int rc = 0; int argerr = 0; #ifdef HAVE_GETOPT_H int option_index = 0; static struct option long_options[] = { {"per-action-cib", 0, 0, 'a'}, {"stand-alone", 0, 0, 's'}, {"disk-writes", 0, 0, 'w'}, {"cib-root", 1, 0, 'r'}, {"verbose", 0, 0, 'V'}, {"help", 0, 0, '?'}, {0, 0, 0, 0} }; #endif crm_log_init(crm_system_name, LOG_INFO, TRUE, FALSE, 0, NULL); G_main_add_SignalHandler( G_PRIORITY_HIGH, SIGTERM, cib_shutdown, NULL, NULL); cib_writer = G_main_add_tempproc_trigger( G_PRIORITY_LOW, write_cib_contents, "write_cib_contents", NULL, NULL, NULL, cib_diskwrite_complete); EnableProcLogging(); - set_sigchld_proctrack(G_PRIORITY_HIGH); + set_sigchld_proctrack(G_PRIORITY_HIGH,DEFAULT_MAXDISPATCHTIME); client_list = g_hash_table_new(g_str_hash, g_str_equal); ccm_membership = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, NULL); peer_hash = g_hash_table_new_full( g_str_hash, g_str_equal,g_hash_destroy_str, g_hash_destroy_str); while (1) { #ifdef HAVE_GETOPT_H flag = getopt_long(argc, argv, OPTARGS, long_options, &option_index); #else flag = getopt(argc, argv, 
OPTARGS); #endif if (flag == -1) break; switch(flag) { case 'V': alter_debug(DEBUG_INC); break; case 's': stand_alone = TRUE; preserve_status = TRUE; cib_writes_enabled = FALSE; cl_log_enable_stderr(1); break; case '?': /* Help message */ usage(crm_system_name, LSB_EXIT_OK); break; case 'f': per_action_cib = TRUE; break; case 'w': cib_writes_enabled = TRUE; break; case 'r': cib_root = optarg; break; default: ++argerr; break; } } crm_info("Retrieval of a per-action CIB: %s", per_action_cib?"enabled":"disabled"); if (optind > argc) { ++argerr; } if (argerr) { usage(crm_system_name,LSB_EXIT_GENERIC); } /* read local config file */ rc = cib_init(); CRM_CHECK(g_hash_table_size(client_list) == 0, crm_warn("Not all clients gone at exit")); cib_cleanup(); if(hb_conn) { hb_conn->llc_ops->delete(hb_conn); } crm_info("Done"); return rc; } void cib_cleanup(void) { g_hash_table_destroy(ccm_membership); g_hash_table_destroy(client_list); g_hash_table_destroy(peer_hash); crm_free(ccm_transition_id); crm_free(cib_our_uname); #if HAVE_LIBXML2 xmlCleanupParser(); #endif crm_free(channel1); crm_free(channel2); crm_free(channel3); crm_free(channel4); crm_free(channel5); } unsigned long cib_num_ops = 0; const char *cib_stat_interval = "10min"; unsigned long cib_num_local = 0, cib_num_updates = 0, cib_num_fail = 0; unsigned long cib_bad_connects = 0, cib_num_timeouts = 0; longclock_t cib_call_time = 0; gboolean cib_stats(gpointer data); gboolean cib_stats(gpointer data) { int local_log_level = LOG_DEBUG; static unsigned long last_stat = 0; unsigned int cib_calls_ms = 0; static unsigned long cib_stat_interval_ms = 0; if(cib_stat_interval_ms == 0) { cib_stat_interval_ms = crm_get_msec(cib_stat_interval); } cib_calls_ms = longclockto_ms(cib_call_time); if((cib_num_ops - last_stat) > 0) { unsigned long calls_diff = cib_num_ops - last_stat; double stat_1 = (1000*cib_calls_ms)/calls_diff; local_log_level = LOG_INFO; do_crm_log(local_log_level, "Processed %lu operations" " (%.2fus average, 
%lu%% utilization) in the last %s", calls_diff, stat_1, (100*cib_calls_ms)/cib_stat_interval_ms, cib_stat_interval); } do_crm_log(local_log_level+1, "\tDetail: %lu operations (%ums total)" " (%lu local, %lu updates, %lu failures," " %lu timeouts, %lu bad connects)", cib_num_ops, cib_calls_ms, cib_num_local, cib_num_updates, cib_num_fail, cib_bad_connects, cib_num_timeouts); last_stat = cib_num_ops; cib_call_time = 0; return TRUE; } static void ccm_connection_destroy(gpointer user_data) { crm_err("CCM connection failed... blocking while we reconnect"); CRM_ASSERT(ccm_connect()); return; } extern int current_instance; gboolean ccm_connect(void) { gboolean did_fail = TRUE; int num_ccm_fails = 0; int max_ccm_fails = 30; int ret; int cib_ev_fd; while(did_fail) { did_fail = FALSE; crm_info("Registering with CCM..."); ret = oc_ev_register(&cib_ev_token); if (ret != 0) { did_fail = TRUE; } if(did_fail == FALSE) { crm_debug_3("Setting up CCM callbacks"); ret = oc_ev_set_callback( cib_ev_token, OC_EV_MEMB_CLASS, cib_ccm_msg_callback, NULL); if (ret != 0) { crm_warn("CCM callback not set"); did_fail = TRUE; } } if(did_fail == FALSE) { oc_ev_special(cib_ev_token, OC_EV_MEMB_CLASS, 0); crm_debug_3("Activating CCM token"); ret = oc_ev_activate(cib_ev_token, &cib_ev_fd); if (ret != 0){ crm_warn("CCM Activation failed"); did_fail = TRUE; } } if(did_fail) { num_ccm_fails++; oc_ev_unregister(cib_ev_token); if(num_ccm_fails < max_ccm_fails){ crm_warn("CCM Connection failed %d times (%d max)", num_ccm_fails, max_ccm_fails); sleep(1); } else { crm_err("CCM Activation failed %d (max) times", num_ccm_fails); return FALSE; } } } current_instance = 0; crm_debug("CCM Activation passed... all set to go!"); G_main_add_fd(G_PRIORITY_HIGH, cib_ev_fd, FALSE, cib_ccm_dispatch, cib_ev_token, ccm_connection_destroy); return TRUE; } int cib_init(void) { gboolean was_error = FALSE; if(startCib("cib.xml") == FALSE){ crm_crit("Cannot start CIB... 
terminating"); exit(1); } if(stand_alone == FALSE) { hb_conn = ll_cluster_new("heartbeat"); if(cib_register_ha(hb_conn, CRM_SYSTEM_CIB) == FALSE) { crm_crit("Cannot sign in to heartbeat... terminating"); exit(1); } } else { cib_our_uname = crm_strdup("localhost"); } channel1 = crm_strdup(cib_channel_callback); was_error = init_server_ipc_comms( channel1, cib_client_connect_null, default_ipc_connection_destroy); channel2 = crm_strdup(cib_channel_ro); was_error = was_error || init_server_ipc_comms( channel2, cib_client_connect_rw_ro, default_ipc_connection_destroy); channel3 = crm_strdup(cib_channel_rw); was_error = was_error || init_server_ipc_comms( channel3, cib_client_connect_rw_ro, default_ipc_connection_destroy); channel4 = crm_strdup(cib_channel_rw_synchronous); was_error = was_error || init_server_ipc_comms( channel4, cib_client_connect_rw_synch, default_ipc_connection_destroy); channel5 = crm_strdup(cib_channel_ro_synchronous); was_error = was_error || init_server_ipc_comms( channel5, cib_client_connect_ro_synch, default_ipc_connection_destroy); if(stand_alone) { if(was_error) { crm_err("Couldnt start"); return 1; } cib_is_master = TRUE; /* Create the mainloop and run it... 
*/ mainloop = g_main_new(FALSE); crm_info("Starting %s mainloop", crm_system_name); /* Gmain_timeout_add(crm_get_msec("10s"), cib_msg_timeout, NULL); */ /* Gmain_timeout_add( */ /* crm_get_msec(cib_stat_interval), cib_stats, NULL); */ g_main_run(mainloop); return_to_orig_privs(); return 0; } if(was_error == FALSE) { crm_debug_3("Be informed of CRM Client Status changes"); if (HA_OK != hb_conn->llc_ops->set_cstatus_callback( hb_conn, cib_client_status_callback, hb_conn)) { crm_err("Cannot set cstatus callback: %s", hb_conn->llc_ops->errmsg(hb_conn)); was_error = TRUE; } else { crm_debug_3("Client Status callback set"); } } if(was_error == FALSE) { was_error = (ccm_connect() == FALSE); } if(was_error == FALSE) { /* Async get client status information in the cluster */ crm_debug_3("Requesting an initial dump of CIB client_status"); hb_conn->llc_ops->client_status( hb_conn, NULL, CRM_SYSTEM_CIB, -1); /* Create the mainloop and run it... */ mainloop = g_main_new(FALSE); crm_info("Starting %s mainloop", crm_system_name); Gmain_timeout_add(crm_get_msec("10s"), cib_msg_timeout, NULL); Gmain_timeout_add( crm_get_msec(cib_stat_interval), cib_stats, NULL); g_main_run(mainloop); return_to_orig_privs(); } else { crm_err("Couldnt start all communication channels, exiting."); } return 0; } void usage(const char* cmd, int exit_status) { FILE* stream; stream = exit_status ? stderr : stdout; fprintf(stream, "usage: %s [-%s]\n", cmd, OPTARGS); fprintf(stream, "\t--%s (-%c)\t\tTurn on debug info." 
" Additional instances increase verbosity\n", "verbose", 'V'); fprintf(stream, "\t--%s (-%c)\t\tThis help message\n", "help", '?'); fprintf(stream, "\t--%s (-%c)\tAdvanced use only\n", "per-action-cib", 'a'); fprintf(stream, "\t--%s (-%c)\tAdvanced use only\n", "stand-alone", 's'); fprintf(stream, "\t--%s (-%c)\tAdvanced use only\n", "disk-writes", 'w'); fprintf(stream, "\t--%s (-%c)\t\tAdvanced use only\n", "cib-root", 'r'); fflush(stream); exit(exit_status); } gboolean cib_register_ha(ll_cluster_t *hb_cluster, const char *client_name) { const char *uname = NULL; crm_info("Signing in with Heartbeat"); if (hb_cluster->llc_ops->signon(hb_cluster, client_name)!= HA_OK) { crm_err("Cannot sign on with heartbeat: %s", hb_cluster->llc_ops->errmsg(hb_cluster)); return FALSE; } crm_debug_3("Be informed of CIB messages"); if (HA_OK != hb_cluster->llc_ops->set_msg_callback( hb_cluster, T_CIB, cib_peer_callback, hb_cluster)){ crm_err("Cannot set msg callback: %s", hb_cluster->llc_ops->errmsg(hb_cluster)); return FALSE; } crm_debug_3("Finding our node name"); if ((uname = hb_cluster->llc_ops->get_mynodeid(hb_cluster)) == NULL) { crm_err("get_mynodeid() failed"); return FALSE; } cib_our_uname = crm_strdup(uname); crm_info("FSA Hostname: %s", cib_our_uname); crm_debug_3("Adding channel to mainloop"); G_main_add_IPC_Channel( G_PRIORITY_DEFAULT, hb_cluster->llc_ops->ipcchan(hb_cluster), FALSE, cib_ha_dispatch, hb_cluster /* userdata */, cib_ha_connection_destroy); return TRUE; } void cib_ha_connection_destroy(gpointer user_data) { if(cib_shutdown_flag) { crm_info("Heartbeat disconnection complete... exiting"); } else { crm_err("Heartbeat connection lost! Exiting."); } uninitializeCib(); if (mainloop != NULL && g_main_is_running(mainloop)) { g_main_quit(mainloop); } else { exit(LSB_EXIT_OK); } } static void disconnect_cib_client(gpointer key, gpointer value, gpointer user_data) { cib_client_t *a_client = value; crm_debug_2("Processing client %s/%s... 
send=%d, recv=%d", crm_str(a_client->name), crm_str(a_client->channel_name), (int)a_client->channel->send_queue->current_qlen, (int)a_client->channel->recv_queue->current_qlen); if(a_client->channel->ch_status == IPC_CONNECT) { a_client->channel->ops->resume_io(a_client->channel); if(a_client->channel->send_queue->current_qlen != 0 || a_client->channel->recv_queue->current_qlen != 0) { crm_info("Flushed messages to/from %s/%s... send=%d, recv=%d", crm_str(a_client->name), crm_str(a_client->channel_name), (int)a_client->channel->send_queue->current_qlen, (int)a_client->channel->recv_queue->current_qlen); } } if(a_client->channel->ch_status == IPC_CONNECT) { crm_warn("Disconnecting %s/%s...", crm_str(a_client->name), crm_str(a_client->channel_name)); a_client->channel->ops->disconnect(a_client->channel); } } extern gboolean cib_process_disconnect( IPC_Channel *channel, cib_client_t *cib_client); gboolean cib_shutdown(int nsig, gpointer unused) { if(cib_shutdown_flag == FALSE) { cib_shutdown_flag = TRUE; crm_debug("Disconnecting %d clients", g_hash_table_size(client_list)); g_hash_table_foreach(client_list, disconnect_cib_client, NULL); crm_info("Disconnected %d clients", g_hash_table_size(client_list)); cib_process_disconnect(NULL, NULL); } else { crm_info("Waiting for %d clients to disconnect...", g_hash_table_size(client_list)); } return TRUE; } gboolean startCib(const char *filename) { gboolean active = FALSE; crm_data_t *cib = readCibXmlFile(cib_root, filename, !preserve_status); CRM_ASSERT(cib != NULL); if(activateCibXml(cib, TRUE) == 0) { int port = 0; active = TRUE; ha_msg_value_int(cib, "remote_access_port", &port); init_remote_listener(port); crm_info("CIB Initialization completed successfully"); if(per_action_cib) { uninitializeCib(); } } return active; } diff --git a/crm/crmd/control.c b/crm/crmd/control.c index d314765a77..ebe3e633a8 100644 --- a/crm/crmd/control.c +++ b/crm/crmd/control.c @@ -1,907 +1,907 @@ /* * Copyright (C) 2004 Andrew Beekhof * * 
This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *ipc_server = NULL; extern void crmd_ha_connection_destroy(gpointer user_data); gboolean crm_shutdown(int nsig, gpointer unused); gboolean register_with_ha(ll_cluster_t *hb_cluster, const char *client_name); void populate_cib_nodes(ll_cluster_t *hb_cluster, gboolean with_client_status); GHashTable *ipc_clients = NULL; GTRIGSource *fsa_source = NULL; /* A_HA_CONNECT */ enum crmd_fsa_input do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { gboolean registered = FALSE; if(action & A_HA_DISCONNECT) { if(fsa_cluster_conn != NULL) { set_bit_inplace(fsa_input_register, R_HA_DISCONNECTED); fsa_cluster_conn->llc_ops->signoff( fsa_cluster_conn, FALSE); } crm_info("Disconnected from Heartbeat"); } if(action & A_HA_CONNECT) { if(fsa_cluster_conn == NULL) { fsa_cluster_conn = ll_cluster_new("heartbeat"); } /* make sure we are disconnected first */ fsa_cluster_conn->llc_ops->signoff(fsa_cluster_conn, FALSE); registered = register_with_ha( fsa_cluster_conn, crm_system_name); if(registered == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); 
return I_NULL; } clear_bit_inplace(fsa_input_register, R_HA_DISCONNECTED); crm_info("Connected to Heartbeat"); } if(action & ~(A_HA_CONNECT|A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } return I_NULL; } /* A_SHUTDOWN */ enum crmd_fsa_input do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int lpc = 0; gboolean continue_shutdown = TRUE; struct crm_subsystem_s *subsystems[] = { pe_subsystem, te_subsystem }; /* just in case */ set_bit_inplace(fsa_input_register, R_SHUTDOWN); for(lpc = 0; lpc < DIMOF(subsystems); lpc++) { struct crm_subsystem_s *a_subsystem = subsystems[lpc]; if(is_set(fsa_input_register, a_subsystem->flag_connected)) { crm_info("Terminating the %s", a_subsystem->name); if(stop_subsystem(a_subsystem, TRUE) == FALSE) { /* its gone... */ crm_err("Faking %s exit", a_subsystem->name); clear_bit_inplace(fsa_input_register, a_subsystem->flag_connected); } continue_shutdown = FALSE; } } if(continue_shutdown == FALSE) { crm_info("Waiting for subsystems to exit"); crmd_fsa_stall(NULL); } return I_NULL; } /* A_SHUTDOWN_REQ */ enum crmd_fsa_input do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { HA_Message *msg = NULL; crm_info("Sending shutdown request to DC: %s", crm_str(fsa_our_dc)); msg = create_request( CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); /* set_bit_inplace(fsa_input_register, R_STAYDOWN); */ if(send_request(msg, NULL) == FALSE) { if(AM_I_DC) { crm_info("Processing shutdown locally"); } else { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } return I_NULL; } extern char *max_generation_from; extern crm_data_t *max_generation_xml; extern GHashTable *meta_hash; extern GHashTable *resources; extern GHashTable *voted; void log_connected_client(gpointer key, gpointer 
value, gpointer user_data); void log_connected_client(gpointer key, gpointer value, gpointer user_data) { crmd_client_t *client = value; crm_err("%s is still connected at exit", client->table_key); } static void free_mem(fsa_data_t *msg_data) { if(fsa_cluster_conn) { fsa_cluster_conn->llc_ops->delete(fsa_cluster_conn); fsa_cluster_conn = NULL; } slist_destroy(fsa_data_t, fsa_data, fsa_message_queue, crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); ); delete_fsa_input(msg_data); if(ipc_clients) { crm_debug("Number of connected clients: %d", g_hash_table_size(ipc_clients)); /* g_hash_table_foreach(ipc_clients, log_connected_client, NULL); */ g_hash_table_destroy(ipc_clients); } empty_uuid_cache(); free_ccm_cache(fsa_membership_copy); if(te_subsystem->client && te_subsystem->client->client_source) { crm_debug("Full destroy: TE"); G_main_del_IPC_Channel(te_subsystem->client->client_source); } else { crm_debug("Partial destroy: TE"); crmd_ipc_connection_destroy(te_subsystem->client); } crm_free(te_subsystem); if(pe_subsystem->client && pe_subsystem->client->client_source) { crm_debug("Full destroy: PE"); G_main_del_IPC_Channel(pe_subsystem->client->client_source); } else { crm_debug("Partial destroy: PE"); crmd_ipc_connection_destroy(pe_subsystem->client); } crm_free(pe_subsystem); crm_free(cib_subsystem); if(integrated_nodes) { g_hash_table_destroy(integrated_nodes); } if(finalized_nodes) { g_hash_table_destroy(finalized_nodes); } if(confirmed_nodes) { g_hash_table_destroy(confirmed_nodes); } if(crmd_peer_state) { g_hash_table_destroy(crmd_peer_state); } if(meta_hash) { g_hash_table_destroy(meta_hash); } if(resources) { g_hash_table_destroy(resources); } if(voted) { g_hash_table_destroy(voted); } cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; if(fsa_lrm_conn) { fsa_lrm_conn->lrm_ops->delete(fsa_lrm_conn); } 
crm_free(integration_timer); crm_free(finalization_timer); crm_free(election_trigger); crm_free(election_timeout); crm_free(shutdown_escalation_timer); crm_free(wait_timer); crm_free(recheck_timer); crm_free(fsa_our_dc_version); crm_free(fsa_our_uuid); crm_free(fsa_our_dc); crm_free(ipc_server); crm_free(max_generation_from); free_xml(max_generation_xml); } /* A_EXIT_0, A_EXIT_1 */ enum crmd_fsa_input do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int exit_code = 0; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if(action & A_EXIT_1) { exit_code = 1; log_level = LOG_ERR; exit_type = "forcefully"; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the CRMd", fsa_action2string(action), exit_type); if(is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); exit_code = 2; } if(is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn by Heartbeat"); exit_code = 100; } free_mem(msg_data); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); cl_flush_logs(); exit(exit_code); return I_NULL; } /* A_STARTUP */ enum crmd_fsa_input do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int was_error = 0; int interval = 1; /* seconds between DC heartbeats */ crm_debug("Registering Signal Handlers"); G_main_add_SignalHandler( G_PRIORITY_HIGH, SIGTERM, crm_shutdown, NULL, NULL); fsa_source = G_main_add_TriggerHandler( G_PRIORITY_HIGH, crm_fsa_trigger, NULL, NULL); ipc_clients = g_hash_table_new(g_str_hash, g_str_equal); crm_debug("Creating CIB and LRM objects"); fsa_cib_conn = cib_new(); fsa_lrm_conn = ll_lrm_new(XML_CIB_TAG_LRM); crm_debug("Init server comms"); if(ipc_server == NULL) { ipc_server = crm_strdup(CRM_SYSTEM_CRMD); } was_error = init_server_ipc_comms(ipc_server, 
crmd_client_connect, default_ipc_connection_destroy); /* set up the timers */ crm_malloc0(integration_timer, sizeof(fsa_timer_t)); crm_malloc0(finalization_timer, sizeof(fsa_timer_t)); crm_malloc0(election_trigger, sizeof(fsa_timer_t)); crm_malloc0(election_timeout, sizeof(fsa_timer_t)); crm_malloc0(shutdown_escalation_timer, sizeof(fsa_timer_t)); crm_malloc0(wait_timer, sizeof(fsa_timer_t)); crm_malloc0(recheck_timer, sizeof(fsa_timer_t)); interval = interval * 1000; if(election_trigger != NULL) { election_trigger->source_id = 0; election_trigger->period_ms = -1; election_trigger->fsa_input = I_DC_TIMEOUT; election_trigger->callback = crm_timer_popped; election_trigger->repeat = FALSE; } else { was_error = TRUE; } if(election_timeout != NULL) { election_timeout->source_id = 0; election_timeout->period_ms = -1; election_timeout->fsa_input = I_ELECTION_DC; election_timeout->callback = crm_timer_popped; election_timeout->repeat = FALSE; } else { was_error = TRUE; } if(integration_timer != NULL) { integration_timer->source_id = 0; integration_timer->period_ms = -1; integration_timer->fsa_input = I_INTEGRATED; integration_timer->callback = crm_timer_popped; integration_timer->repeat = FALSE; } else { was_error = TRUE; } if(finalization_timer != NULL) { finalization_timer->source_id = 0; finalization_timer->period_ms = -1; finalization_timer->fsa_input = I_FINALIZED; finalization_timer->callback = crm_timer_popped; finalization_timer->repeat = FALSE; /* for possible enabling... a bug in the join protocol left * a slave in S_PENDING while we think its in S_NOT_DC * * raising I_FINALIZED put us into a transition loop which is * never resolved. * in this loop we continually send probes which the node * NACK's because its in S_PENDING * * if we have nodes where heartbeat is active but the * CRM is not... 
then this will be handled in the * integration phase */ finalization_timer->fsa_input = I_ELECTION; } else { was_error = TRUE; } if(shutdown_escalation_timer != NULL) { shutdown_escalation_timer->source_id = 0; shutdown_escalation_timer->period_ms = -1; shutdown_escalation_timer->fsa_input = I_STOP; shutdown_escalation_timer->callback = crm_timer_popped; shutdown_escalation_timer->repeat = FALSE; } else { was_error = TRUE; } if(wait_timer != NULL) { wait_timer->source_id = 0; wait_timer->period_ms = 500; wait_timer->fsa_input = I_NULL; wait_timer->callback = crm_timer_popped; wait_timer->repeat = FALSE; } else { was_error = TRUE; } if(recheck_timer != NULL) { recheck_timer->source_id = 0; recheck_timer->period_ms = -1; recheck_timer->fsa_input = I_PE_CALC; recheck_timer->callback = crm_timer_popped; recheck_timer->repeat = FALSE; } else { was_error = TRUE; } /* set up the sub systems */ crm_malloc0(cib_subsystem, sizeof(struct crm_subsystem_s)); crm_malloc0(te_subsystem, sizeof(struct crm_subsystem_s)); crm_malloc0(pe_subsystem, sizeof(struct crm_subsystem_s)); if(cib_subsystem != NULL) { cib_subsystem->pid = -1; cib_subsystem->path = BIN_DIR; cib_subsystem->name = CRM_SYSTEM_CIB; cib_subsystem->command = BIN_DIR"/"CRM_SYSTEM_CIB; cib_subsystem->args = "-VVc"; cib_subsystem->flag_connected = R_CIB_CONNECTED; cib_subsystem->flag_required = R_CIB_REQUIRED; } else { was_error = TRUE; } if(te_subsystem != NULL) { te_subsystem->pid = -1; te_subsystem->path = BIN_DIR; te_subsystem->name = CRM_SYSTEM_TENGINE; te_subsystem->command = BIN_DIR"/"CRM_SYSTEM_TENGINE; te_subsystem->args = NULL; te_subsystem->flag_connected = R_TE_CONNECTED; te_subsystem->flag_required = R_TE_REQUIRED; } else { was_error = TRUE; } if(pe_subsystem != NULL) { pe_subsystem->pid = -1; pe_subsystem->path = BIN_DIR; pe_subsystem->name = CRM_SYSTEM_PENGINE; pe_subsystem->command = BIN_DIR"/"CRM_SYSTEM_PENGINE; pe_subsystem->args = NULL; pe_subsystem->flag_connected = R_PE_CONNECTED; 
pe_subsystem->flag_required = R_PE_REQUIRED; } else { was_error = TRUE; } if(was_error) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } welcomed_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); integrated_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); finalized_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); confirmed_nodes = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); crmd_peer_state = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); - set_sigchld_proctrack(G_PRIORITY_HIGH); + set_sigchld_proctrack(G_PRIORITY_HIGH,DEFAULT_MAXDISPATCHTIME); return I_NULL; } /* A_STOP */ enum crmd_fsa_input do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); return I_NULL; } /* A_STARTED */ enum crmd_fsa_input do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { if(is_set(fsa_input_register, R_CCM_DATA) == FALSE) { crm_info("Delaying start, CCM (%.16llx) not connected", R_CCM_DATA); crmd_fsa_stall(NULL); return I_NULL; } else if(is_set(fsa_input_register, R_LRM_CONNECTED) == FALSE) { crm_info("Delaying start, LRM (%.16llx) not connected", R_LRM_CONNECTED); crmd_fsa_stall(NULL); return I_NULL; } else if(is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Delaying start, CIB (%.16llx) not connected", R_CIB_CONNECTED); crmd_fsa_stall(NULL); return I_NULL; } else if(is_set(fsa_input_register, R_READ_CONFIG) == FALSE) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(NULL); return I_NULL; } else if(is_set(fsa_input_register, R_PEER_DATA) == FALSE) { HA_Message * msg = 
NULL; /* try reading from HA */ crm_info("Delaying start, Peer data (%.16llx) not recieved", R_PEER_DATA); crm_debug_3("Looking for a HA message"); msg = fsa_cluster_conn->llc_ops->readmsg(fsa_cluster_conn, 0); if(msg != NULL) { crm_debug_3("There was a HA message"); crm_msg_del(msg); } crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return I_NULL; } crm_info("The local CRM is operational"); clear_bit_inplace(fsa_input_register, R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); return I_NULL; } /* A_RECOVER */ enum crmd_fsa_input do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { set_bit_inplace(fsa_input_register, R_IN_RECOVERY); crm_err("Action %s (%.16llx) not supported", fsa_action2string(action), action); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); return I_NULL; } pe_cluster_option crmd_opts[] = { /* name, old-name, validate, default, description */ { XML_CONFIG_ATTR_DC_DEADTIME, NULL, "time", NULL, "10s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed and load of your network." }, { XML_CONFIG_ATTR_RECHECK, NULL, "time", "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "0", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time. To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, { XML_CONFIG_ATTR_ELECTION_FAIL, NULL, "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." 
}, { XML_CONFIG_ATTR_FORCE_QUIT, NULL, "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, }; void crmd_metadata(void) { config_metadata("CRM Daemon", "1.0", "CRM Daemon Options", "This is a fake resource that details the options that can be configured for the CRM Daemon.", crmd_opts, DIMOF(crmd_opts)); } static void verify_crmd_options(GHashTable *options) { verify_all_options(options, crmd_opts, DIMOF(crmd_opts)); } static const char * crmd_pref(GHashTable *options, const char *name) { return get_cluster_pref(options, crmd_opts, DIMOF(crmd_opts), name); } static void config_query_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; if(rc != cib_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", cib_error2string(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if(rc == cib_bad_permissions || rc == cib_bad_digest || rc == cib_bad_config) { crm_err("The cluster is mis-configured - shutting down and staying down"); set_bit_inplace(fsa_input_register, R_STAYDOWN); } return; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = g_hash_table_new_full( g_str_hash,g_str_equal, g_hash_destroy_str,g_hash_destroy_str); unpack_instance_attributes( output, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, NULL); value = g_hash_table_lookup(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); if(value == NULL) { /* apparently we're not allowed to free the result of 
getenv */ char *param_val = getenv(ENV_PREFIX "" KEY_INITDEAD); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); if(param_val != NULL) { int from_env = crm_get_msec(param_val) / 2; int from_defaults = crm_get_msec(value); if(from_env > from_defaults) { g_hash_table_replace( config_hash, crm_strdup(XML_CONFIG_ATTR_DC_DEADTIME), crm_strdup(param_val)); } } } verify_crmd_options(config_hash); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); election_timeout->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-integration-timeout"); integration_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-finalization-timeout"); finalization_timer->period_ms = crm_get_msec(value); set_bit_inplace(fsa_input_register, R_READ_CONFIG); crm_debug_3("Triggering FSA: %s", __FUNCTION__); G_main_set_trigger(fsa_source); g_hash_table_destroy(config_hash); } /* A_READCONFIG */ enum crmd_fsa_input do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int call_id = fsa_cib_conn->cmds->query( fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local); add_cib_op_callback(call_id, FALSE, NULL, config_query_callback); crm_debug_2("Querying the CIB... 
call %d", call_id); return I_NULL; } gboolean crm_shutdown(int nsig, gpointer unused) { if (crmd_mainloop != NULL && g_main_is_running(crmd_mainloop)) { if(is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating the shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); } else { crm_info("Requesting shutdown"); set_bit_inplace(fsa_input_register, R_SHUTDOWN); register_fsa_input(C_SHUTDOWN,I_SHUTDOWN,NULL); if(shutdown_escalation_timer->period_ms < 1) { GHashTable *config_hash = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); const char *value = crmd_pref( config_hash, XML_CONFIG_ATTR_FORCE_QUIT); int msec = crm_get_msec(value); crm_info("Using default shutdown escalation: %dms", msec); shutdown_escalation_timer->period_ms = msec; g_hash_table_destroy(config_hash); } /* cant rely on this... */ crm_timer_start(shutdown_escalation_timer); } } else { crm_info("exit from shutdown"); exit(LSB_EXIT_OK); } return TRUE; } static void default_cib_update_callback(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { if(rc != cib_ok) { fsa_data_t *msg_data = NULL; crm_err("CIB Update failed: %s", cib_error2string(rc)); crm_log_xml_warn(output, "update:failed"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } void populate_cib_nodes(ll_cluster_t *hb_cluster, gboolean with_client_status) { int call_id = 0; const char *ha_node = NULL; crm_data_t *cib_node_list = NULL; /* Async get client status information in the cluster */ crm_debug_2("Invoked"); if(with_client_status) { crm_debug_3("Requesting an initial dump of CRMD client_status"); fsa_cluster_conn->llc_ops->client_status( fsa_cluster_conn, NULL, CRM_SYSTEM_CRMD, -1); } crm_info("Requesting the list of configured nodes"); fsa_cluster_conn->llc_ops->init_nodewalk(fsa_cluster_conn); cib_node_list = create_xml_node(NULL, XML_CIB_TAG_NODES); do { const char *ha_node_type = NULL; const char *ha_node_uuid = NULL; crm_data_t *cib_new_node = 
NULL; ha_node = fsa_cluster_conn->llc_ops->nextnode(fsa_cluster_conn); if(ha_node == NULL) { continue; } ha_node_type = fsa_cluster_conn->llc_ops->node_type( fsa_cluster_conn, ha_node); if(safe_str_neq(NORMALNODE, ha_node_type)) { crm_debug("Node %s: skipping '%s'", ha_node, ha_node_type); continue; } ha_node_uuid = get_uuid(fsa_cluster_conn, ha_node); if(ha_node_uuid == NULL) { crm_warn("Node %s: no uuid found", ha_node); continue; } crm_notice("Node: %s (uuid: %s)", ha_node, ha_node_uuid); cib_new_node = create_xml_node(cib_node_list, XML_CIB_TAG_NODE); crm_xml_add(cib_new_node, XML_ATTR_ID, ha_node_uuid); crm_xml_add(cib_new_node, XML_ATTR_UNAME, ha_node); crm_xml_add(cib_new_node, XML_ATTR_TYPE, ha_node_type); } while(ha_node != NULL); fsa_cluster_conn->llc_ops->end_nodewalk(fsa_cluster_conn); /* Now update the CIB with the list of nodes */ fsa_cib_update( XML_CIB_TAG_NODES, cib_node_list, cib_scope_local|cib_quorum_override|cib_inhibit_bcast, call_id); add_cib_op_callback(call_id, FALSE, NULL, default_cib_update_callback); free_xml(cib_node_list); crm_debug_2("Complete"); } gboolean register_with_ha(ll_cluster_t *hb_cluster, const char *client_name) { const char *const_uuid = NULL; crm_debug("Signing in with Heartbeat"); if (hb_cluster->llc_ops->signon(hb_cluster, client_name)!= HA_OK) { crm_err("Cannot sign on with heartbeat: %s", hb_cluster->llc_ops->errmsg(hb_cluster)); return FALSE; } crm_debug_3("Be informed of CRM messages"); if (HA_OK != hb_cluster->llc_ops->set_msg_callback( hb_cluster, T_CRM, crmd_ha_msg_callback, hb_cluster)){ crm_err("Cannot set msg callback: %s", hb_cluster->llc_ops->errmsg(hb_cluster)); return FALSE; } crm_debug_3("Be informed of Node Status changes"); if (HA_OK != hb_cluster->llc_ops->set_nstatus_callback( hb_cluster, crmd_ha_status_callback, hb_cluster)){ crm_err("Cannot set nstatus callback: %s", hb_cluster->llc_ops->errmsg(hb_cluster)); return FALSE; } crm_debug_3("Be informed of CRM Client Status changes"); if (HA_OK != 
hb_cluster->llc_ops->set_cstatus_callback( hb_cluster, crmd_client_status_callback, hb_cluster)) { crm_err("Cannot set cstatus callback: %s", hb_cluster->llc_ops->errmsg(hb_cluster)); return FALSE; } crm_debug_3("Adding channel to mainloop"); G_main_add_ll_cluster( G_PRIORITY_HIGH, hb_cluster, FALSE, crmd_ha_msg_dispatch, hb_cluster /* userdata */, crmd_ha_connection_destroy); crm_debug_3("Finding our node name"); if ((fsa_our_uname = hb_cluster->llc_ops->get_mynodeid(hb_cluster)) == NULL) { crm_err("get_mynodeid() failed"); return FALSE; } crm_info("Hostname: %s", fsa_our_uname); crm_debug_3("Finding our node uuid"); const_uuid = get_uuid(fsa_cluster_conn, fsa_our_uname); if(const_uuid == NULL) { crm_err("get_uuid_by_name() failed"); return FALSE; } /* copy it so that unget_uuid() doesn't trash the value on us */ fsa_our_uuid = crm_strdup(const_uuid); crm_info("UUID: %s", fsa_our_uuid); populate_cib_nodes(hb_cluster, TRUE); return TRUE; } diff --git a/cts/CIB.py.in b/cts/CIB.py.in index bf030119a6..2a62794f05 100644 --- a/cts/CIB.py.in +++ b/cts/CIB.py.in @@ -1,248 +1,248 @@ #!@PYTHON@ '''CTS: Cluster Testing System: CIB generator ''' __copyright__=''' Author: Jia Ming Pan Copyright (C) 2006 International Business Machines ''' from UserDict import UserDict import sys, time, types, syslog, os, struct, string, signal, traceback from CTS import ClusterManager from CM_hb import HeartbeatCM class CIB: cib_option_template = ''' ''' ipaddr_template = ''' ''' hb_ipaddr_template = ''' ''' lsb_resource = ''' - + ''' dummy_resource_template = ''' ''' clustermon_resource_template = ''' ''' clustermon_location_constraint = ''' ''' master_slave_resource = ''' ''' resource_group_template = '''%s %s %s''' per_node_constraint_template = ''' ''' stonith_resource_template = """ """ cib_template =''' %s %s %s ''' def NextIP(self): fields = string.split(self.CM.Env["IPBase"], '.') fields[3] = str(int(fields[3])+1) ip = string.join(fields, '.') self.CM.Env["IPBase"]=ip return ip def 
__init__(self, CM): self.CM = CM #make up crm config cib_options = self.cib_option_template % CM.Env["DoFencing"] #create resources and their constraints resources = "" constraints = "" if self.CM.Env["DoBSC"] == 1: cib_options = cib_options + ''' ''' if self.CM.Env["CIBResource"] != 1: # generate cib self.cts_cib = self.cib_template % (cib_options, resources, constraints) return if self.CM.cluster_monitor == 1: resources += self.clustermon_resource_template constraints += self.clustermon_location_constraint ip1=self.NextIP() ip2=self.NextIP() ip3=self.NextIP() ip1_rsc = self.ipaddr_template % (ip1, ip1, ip1, ip1, ip1) ip2_rsc = self.hb_ipaddr_template % (ip2, ip2, ip2, ip2, ip2) ip3_rsc = self.ipaddr_template % (ip3, ip3, ip3, ip3, ip3) resources += self.resource_group_template % (ip1_rsc, ip2_rsc, ip3_rsc) # lsb resource resources += self.lsb_resource # Mirgator resources += self.dummy_resource_template % \ ("migrator", "migrator", "migrator", "migrator") # per node resource fields = string.split(self.CM.Env["IPBase"], '.') for node in self.CM.Env["nodes"]: ip = self.NextIP() per_node_resources = self.ipaddr_template % \ ("rsc_"+node, "rsc_"+node, "rsc_"+node, "rsc_"+node, ip) per_node_constraint = self.per_node_constraint_template % \ ("rsc_"+node, "rsc_"+node, "rsc_"+node, "rsc_"+node, node) resources += per_node_resources constraints += per_node_constraint # fencing resource nodelist = "" len = 0 for node in self.CM.Env["nodes"]: nodelist += node + " " len = len + 1 stonith_resource = self.stonith_resource_template % \ (self.CM.Env["reset"].stonithtype, self.CM.Env["reset"].configName, self.CM.Env["reset"].configValue) resources += stonith_resource #master slave resource resources += self.master_slave_resource % (2*len, 2, len, 1) # generate cib self.cts_cib = self.cib_template % (cib_options, resources, constraints) def cib(self): return self.cts_cib diff --git a/lib/plugins/lrm/raexecstonith.c b/lib/plugins/lrm/raexecstonith.c index 5ea99fb90f..d5414f42e5 
100644 --- a/lib/plugins/lrm/raexecstonith.c +++ b/lib/plugins/lrm/raexecstonith.c @@ -1,389 +1,390 @@ /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA * * File: raexecocf.c * Author: Sun Jiang Dong * Copyright (c) 2004 International Business Machines * * This code implements the Resource Agent Plugin Module for LSB style. * It's a part of Local Resource Manager. Currently it's used by lrmd only. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* Add it for compiling on OSX */ #include #include #include #include # define PIL_PLUGINTYPE RA_EXEC_TYPE # define PIL_PLUGINTYPE_S "RAExec" # define PIL_PLUGINLICENSE LICENSE_PUBDOM # define PIL_PLUGINLICENSEURL URL_PUBDOM # define PIL_PLUGIN stonith # define PIL_PLUGIN_S "stonith" static PIL_rc close_stonithRA(PILInterface*, void* ud_interface); /* static const char * RA_PATH = STONITH_RA_DIR; */ /* Temporarily use it */ static const char * RA_PATH = HA_LIBHBDIR "/stonith/plugins/stonith/"; /* The begin of exported function list */ static int execra(const char * rsc_id, const char * rsc_type, const char * provider, const char * op_type, const int timeout, GHashTable * params); static uniform_ret_execra_t map_ra_retvalue(int ret_execra , const char * op_type, const char * std_output); static int get_resource_list(GList ** rsc_info); static char* get_resource_meta(const char* rsc_type, const char* provider); static int get_provider_list(const char* op_type, GList ** providers); /* The end of exported function list */ /* The begin of internal used function & data list */ static int get_providers(const char* class_path, const char* op_type, GList ** providers); static void stonithRA_ops_callback(stonithRA_ops_t * op, void * private_data); static int exit_value; /* The end of internal function & data list */ /* Rource agent execution plugin operations */ static struct RAExecOps raops = { execra, map_ra_retvalue, get_resource_list, get_provider_list, get_resource_meta }; static const char META_TEMPLATE[] = "\n" "\n" "\n" "1.0\n" "\n" "%s\n" "\n" "%s\n" "%s\n" "\n" "\n" "\n" "\n" "\n" "\n" "\n" "\n" "2.0\n" "\n" "\n"; static const char * no_parameter_info = ""; #define CHECKMETANULL(ret, which) \ if (ret == NULL) { \ cl_log(LOG_WARNING, "stonithRA plugin: cannot get %s " \ "segment of %s's metadata.", which, rsc_type); \ ret 
= no_parameter_info; \ } #define xmlize(p) \ ( p ? (char *)xmlEncodeEntitiesReentrant(NULL, \ (const unsigned char *)p) \ : NULL ) #define zapxml(p) do { \ if( p ) { \ xmlFree(p); \ } \ } while(0) PIL_PLUGIN_BOILERPLATE2("1.0", Debug); static const PILPluginImports* PluginImports; static PILPlugin* OurPlugin; static PILInterface* OurInterface; static void* OurImports; static void* interfprivate; /* * Our plugin initialization and registration function * It gets called when the plugin gets loaded. */ PIL_rc PIL_PLUGIN_INIT(PILPlugin * us, const PILPluginImports* imports); PIL_rc PIL_PLUGIN_INIT(PILPlugin * us, const PILPluginImports* imports) { /* Force the compiler to do a little type checking */ (void)(PILPluginInitFun)PIL_PLUGIN_INIT; PluginImports = imports; OurPlugin = us; /* Register ourself as a plugin */ imports->register_plugin(us, &OurPIExports); /* Register our interfaces */ return imports->register_interface(us, PIL_PLUGINTYPE_S, PIL_PLUGIN_S, &raops, close_stonithRA, &OurInterface, &OurImports, interfprivate); } static PIL_rc close_stonithRA(PILInterface* pif, void* ud_interface) { return PIL_OK; } /* * Most of the oprations will be sent to sotnithd directly, such as 'start', * 'stop', 'monitor'. And others like 'meta-data' will be handled by itself * locally. * Some of important parameters' name: * config_file * config_string */ static int execra(const char * rsc_id, const char * rsc_type, const char * provider, const char * op_type,const int timeout, GHashTable * params) { stonithRA_ops_t * op; int call_id = -1; char buffer_tmp[32]; /* Handling "meta-data" operation in a special way. * Now handle "meta-data" operation locally. * Should be changed in the future? 
*/ if ( 0 == STRNCMP_CONST(op_type, "meta-data")) { char * tmp; tmp = get_resource_meta(rsc_type, provider); printf("%s", tmp); g_free(tmp); exit(0); } g_snprintf(buffer_tmp, sizeof(buffer_tmp), "%s_%d" , "STONITH_RA_EXEC", getpid()); if (ST_OK != stonithd_signon(buffer_tmp)) { cl_log(LOG_ERR, "%s:%d: Cannot sign on the stonithd." , __FUNCTION__, __LINE__); exit(EXECRA_UNKNOWN_ERROR); } stonithd_set_stonithRA_ops_callback(stonithRA_ops_callback, &call_id); /* Temporarily donnot use it, but how to deal with the global OCF * variables. This is a important thing to think about and do. */ /* send the RA operation to stonithd to simulate a RA's actions */ if ( 0==STRNCMP_CONST(op_type, "start") || 0==STRNCMP_CONST(op_type, "stop") ) { cl_log(LOG_INFO , "Try to %s STONITH resource : Device=%s" , op_type, rsc_id, rsc_type); } op = g_new(stonithRA_ops_t, 1); op->ra_name = g_strdup(rsc_type); op->op_type = g_strdup(op_type); op->params = params; op->rsc_id = g_strdup(rsc_id); if (ST_OK != stonithd_virtual_stonithRA_ops(op, &call_id)) { cl_log(LOG_ERR, "sending stonithRA op to stonithd failed."); /* Need to improve the granularity for error return code */ stonithd_signoff(); exit(EXECRA_EXEC_UNKNOWN_ERROR); } /* May be redundant */ /* while (stonithd_op_result_ready() != TRUE) { ; } */ /* cl_log(LOG_DEBUG, "Will call stonithd_receive_ops_result."); */ if (ST_OK != stonithd_receive_ops_result(TRUE)) { cl_log(LOG_ERR, "stonithd_receive_ops_result failed."); /* Need to improve the granularity for error return code */ stonithd_signoff(); exit(EXECRA_EXEC_UNKNOWN_ERROR); } /* exit_value will be setted by the callback function */ g_free(op->ra_name); g_free(op->op_type); g_free(op->rsc_id); g_free(op); stonithd_signoff(); /* cl_log(LOG_DEBUG, "stonithRA orignal exit code=%d", exit_value); */ exit(map_ra_retvalue(exit_value, op_type, NULL)); } static void stonithRA_ops_callback(stonithRA_ops_t * op, void * private_data) { /* cl_log(LOG_DEBUG, "setting exit code=%d", exit_value); */ 
exit_value = op->op_result; } static uniform_ret_execra_t map_ra_retvalue(int ret_execra, const char * op_type, const char * std_output) { /* Because the UNIFORM_RET_EXECRA is compatible with OCF standard, no * actual mapping except validating, which ensure the return code * will be in the range 0 to 7. Too strict? */ - if (ret_execra < 0 || ret_execra > 7) { + if (ret_execra < EXECRA_EXEC_UNKNOWN_ERROR || + ret_execra > EXECRA_STATUS_UNKNOWN) { cl_log(LOG_WARNING, "mapped the invalid return code %d." , ret_execra); ret_execra = EXECRA_UNKNOWN_ERROR; } return ret_execra; } static int get_resource_list(GList ** rsc_info) { int rc; int needprivs = !cl_have_full_privs(); if ( rsc_info == NULL ) { cl_log(LOG_ERR, "Parameter error: get_resource_list"); return -2; } if ( *rsc_info != NULL ) { cl_log(LOG_ERR, "Parameter error: get_resource_list."\ "will cause memory leak."); *rsc_info = NULL; } if (needprivs) { return_to_orig_privs(); } if (ST_OK != stonithd_signon("STONITH_RA")) { cl_log(LOG_ERR, "%s:%d: Can not signon to the stonithd." 
, __FUNCTION__, __LINE__); rc = -1; } else { rc = stonithd_list_stonith_types(rsc_info); stonithd_signoff(); } if (needprivs) { return_to_dropped_privs(); } return rc; } static int get_provider_list(const char* op_type, GList ** providers) { int ret; ret = get_providers(RA_PATH, op_type, providers); if (0>ret) { cl_log(LOG_ERR, "scandir failed in stonith RA plugin"); } return ret; } static char * get_resource_meta(const char* rsc_type, const char* provider) { char * buffer; int bufferlen = 0; const char * meta_param = NULL; const char * meta_longdesc = NULL; const char * meta_shortdesc = NULL; char *xml_meta_longdesc = NULL; char *xml_meta_shortdesc = NULL; Stonith * stonith_obj = NULL; if ( provider != NULL ) { cl_log(LOG_DEBUG, "stonithRA plugin: provider attribute " "is not needed and will be ignored."); } stonith_obj = stonith_new(rsc_type); meta_longdesc = stonith_get_info(stonith_obj, ST_DEVICEDESCR); CHECKMETANULL(meta_longdesc, "longdesc") xml_meta_longdesc = xmlize(meta_longdesc); meta_shortdesc = stonith_get_info(stonith_obj, ST_DEVICENAME); CHECKMETANULL(meta_shortdesc, "shortdesc") xml_meta_shortdesc = xmlize(meta_shortdesc); meta_param = stonith_get_info(stonith_obj, ST_CONF_XML); CHECKMETANULL(meta_param, "parameters") bufferlen = STRLEN_CONST(META_TEMPLATE) + strlen(rsc_type) + strlen(xml_meta_longdesc) + strlen(xml_meta_shortdesc) + strlen(meta_param) + 1; buffer = g_new(char, bufferlen); buffer[bufferlen-1] = '\0'; snprintf(buffer, bufferlen-1, META_TEMPLATE, rsc_type , xml_meta_longdesc, xml_meta_shortdesc, meta_param); stonith_delete(stonith_obj); zapxml(xml_meta_longdesc); zapxml(xml_meta_shortdesc); return buffer; } /* * Currently should return *providers = NULL, but remain the old code for * possible unsing in the future */ static int get_providers(const char* class_path, const char* op_type, GList ** providers) { if ( providers == NULL ) { cl_log(LOG_ERR, "%s:%d: Parameter error: providers==NULL" , __FUNCTION__, __LINE__); return -2; } if ( 
*providers != NULL ) { cl_log(LOG_ERR, "%s:%d: Parameter error: *providers==NULL." "This will cause memory leak." , __FUNCTION__, __LINE__); } /* Now temporarily make it fixed */ *providers = g_list_append(*providers, g_strdup("heartbeat")); return g_list_length(*providers); } diff --git a/tools/Makefile.am b/tools/Makefile.am index af745d46ea..5351c68b09 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -1,89 +1,91 @@ # # heartbeat: Linux-HA heartbeat code # # Copyright (C) 2001 Michael Moerz # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
# MAINTAINERCLEANFILES = Makefile.in ccdv INCLUDES = -I$(top_builddir)/include -I$(top_srcdir)/include \ -I$(top_builddir)/linux-ha -I$(top_srcdir)/linux-ha \ -I$(top_builddir)/libltdl -I$(top_srcdir)/libltdl -EXTRA_DIST = ccdv.c attrd.h ocf-tester +EXTRA_DIST = ccdv.c attrd.h $(hanoarch_DATA) $(sbin_SCRIPTS) apigid = @HA_APIGID@ habindir = @bindir@ halibdir = $(libdir)/@HB_PKG@ +hanoarchdir = @HA_NOARCHDATAHBDIR@ gliblib = @GLIBLIB@ habin_PROGRAMS = cl_status cl_respawn halib_SCRIPTS = haresources2cib.py -sbin_SCRIPTS = ocf-tester +hanoarch_DATA = utillib.sh README.hb_report +sbin_SCRIPTS = ocf-tester hb_report if CRM_BUILD halib_PROGRAMS = attrd pingd sbin_PROGRAMS = attrd_updater endif ## SOURCES ccdv: $(top_srcdir)/tools/ccdv.c gcc $(AM_CFLAGS) $(CFLAGS) -o ccdv $(top_srcdir)/tools/ccdv.c cl_status_SOURCES = cl_status.c # A little trick. Now ccdv can be auto-built but not auto-cleaned. cl_status_DEPENDENCIES = ccdv cl_status_LDADD = $(top_builddir)/lib/hbclient/libhbclient.la \ $(top_builddir)/lib/clplumbing/libplumb.la \ $(gliblib) \ $(top_builddir)/replace/libreplace.la cl_respawn_SOURCES = cl_respawn.c cl_respawn_LDADD = $(top_builddir)/lib/clplumbing/libplumb.la \ $(top_builddir)/lib/apphb/libapphb.la \ $(gliblib) \ $(top_builddir)/replace/libreplace.la attrd_SOURCES = attrd.c attrd_LDADD = \ $(top_builddir)/lib/clplumbing/libplumb.la \ $(top_builddir)/lib/crm/common/libcrmcommon.la \ $(top_builddir)/lib/hbclient/libhbclient.la \ $(top_builddir)/lib/crm/cib/libcib.la \ $(GLIBLIB) \ $(LIBRT) pingd_SOURCES = pingd.c pingd_LDADD = \ $(top_builddir)/lib/clplumbing/libplumb.la \ $(top_builddir)/lib/crm/common/libcrmcommon.la \ $(top_builddir)/lib/hbclient/libhbclient.la \ $(GLIBLIB) \ $(LIBRT) attrd_updater_SOURCES = attrd_updater.c attrd_updater_LDADD = \ $(top_builddir)/lib/clplumbing/libplumb.la \ $(top_builddir)/lib/crm/common/libcrmcommon.la \ $(GLIBLIB) \ $(LIBRT) install-data-hook: # install-exec-hook doesn't work (!) 
-chgrp $(apigid) $(DESTDIR)/$(habindir)/cl_status -chmod g+s,a-w $(DESTDIR)/$(habindir)/cl_status .PHONY: install-exec-hook diff --git a/tools/README.hb_report b/tools/README.hb_report new file mode 100644 index 0000000000..043898184c --- /dev/null +++ b/tools/README.hb_report @@ -0,0 +1,297 @@ +Heartbeat reporting +=================== +Dejan Muhamedagic +v1.0 + +`hb_report` is a utility to collect all information relevant to +Heartbeat over the given period of time. + +Quick start +----------- + +Run `hb_report` on one of the nodes or on the host which serves as +a central log server. Run `hb_report` without parameters to see usage. + +A few examples: + +1. Last night during the backup there were several warnings +encountered (logserver is the log host): ++ + logserver# hb_report -f 3:00 -t 4:00 /tmp/report ++ +collects everything from all nodes from 3am to 4am last night. +The files are stored in /tmp/report and compressed to a tarball +/tmp/report.tar.gz. + +2. Just found a problem during testing: + + node1# date : note the current time + node1# /etc/init.d/heartbeat start + node1# nasty_command_that_breaks_things + node1# sleep 120 : wait for the cluster to settle + node1# hb_report -f time /tmp/hb1 + +Introduction +------------ + +Managing clusters is cumbersome. Heartbeat v2 with its numerous +configuration files and multi-node clusters just adds to the +complexity. No wonder then that most problem reports were less +than optimal. This is an attempt to rectify that situation and +make life easier for both the users and the developers. + +On security +----------- + +`hb_report` is a fairly complex program. As some of you are +probably going to run it as root let us state a few important +things you should keep in mind: + +1. Don't run `hb_report` as root! It is fairly simple to setup +things in such a way that root access is not needed. 
I won't go +into details, just to stress that all information collected +should be readable by accounts belonging to the haclient group. + +2. If you still have to run this as root. Well, don't use the +`-C` option. + +3. Of course, every possible precaution has been taken not to +disturb processes, or touch or remove files out of the given +destination directory. If you (by mistake) specify an existing +directory, `hb_report` will bail out soon. If you specify a +relative path, it won't work either. The final product of +`hb_report` is a tarball. However, the destination directory is +not removed on any node, unless the user specifies `-C`. If you're +too lazy to cleanup the previous run, do yourself a favour and +just supply a new destination directory. You've been warned. If +you worry about the space used, just put all your directories +under /tmp and setup a cronjob to remove those directories once a +week: +.......... + for d in /tmp/*; do + test -d $d || + continue + test -f $d/description.txt || test -f $d/.env || + continue + grep -qs 'By: hb_report' $d/description.txt || + grep -qs '^UNIQUE_MSG=Mark' $d/.env || + continue + rm -r $d + done +.......... + +Mode of operation +----------------- + +Cluster data collection is straightforward: just run the same +procedure on all nodes and collect the reports. There is, +apart from many small ones, one large complication: central +syslog destination. So, in order to allow this to be fully +automated, we should sometimes run the procedure on the log host +too. Actually, if there is a log host, then the best way is to +run `hb_report` there. + +We use ssh for the remote program invocation. Even though it is +possible to run `hb_report` without ssh by doing a more menial job, +the overall user experience is much better if ssh works. Anyway, +how else do you manage your cluster? 
+ +Another ssh related point: In case your security policy +proscribes loghost-to-cluster-over-ssh communications, then +you'll have to copy the log file to one of the nodes and point +`hb_report` to it. + +Prerequisites +------------- + +1. ssh ++ +This is not strictly required, but you won't regret having a +password-less ssh. It is not too difficult to setup and will save +you a lot of time. If you can't have it, for example because your +security policy does not allow such a thing, or you just prefer +menial work, then you will have to resort to the semi-manual +semi-automated report generation. See below for instructions. + +2. Times ++ +In order to find files and messages in the given period and to +parse the `-f` and `-t` options, `hb_report` uses perl and one of the +`Date::Parse` or `Date::Manip` perl modules. Note that you need +only one of these. ++ +On rpm based distributions, you can find `Date::Parse` in +`perl-TimeDate` and on Debian and its derivatives in +`libtimedate-perl`. + +3. Core dumps ++ +To backtrace core dumps gdb is needed and the Heartbeat packages +with the debugging info. The debug info packages may be installed +at the time the report is created. Let's hope that you will need +this really seldom. + +What is in the report +--------------------- + +1. Heartbeat related +- heartbeat version/release information +- heartbeat configuration (CIB, ha.cf, logd.cf) +- heartbeat status (output from crm_mon, crm_verify, ccm_tool) +- pengine transition graphs (if any) +- backtraces of core dumps (if any) +- heartbeat logs (if any) +2. System related +- general platform information (`uname`, `arch`, `distribution`) +- system statistics (`uptime`, `top`, `ps`) +3. User created :) +- problem description (template to be edited) +4. Generated +- problem analysis (generated) + +It is preferred that the Heartbeat is running at the time of the +report, but not absolutely required. `hb_report` will also do a +quick analysis of the collected information. 
+ +Times +----- + +Specifying times can at times be a nuisance. That is why we have +chosen to use one of the perl modules--they do allow certain +freedom when talking dates. You can either read the instructions +at the +http://search.cpan.org/dist/TimeDate/lib/Date/Parse.pm#EXAMPLE_DATES[Date::Parse +examples page]. + +or just rely on common sense and try stuff like: + + 3:00 (today at 3am) + 15:00 (today at 3pm) + 2007/9/1 2pm (September 1st at 2pm) + +`hb_report` will (probably) complain if it can't figure out what +you mean. + +Try to delimit the event as close as possible in order to reduce +the size of the report, but still leaving a minute or two around +for good measure. + +Note that `-f` is not an optional option. And don't forget to quote +dates when they contain spaces. + +Should I send all this to the rest of Internet? +----------------------------------------------- + +We make an effort to remove sensitive data from the Heartbeat +configuration (CIB, ha.cf, and transition graphs). However, you +_have_ to tell us what is sensitive! Use the `-p` option to specify +additional regular expressions to match variable names which may +contain information you don't want to leak. For example: + + # hb_report -f 18:00 -p "user.*" -p "secret.*" /var/tmp/report + +We look by default for variable names matching "pass.*" and the +stonith_host ha.cf directive. + +Logs and other files are not filtered. Please filter them +yourself if necessary. + +Logs +---- + +It may be tricky to find syslog logs. The scheme used is to log a +unique message on all nodes and then look it up in the usual +syslog locations. This procedure is not foolproof, in particular +if the syslog files are in a non-standard directory. We look in +/var/log /var/logs /var/syslog /var/adm /var/log/ha +/var/log/cluster. 
In case we can't find the logs, please supply +their location: + + # hb_report -f 5pm -l /var/log/cluster1/ha-log -S /tmp/report_node1 + +If you have different log locations on different nodes, well, +perhaps you'd like to make them the same. Or read about the +manual report collection. + +The log files are collected from all hosts where found. In case +your syslog is configured to log to both the log server and local +files and `hb_report` is run on the log server you will end up with +multiple logs with same content. + +Files starting with "ha-" are preferred. In case syslog sends +messages to more than one file, if one of them is named ha-log or +ha-debug those will be favoured to syslog or messages. + +If there is no separate log for Heartbeat, possibly unrelated +messages from other programs are included. We don't filter logs, +just pick a segment for the period you specified. + +NB: Don't have a central log host? Read the CTS README and setup +one. + +Manual report collection +------------------------ + +So, your ssh doesn't work. In that case, you will have to run +this procedure on all nodes. Use `-S` so that we don't bother with +ssh: + + # hb_report -f 5:20pm -t 5:30pm -S /tmp/report_node1 + +If you also have a log host which is not in the cluster, then +you'll have to copy the log to one of the nodes and tell us where +it is: + + # hb_report -f 5:20pm -t 5:30pm -l /var/tmp/ha-log -S /tmp/report_node1 + +Furthermore, to prevent `hb_report` from asking you to edit the +report to describe the problem on every node use `-D` on all but +one: + + # hb_report -f 5:20pm -t 5:30pm -DS /tmp/report_node1 + +If you reconsider and want the ssh setup, take a look at the CTS +README file for instructions. + +Analysis +-------- + +The point of analysis is to get out the most important +information from probably several thousand lines worth of text. 
+Perhaps this should be more properly named as report review as it +is rather simple, but let's pretend that we are doing something +utterly sophisticated. + +The analysis consists of the following: + +- compare files coming from different nodes; if they are equal, + make one copy in the top level directory, remove duplicates, + and create soft links instead +- print errors, warnings, and lines matching `-L` patterns from logs +- report if there were coredumps and by whom +- report crm_verify results + +The goods +--------- + +1. Common ++ +- ha-log (if found on the log host) +- description.txt (template and user report) +- analysis.txt + +2. Per node ++ +- ha.cf +- logd.cf +- ha-log (if found) +- cib.xml (`cibadmin -Ql` or `cp` if Heartbeat is not running) +- ccm_tool.txt (`ccm_tool -p`) +- crm_mon.txt (`crm_mon -1`) +- crm_verify.txt (`crm_verify -V`) +- pengine/ (only on DC, directory with pengine transitions) +- sysinfo.txt (static info) +- sysstats.txt (dynamic info) +- backtraces.txt (if coredumps found) +- DC (well...) + diff --git a/tools/hb_report.in b/tools/hb_report.in new file mode 100755 index 0000000000..c02a3df378 --- /dev/null +++ b/tools/hb_report.in @@ -0,0 +1,608 @@ +#!/bin/sh + + # Copyright (C) 2007 Dejan Muhamedagic + # + # This program is free software; you can redistribute it and/or + # modify it under the terms of the GNU General Public + # License as published by the Free Software Foundation; either + # version 2.1 of the License, or (at your option) any later version. + # + # This software is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + # General Public License for more details. + # + # You should have received a copy of the GNU General Public + # License along with this library; if not, write to the Free Software + # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + # + +. 
@sysconfdir@/ha.d/shellfuncs +. $HA_NOARCHBIN/utillib.sh + +PROG=`basename $0` +# FIXME: once this is part of the package! +PROGDIR=`dirname $0` +echo "$PROGDIR" | grep -qs '^/' || { + test -f @sbindir@/$PROG && + PROGDIR=@sbindir@ + test -f $HA_NOARCHBIN/$PROG && + PROGDIR=$HA_NOARCHBIN +} + +LOGD_CF=`findlogdcf @sysconfdir@ $HA_DIR` +export LOGD_CF + +: ${SSH_OPTS="-T -o Batchmode=yes"} +LOG_PATTERNS="CRIT: ERROR:" + +# +# the instance where user runs hb_report is the master +# the others are slaves +# +if [ x"$1" = x__slave ]; then + SLAVE=1 +fi + +# +# if this is the master, allow ha.cf and logd.cf in the current dir +# (because often the master is the log host) +# +if [ "$SLAVE" = "" ]; then + [ -f ha.cf ] && HA_CF=ha.cf + [ -f logd.cf ] && LOGD_CF=logd.cf +fi + +usage() { + cat< $DESTDIR/.env" + done +} +start_remote_collectors() { + for node in `getnodes`; do + [ "$node" = "$WE" ] && continue + ssh $SSH_OPTS $SSH_USER@$node "$PROGDIR/hb_report __slave $DESTDIR" | + (cd $DESTDIR && tar xf -) & + SLAVEPIDS="$SLAVEPIDS $!" + done +} + +# +# does ssh work? 
+# +findsshuser() { + for n in `getnodes`; do + [ "$node" = "$WE" ] && continue + trysshusers $n $TRY_SSH && break + done +} +checkssh() { + for n in `getnodes`; do + [ "$node" = "$WE" ] && continue + checksshuser $n $SSH_USER || return 1 + done + return 0 +} + +# +# the usual stuff +# +getbacktraces() { + flist=`find_files $HA_VARLIB/cores $1 $2` + [ "$flist" ] && + getbt $flist > $3 +} +getpeinputs() { + n=`basename $3` + flist=$( + if [ -f $3/ha-log ]; then + grep " $n peng.*PEngine Input stored" $3/ha-log | awk '{print $NF}' + else + find_files $HA_VARLIB/pengine $1 $2 + fi | sed "s,$HA_VARLIB/,,g" + ) + [ "$flist" ] && + (cd $HA_VARLIB && tar cf - $flist) | (cd $3 && tar xf -) +} +touch_DC_if_dc() { + dc=`crmadmin -D 2>/dev/null | awk '{print $NF}'` + if [ "$WE" = "$dc" ]; then + touch $1/DC + fi +} + +# +# some basic system info and stats +# +sys_info() { + echo "Heartbeat version: `hb_ver`" + crm_info + echo "Platform: `uname`" + echo "Kernel release: `uname -r`" + echo "Architecture: `arch`" + [ `uname` = Linux ] && + echo "Distribution: `distro`" +} +sys_stats() { + set -x + uptime + ps axf + ps auxw + top -b -n 1 + netstat -i + set +x +} + +# +# replace sensitive info with '****' +# +sanitize() { + for f in $1/ha.cf $1/cib.xml $1/pengine/*; do + [ -f "$f" ] && sanitize_one $f + done +} + +# +# remove duplicates if files are same, make links instead +# +consolidate() { + for n in `getnodes`; do + if [ -f $1/$2 ]; then + rm $1/$n/$2 + else + mv $1/$n/$2 $1 + fi + ln -s ../$2 $1/$n + done +} + +# +# some basic analysis of the report +# +checkcrmvfy() { + for n in `getnodes`; do + if [ -s $1/$n/crm_verify.txt ]; then + echo "WARN: crm_verify reported warnings at $n:" + cat $1/$n/crm_verify.txt + fi + done +} +checkbacktraces() { + for n in `getnodes`; do + [ -s $1/$n/backtraces.txt ] && { + echo "WARN: coredumps found at $n:" + egrep 'Core was generated|Program terminated' \ + $1/$n/backtraces.txt | + sed 's/^/ /' + } + done +} +checklogs() { + logs=`find $1 
-name ha-log` + [ "$logs" ] || return + pattfile=`maketempfile` || + fatal "cannot create temporary files" + for p in $LOG_PATTERNS; do + echo "$p" + done > $pattfile + echo "" + echo "Log patterns:" + for n in `getnodes`; do + cat $logs | grep -f $pattfile + done + rm -f $pattfile +} + +# +# check if files have same content in the cluster +# +cibdiff() { + crm_diff -c -n $1 -o $2 +} +txtdiff() { + diff $1 $2 +} +diffcheck() { + case `basename $1` in + ccm_tool.txt) + txtdiff $1 $2;; # worddiff? + cib.xml) + cibdiff $1 $2;; + ha.cf) + txtdiff $1 $2;; # confdiff? + crm_mon.txt|sysinfo.txt) + txtdiff $1 $2;; + esac +} +analyze_one() { + rc=0 + node0="" + for n in `getnodes`; do + if [ "$node0" ]; then + diffcheck $1/$node0/$2 $1/$n/$2 + rc=$((rc+$?)) + else + node0=$n + fi + done + return $rc +} +analyze() { + flist="ccm_tool.txt cib.xml crm_mon.txt ha.cf sysinfo.txt" + for f in $flist; do + perl -e "printf \"Diff $f... \"" + ls $1/*/$f >/dev/null 2>&1 || continue + if analyze_one $1 $f; then + echo "OK" + consolidate $1 $f + else + echo "varies" + fi + done + checkcrmvfy $1 + checkbacktraces $1 + checklogs $1 +} + +# +# description template, editing, and other notes +# +mktemplate() { + cat<=100{exit 1}' || + cat < $DESTDIR/$WE/ha-log + else + cat > $DESTDIR/ha-log # we are log server, probably + fi +else + warning "could not find the log file on $WE" +fi + +# +# part 6: get all other info (config, stats, etc) +# +if [ "$THIS_IS_NODE" ]; then + getconfig $DESTDIR/$WE + getpeinputs $FROM_TIME $TO_TIME $DESTDIR/$WE + getbacktraces $FROM_TIME $TO_TIME $DESTDIR/$WE/backtraces.txt + touch_DC_if_dc $DESTDIR/$WE + sanitize $DESTDIR/$WE + sys_info > $DESTDIR/$WE/sysinfo.txt + sys_stats > $DESTDIR/$WE/sysstats.txt 2>&1 +fi + +# +# part 7: endgame: +# slaves tar their results to stdout, the master waits +# for them, analyses results, asks the user to edit the +# problem description template, and prints final notes +# +if [ "$SLAVE" ]; then + (cd $DESTDIR && tar cf - $WE) 
+else + wait $SLAVEPIDS + analyze $DESTDIR > $DESTDIR/analysis.txt + mktemplate > $DESTDIR/description.txt + [ "$NO_DESCRIPTION" ] || { + echo press enter to edit the problem description... + read junk + edittemplate $DESTDIR/description.txt + } + cd $DESTDIR/.. + tar czf $DESTDIR.tar.gz $DESTDIR/ + finalword + checksize +fi + +[ "$REMOVE_DEST" ] && + rm -r $DESTDIR diff --git a/tools/utillib.sh b/tools/utillib.sh new file mode 100644 index 0000000000..05e259120a --- /dev/null +++ b/tools/utillib.sh @@ -0,0 +1,384 @@ + # Copyright (C) 2007 Dejan Muhamedagic + # + # This program is free software; you can redistribute it and/or + # modify it under the terms of the GNU General Public + # License as published by the Free Software Foundation; either + # version 2.1 of the License, or (at your option) any later version. + # + # This software is distributed in the hope that it will be useful, + # but WITHOUT ANY WARRANTY; without even the implied warranty of + # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + # General Public License for more details. 
+ # + # You should have received a copy of the GNU General Public + # License along with this library; if not, write to the Free Software + # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + # + +# +# ha.cf/logd.cf parsing +# +getcfvar() { + [ -f $HA_CF ] || return + sed 's/#.*//' < $HA_CF | + grep -w "^$1" | + sed 's/^[^[:space:]]*[[:space:]]*//' +} +iscfvarset() { + test "`getcfvar \"$1\"`" +} +iscfvartrue() { + getcfvar "$1" | + egrep -qsi "^(true|y|yes|on|1)" +} +getnodes() { + getcfvar node +} + +# +# ssh +# +checksshuser() { + ssh -o Batchmode=yes $2@$1 true 2>/dev/null +} +trysshusers() { + n=$1 + shift 1 + for u; do + if checksshuser $n $u; then + echo $u + break + fi + done +} + +# +# logging +# +syslogmsg() { + severity=$1 + shift 1 + logtag="" + [ "$HA_LOGTAG" ] && logtag="-t $HA_LOGTAG" + logger -p ${HA_LOGFACILITY:-"daemon"}.$severity $logtag $* +} + +# +# find log destination +# +uselogd() { + iscfvartrue use_logd && + return 0 # if use_logd true + iscfvarset logfacility || + iscfvarset logfile || + iscfvarset debugfile || + return 0 # or none of the log options set + false +} +findlogdcf() { + for f in \ + `which strings > /dev/null 2>&1 && + strings $HA_BIN/ha_logd | grep 'logd\.cf'` \ + `for d; do echo $d/logd.cf $d/ha_logd.cf; done` + do + if [ -f "$f" ]; then + echo $f + return 0 + fi + done + return 1 +} +getlogvars() { + savecf=$HA_CF + if uselogd; then + [ -f "$LOGD_CF" ] || + fatal "could not find logd.cf or ha_logd.cf" + HA_CF=$LOGD_CF + fi + HA_LOGFACILITY=`getcfvar logfacility` + HA_LOGFILE=`getcfvar logfile` + HA_DEBUGFILE=`getcfvar debugfile` + HA_SYSLOGMSGFMT="" + iscfvartrue syslogmsgfmt && + HA_SYSLOGMSGFMT=1 + HA_CF=$savecf +} +findmsg() { + # this is tricky, we try a few directories + syslogdir="/var/log /var/logs /var/syslog /var/adm /var/log/ha /var/log/cluster" + favourites="ha-*" + mark=$1 + log="" + for d in $syslogdir; do + [ -d $d ] || continue + log=`fgrep -l "$mark" $d/$favourites` && break + 
log=`fgrep -l "$mark" $d/*` && break + done 2>/dev/null + echo $log +} + +# +# print a segment of a log file +# +str2time() { + perl -e "\$time='$*';" -e ' + eval "use Date::Parse"; + if (!$@) { + print str2time($time); + } else { + eval "use Date::Manip"; + if (!$@) { + print UnixDate(ParseDateString($time), "%s"); + } + } + ' +} +getstamp() { + if [ "$HA_SYSLOGMSGFMT" -o "$HA_LOGFACILITY" ]; then + awk '{print $1,$2,$3}' + else + awk '{print $2}' | sed 's/_/ /' + fi +} +linetime() { + l=`tail -n +$2 $1 | head -1 | getstamp` + str2time "$l" +} +findln_by_time() { + logf=$1 + tm=$2 + first=1 + last=`wc -l < $logf` + while [ $first -le $last ]; do + mid=$(((last+first)/2)) + tmid=`linetime $logf $mid` + if [ -z "$tmid" ]; then + warning "cannot extract time: $logf:$mid" + return + fi + if [ $tmid -gt $tm ]; then + last=$((mid-1)) + elif [ $tmid -lt $tm ]; then + first=$((mid+1)) + else + break + fi + done + echo $mid +} +dumplog() { + logf=$1 + from_time=$2 + to_time=$3 + from_line=`findln_by_time $logf $from_time` + if [ -z "$from_line" ]; then + warning "couldn't find line for time $from_time; corrupt log file?" + return + fi + tail -n +$from_line $logf | + if [ "$to_time" != 0 ]; then + to_line=`findln_by_time $logf $to_time` + if [ -z "$to_line" ]; then + warning "couldn't find line for time $to_time; corrupt log file?" + return + fi + head -$((to_line-from_line+1)) + else + cat + fi +} + +# +# find files newer than a and older than b +# +touchfile() { + t=`maketempfile` && + perl -e "\$file=\"$t\"; \$tm=$1;" -e 'utime $tm, $tm, $file;' && + echo $t +} +find_files() { + dir=$1 + from_time=$2 + to_time=$3 + from_stamp=`touchfile $from_time` + findexp="-newer $from_stamp" + if [ "$to_time" -a "$to_time" -gt 0 ]; then + to_stamp=`touchfile $to_time` + findexp="$findexp ! 
-newer $to_stamp" + fi + find $dir -type f $findexp + rm -f $from_stamp $to_stamp +} + +# +# coredumps +# +findbinary() { + random_binary=`which cat 2>/dev/null` # suppose we are lucky + binary=`gdb $random_binary $1 < /dev/null 2>/dev/null | + grep 'Core was generated' | awk '{print $5}' | + sed "s/^.//;s/[.']*$//"` + [ x = x"$binary" ] && return + fullpath=`which $binary 2>/dev/null` + if [ x = x"$fullpath" ]; then + [ -x $HA_BIN/$binary ] && echo $HA_BIN/$binary + else + echo $fullpath + fi +} +getbt() { + which gdb > /dev/null 2>&1 || { + warning "please install gdb to get backtraces" + return + } + for corefile; do + absbinpath=`findbinary $corefile` + [ x = x"$absbinpath" ] && return 1 + echo "====================== start backtrace ======================" + ls -l $corefile + gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt full"} -ex quit \ + $absbinpath $corefile 2>/dev/null + echo "======================= end backtrace =======================" + done +} + +# +# heartbeat configuration/status +# +iscrmrunning() { + crmadmin -D >/dev/null 2>&1 +} +dumpstate() { + crm_mon -1 | grep -v '^Last upd' > $1/crm_mon.txt + cibadmin -Ql > $1/cib.xml + ccm_tool -p > $1/ccm_tool.txt 2>&1 +} +getconfig() { + cp -p $HA_CF $1/ + [ -f $LOGD_CF ] && + cp -p $LOGD_CF $1/ + if iscrmrunning; then + dumpstate $1 + else + cp -p $HA_VARLIB/crm/cib.xml $1/ 2>/dev/null + fi + [ -f "$1/cib.xml" ] && + crm_verify -V -x $1/cib.xml >$1/crm_verify.txt 2>&1 +} + +# +# remove values of sensitive attributes +# +# this is not proper xml parsing, but it will work under the +# circumstances +sanitize_xml_attrs() { + sed $( + for patt in $SANITIZE; do + echo "-e /name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/" + done + ) +} +sanitize_hacf() { + awk ' + $1=="stonith_host"{ for( i=5; i<=NF; i++ ) $i="****"; } + {print} + ' +} +sanitize_one() { + file=$1 + compress="" + echo $file | grep -qs 'gz$' && compress=gzip + echo $file | grep -qs 'bz2$' && compress=bzip2 + if [ "$compress" ]; 
then + decompress="$compress -dc" + else + compress=cat + decompress=cat + fi + tmp=`maketempfile` && ref=`maketempfile` || + fatal "cannot create temporary files" + touch -r $file $ref # save the mtime + if [ "`basename $file`" = ha.cf ]; then + sanitize_hacf + else + $decompress | sanitize_xml_attrs | $compress + fi < $file > $tmp + mv $tmp $file + touch -r $ref $file + rm -f $ref +} + +# +# keep the user posted +# +fatal() { + echo "ERROR: $*" >&2 + exit 1 +} +warning() { + echo "WARN: $*" >&2 +} +info() { + echo "INFO: $*" >&2 +} +pickfirst() { + for x; do + which $x >/dev/null 2>&1 && { + echo $x + return 0 + } + done + return 1 +} + +# +# run a command everywhere +# +forall() { + c="$*" + for n in `getnodes`; do + if [ "$n" = "`uname -n`" ]; then + $c + else + if [ "$SSH_USER" ]; then + echo $c | ssh $SSH_OPTS $SSH_USER@$n + fi + fi + done +} + +# +# get some system info +# +distro() { + which lsb_release >/dev/null 2>&1 && { + lsb_release -d + return + } + relf=`ls /etc/debian_version 2>/dev/null` || + relf=`ls /etc/slackware-version 2>/dev/null` || + relf=`ls -d /etc/*-release 2>/dev/null` && { + for f in $relf; do + test -f $f && { + echo "`ls $f` `cat $f`" + return + } + done + } + warning "no lsb_release no /etc/*-release no /etc/debian_version" +} +hb_ver() { + which dpkg > /dev/null 2>&1 && { + dpkg-query -f '${Version}' -W heartbeat 2>/dev/null || + dpkg-query -f '${Version}' -W heartbeat-2 + return + } + which rpm > /dev/null 2>&1 && { + rpm -q --qf '%{version}' heartbeat + return + } + # more packagers? +} +crm_info() { + $HA_BIN/crmd version 2>&1 +}