diff --git a/crmd/control.c b/crmd/control.c index b006d6cc4d..ed0689d460 100644 --- a/crmd/control.c +++ b/crmd/control.c @@ -1,999 +1,1026 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include qb_ipcs_service_t *ipcs = NULL; extern gboolean crm_connect_corosync(crm_cluster_t * cluster); extern void crmd_ha_connection_destroy(gpointer user_data); void crm_shutdown(int nsig); gboolean crm_read_options(gpointer user_data); gboolean fsa_has_quorum = FALSE; crm_trigger_t *fsa_source = NULL; crm_trigger_t *config_read = NULL; static gboolean election_timeout_popped(gpointer data) { /* Not everyone voted */ crm_info("Election failed: Declaring ourselves the winner"); register_fsa_input(C_TIMER_POPPED, I_ELECTION_DC, NULL); return FALSE; } /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; static crm_cluster_t *cluster = NULL; if (cluster == NULL) { cluster = calloc(1, sizeof(crm_cluster_t)); } if (action & A_HA_DISCONNECT) { crm_cluster_disconnect(cluster); crm_info("Disconnected from the cluster"); set_bit(fsa_input_register, R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { crm_set_status_callback(&peer_update_callback); if (is_openais_cluster()) { #if SUPPORT_COROSYNC registered = crm_connect_corosync(cluster); #endif } else if (is_heartbeat_cluster()) { #if SUPPORT_HEARTBEAT cluster->destroy = crmd_ha_connection_destroy; cluster->hb_dispatch = crmd_ha_msg_callback; registered = crm_cluster_connect(cluster); fsa_cluster_conn = cluster->hb_conn; crm_trace("Be informed of Node Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_nstatus_callback(fsa_cluster_conn, crmd_ha_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set nstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } crm_trace("Be informed of CRM Client Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_cstatus_callback(fsa_cluster_conn, crmd_client_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set cstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } if (registered) { crm_trace("Requesting an initial dump of CRMD client_status"); fsa_cluster_conn->llc_ops->client_status(fsa_cluster_conn, NULL, CRM_SYSTEM_CRMD, -1); } #endif } fsa_election = election_init(NULL, cluster->uname, 60000/*60s*/, election_timeout_popped); fsa_our_uname = cluster->uname; fsa_our_uuid = cluster->uuid; if(cluster->uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } if (registered == FALSE) { 
set_bit(fsa_input_register, R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __FUNCTION__); clear_bit(fsa_input_register, R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ set_bit(fsa_input_register, R_SHUTDOWN); if (is_heartbeat_cluster()) { if (is_set(fsa_input_register, pe_subsystem->flag_connected)) { crm_info("Terminating the %s", pe_subsystem->name); if (stop_subsystem(pe_subsystem, TRUE) == FALSE) { /* its gone... */ crm_err("Faking %s exit", pe_subsystem->name); clear_bit(fsa_input_register, pe_subsystem->flag_connected); } else { crm_info("Waiting for subsystems to exit"); crmd_fsa_stall(FALSE); } } crm_info("All subsystems stopped, continuing"); } if (stonith_api) { /* Prevent it from comming up again */ clear_bit(fsa_input_register, R_ST_REQUIRED); crm_info("Disconnecting STONITH..."); stonith_api->cmds->disconnect(stonith_api); } } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; crm_info("Sending shutdown request to %s", crm_str(fsa_our_dc)); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); /* set_bit(fsa_input_register, R_STAYDOWN); */ if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } extern crm_ipc_t *attrd_ipc; extern char *max_generation_from; extern xmlNode *max_generation_xml; extern GHashTable *resource_history; extern GHashTable *voted; extern GHashTable *reload_hash; extern char *te_client_id; void log_connected_client(gpointer key, gpointer value, gpointer user_data); void log_connected_client(gpointer key, gpointer value, gpointer user_data) { crm_client_t *client = value; crm_err("%s is still connected at exit", crm_client_name(client)); } int crmd_fast_exit(int rc) { if (is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn: %d -> %d", rc, 100); rc = 100; } if (rc == pcmk_ok && is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); rc = pcmk_err_generic; } return crm_exit(rc); } int crmd_exit(int rc) { GListPtr gIter = NULL; GMainLoop *mloop = crmd_mainloop; static bool in_progress = FALSE; if(in_progress && rc == 0) { crm_debug("Exit is already in progress"); return rc; } else if(in_progress) { crm_notice("Error during shutdown process, terminating now: %s (%d)", pcmk_strerror(rc), rc); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(rc); } in_progress = TRUE; crm_trace("Preparing to exit: %d", rc); /* Suppress secondary errors resulting from us disconnecting everything */ set_bit(fsa_input_register, R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } if (attrd_ipc) { crm_trace("Closing attrd connection"); crm_ipc_close(attrd_ipc); crm_ipc_destroy(attrd_ipc); attrd_ipc = NULL; } if (pe_subsystem && pe_subsystem->client && pe_subsystem->client->ipcs) { crm_trace("Disconnecting Policy 
Engine"); qb_ipcs_disconnect(pe_subsystem->client->ipcs); } if(stonith_api) { crm_trace("Disconnecting fencing API"); clear_bit(fsa_input_register, R_ST_REQUIRED); stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if (rc == pcmk_ok && crmd_mainloop == NULL) { crm_debug("No mainloop detected"); rc = EPROTO; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if(rc != pcmk_ok) { crm_notice("Forcing immediate exit: %s (%d)", pcmk_strerror(rc), rc); crm_write_blackbox(SIGTRAP, NULL); return crmd_fast_exit(rc); } /* Clean up as much memory as possible for valgrind */ -#if SUPPORT_HEARTBEAT - if (fsa_cluster_conn) { - crm_trace("Disconnecting heartbeat"); - fsa_cluster_conn->llc_ops->delete(fsa_cluster_conn); - fsa_cluster_conn = NULL; - } -#endif - for (gIter = fsa_message_queue; gIter != NULL; gIter = gIter->next) { fsa_data_t *fsa_data = gIter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } clear_bit(fsa_input_register, R_MEMBERSHIP); g_list_free(fsa_message_queue); fsa_message_queue = NULL; free(pe_subsystem); pe_subsystem = NULL; free(te_subsystem); te_subsystem = NULL; free(cib_subsystem); cib_subsystem = NULL; if (reload_hash) { crm_trace("Destroying reload cache with %d members", g_hash_table_size(reload_hash)); g_hash_table_destroy(reload_hash); reload_hash = NULL; } election_fini(fsa_election); fsa_election = NULL; cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; verify_stopped(fsa_state, LOG_WARNING); clear_bit(fsa_input_register, R_LRM_CONNECTED); lrm_state_destroy_all(); /* This basically will not work, since mainloop has a reference to it */ mainloop_destroy_trigger(fsa_source); fsa_source = NULL; mainloop_destroy_trigger(config_read); config_read = NULL; mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; crm_client_cleanup(); crm_peer_destroy(); crm_timer_stop(transition_timer); crm_timer_stop(integration_timer); crm_timer_stop(finalization_timer); crm_timer_stop(election_trigger); election_timeout_stop(fsa_election); crm_timer_stop(shutdown_escalation_timer); crm_timer_stop(wait_timer); crm_timer_stop(recheck_timer); free(transition_timer); transition_timer = NULL; free(integration_timer); integration_timer = NULL; free(finalization_timer); finalization_timer = NULL; free(election_trigger); election_trigger = NULL; election_fini(fsa_election); free(shutdown_escalation_timer); shutdown_escalation_timer = NULL; free(wait_timer); wait_timer = NULL; free(recheck_timer); recheck_timer = NULL; free(fsa_our_dc_version); fsa_our_dc_version = NULL; free(fsa_our_uname); fsa_our_uname = NULL; free(fsa_our_uuid); fsa_our_uuid = NULL; free(fsa_our_dc); fsa_our_dc = NULL; free(te_uuid); te_uuid = NULL; free(te_client_id); te_client_id = NULL; free(fsa_pe_ref); fsa_pe_ref = NULL; free(failed_stop_offset); failed_stop_offset = NULL; free(failed_start_offset); failed_start_offset = NULL; free(max_generation_from); max_generation_from = NULL; free_xml(max_generation_xml); max_generation_xml = NULL; mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); mainloop_destroy_signal(SIGCHLD); if (mloop) { int lpc = 0; GMainContext 
*ctx = g_main_loop_get_context(crmd_mainloop); /* Don't re-enter this block */ crmd_mainloop = NULL; crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); while(g_main_context_pending(ctx) && lpc < 10) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); +#if SUPPORT_HEARTBEAT + /* Do this only after g_main_loop_quit(). + * + * This interface has been broken (incomplete) since it was introduced. + * ->delete() cleans up and frees most of it, but it does not + * actually remove and destroy the corresponding GSource, so the next + * prepare/check iteration would find a corrupt (because partially + * freed) GSource, and segfault. + * + * Apparently one was supposed to store the GSource as returned by + * G_main_add_ll_cluster(), and g_source_destroy() that "by hand". + * + * But no-one ever did this, not even in the old hb code when this was + * introduced. + * + * Note that fsa_cluster_conn was set as an "alias" to cluster->hb_conn + * in do_ha_control() right after crm_cluster_connect(), and only + * happens to still point at that object, because do_ha_control() does + * not reset it to NULL after crm_cluster_disconnect() above does + * reset cluster->hb_conn to NULL. + * Not sure if that's something to clean up, too. + * + * I'll try to fix this up in heartbeat proper, so ->delete + * will actually remove, and destroy, and unref, and free this thing. + * Doing so after g_main_loop_quit() is valid with both old, + * and eventually fixed heartbeat. + * + * If we introduce the "by hand" destroy/remove/unref, + * this may break again once heartbeat is fixed :-( + * + * -- Lars Ellenberg + */ + if (fsa_cluster_conn) { + crm_trace("Deleting heartbeat api object"); + fsa_cluster_conn->llc_ops->delete(fsa_cluster_conn); + fsa_cluster_conn = NULL; + } +#endif + /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); crm_trace("Done %d", rc); } /* Graceful */ return rc; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int exit_code = pcmk_ok; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { /* exit_code = pcmk_err_generic; */ log_level = LOG_ERR; exit_type = "forcefully"; exit_code = pcmk_err_generic; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the CRMd", fsa_action2string(action), exit_type); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); crmd_exit(exit_code); } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int was_error = 0; int interval = 1; /* seconds between DC heartbeats */ crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); fsa_source = mainloop_add_trigger(G_PRIORITY_HIGH, crm_fsa_trigger, NULL); config_read = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); transition_trigger = mainloop_add_trigger(G_PRIORITY_LOW, te_graph_trigger, NULL); crm_debug("Creating CIB and LRM objects"); fsa_cib_conn = cib_new(); lrm_state_init_local(); /* set up the timers */ transition_timer = calloc(1, sizeof(fsa_timer_t)); integration_timer = calloc(1, sizeof(fsa_timer_t)); finalization_timer = calloc(1,
sizeof(fsa_timer_t)); election_trigger = calloc(1, sizeof(fsa_timer_t)); shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t)); wait_timer = calloc(1, sizeof(fsa_timer_t)); recheck_timer = calloc(1, sizeof(fsa_timer_t)); interval = interval * 1000; if (election_trigger != NULL) { election_trigger->source_id = 0; election_trigger->period_ms = -1; election_trigger->fsa_input = I_DC_TIMEOUT; election_trigger->callback = crm_timer_popped; election_trigger->repeat = FALSE; } else { was_error = TRUE; } if (transition_timer != NULL) { transition_timer->source_id = 0; transition_timer->period_ms = -1; transition_timer->fsa_input = I_PE_CALC; transition_timer->callback = crm_timer_popped; transition_timer->repeat = FALSE; } else { was_error = TRUE; } if (integration_timer != NULL) { integration_timer->source_id = 0; integration_timer->period_ms = -1; integration_timer->fsa_input = I_INTEGRATED; integration_timer->callback = crm_timer_popped; integration_timer->repeat = FALSE; } else { was_error = TRUE; } if (finalization_timer != NULL) { finalization_timer->source_id = 0; finalization_timer->period_ms = -1; finalization_timer->fsa_input = I_FINALIZED; finalization_timer->callback = crm_timer_popped; finalization_timer->repeat = FALSE; /* for possible enabling... a bug in the join protocol left * a slave in S_PENDING while we think its in S_NOT_DC * * raising I_FINALIZED put us into a transition loop which is * never resolved. * in this loop we continually send probes which the node * NACK's because its in S_PENDING * * if we have nodes where heartbeat is active but the * CRM is not... then this will be handled in the * integration phase */ finalization_timer->fsa_input = I_ELECTION; } else { was_error = TRUE; } if (shutdown_escalation_timer != NULL) { shutdown_escalation_timer->source_id = 0; shutdown_escalation_timer->period_ms = -1; shutdown_escalation_timer->fsa_input = I_STOP; shutdown_escalation_timer->callback = crm_timer_popped; shutdown_escalation_timer->repeat = FALSE; } else { was_error = TRUE; } if (wait_timer != NULL) { wait_timer->source_id = 0; wait_timer->period_ms = 2000; wait_timer->fsa_input = I_NULL; wait_timer->callback = crm_timer_popped; wait_timer->repeat = FALSE; } else { was_error = TRUE; } if (recheck_timer != NULL) { recheck_timer->source_id = 0; recheck_timer->period_ms = -1; recheck_timer->fsa_input = I_PE_CALC; recheck_timer->callback = crm_timer_popped; recheck_timer->repeat = FALSE; } else { was_error = TRUE; } /* set up the sub systems */ cib_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); te_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); pe_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); if (cib_subsystem != NULL) { cib_subsystem->pid = -1; cib_subsystem->name = CRM_SYSTEM_CIB; cib_subsystem->flag_connected = R_CIB_CONNECTED; cib_subsystem->flag_required = R_CIB_REQUIRED; } else { was_error = TRUE; } if (te_subsystem != NULL) { te_subsystem->pid = -1; te_subsystem->name = CRM_SYSTEM_TENGINE; te_subsystem->flag_connected = R_TE_CONNECTED; te_subsystem->flag_required = R_TE_REQUIRED; } else { was_error = TRUE; } if (pe_subsystem != NULL) { pe_subsystem->pid = -1; pe_subsystem->path = CRM_DAEMON_DIR; pe_subsystem->name = CRM_SYSTEM_PENGINE; pe_subsystem->command = CRM_DAEMON_DIR "/" CRM_SYSTEM_PENGINE; pe_subsystem->args = NULL; pe_subsystem->flag_connected = R_PE_CONNECTED; pe_subsystem->flag_required = R_PE_REQUIRED; } else { was_error = TRUE; } if (was_error == FALSE && is_heartbeat_cluster()) { if (start_subsystem(pe_subsystem) == FALSE) 
{ was_error = TRUE; } } if (was_error) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } static int32_t crmd_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { crm_trace("Connection %p", c); if (crm_client_new(c, uid, gid) == NULL) { return -EIO; } return 0; } static void crmd_ipc_created(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); } static int32_t crmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; crm_client_t *client = crm_client_get(c); xmlNode *msg = crm_ipcs_recv(client, data, size, &id, &flags); crm_trace("Invoked: %s", crm_client_name(client)); crm_ipcs_send_ack(client, id, flags, "ack", __FUNCTION__, __LINE__); if (msg == NULL) { return 0; } #if ENABLE_ACL determine_request_user(client->user, msg, F_CRM_USER); #endif crm_trace("Processing msg from %s", crm_client_name(client)); crm_log_xml_trace(msg, "CRMd[inbound]"); crm_xml_add(msg, F_CRM_SYS_FROM, client->id); if (crmd_authorize_message(msg, client, NULL)) { route_message(C_IPC_MESSAGE, msg); } trigger_fsa(fsa_source); free_xml(msg); return 0; } static int32_t crmd_ipc_closed(qb_ipcs_connection_t * c) { crm_client_t *client = crm_client_get(c); struct crm_subsystem_s *the_subsystem = NULL; crm_trace("Connection %p", c); if (client->userdata == NULL) { crm_trace("Client hadn't registered with us yet"); } else if (strcasecmp(CRM_SYSTEM_PENGINE, client->userdata) == 0) { the_subsystem = pe_subsystem; } else if (strcasecmp(CRM_SYSTEM_TENGINE, client->userdata) == 0) { the_subsystem = te_subsystem; } else if (strcasecmp(CRM_SYSTEM_CIB, client->userdata) == 0) { the_subsystem = cib_subsystem; } if (the_subsystem != NULL) { the_subsystem->source = NULL; the_subsystem->client = NULL; crm_info("Received HUP from %s:[%d]", the_subsystem->name, the_subsystem->pid); } else { /* else that was a transient client */ crm_trace("Received HUP from transient client"); } crm_trace("Disconnecting client %s (%p)", crm_client_name(client), client); free(client->userdata); crm_client_destroy(client); trigger_fsa(fsa_source); return 0; } static void crmd_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { - if (is_heartbeat_cluster()) { - stop_subsystem(pe_subsystem, FALSE); - } - crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = crmd_ipc_accept, .connection_created = crmd_ipc_created, .msg_process = crmd_ipc_dispatch, .connection_closed = crmd_ipc_closed, .connection_destroyed = crmd_ipc_destroy }; if (cur_state != S_STARTING) { crm_err("Start cancelled... 
%s", fsa_state2string(cur_state)); return; } else if (is_set(fsa_input_register, R_MEMBERSHIP) == FALSE) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_LRM_CONNECTED) == FALSE) { crm_info("Delaying start, LRM not connected (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_READ_CONFIG) == FALSE) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_PEER_DATA) == FALSE) { /* try reading from HA */ crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { HA_Message *msg = NULL; crm_trace("Looking for a HA message"); msg = fsa_cluster_conn->llc_ops->readmsg(fsa_cluster_conn, 0); if (msg != NULL) { crm_trace("There was a HA message"); ha_msg_del(msg); } } #endif crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = crmd_ipc_server_init(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } if (stonith_reconnect == NULL) { int dummy; stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); } set_bit(fsa_input_register, R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); crm_notice("The local CRM is operational"); clear_bit(fsa_input_register, R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { set_bit(fsa_input_register, R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* *INDENT-OFF* */ pe_cluster_option crmd_opts[] = { /* name, old-name, validate, default, description */ { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." }, { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." }, { XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time", "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time." " To ensure these changes take effect, we can optionally poll the cluster's status for changes." 
}, { "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization, "The maximum amount of system resources that should be used by nodes in the cluster", "The cluster will slow down its recovery process when the amount of system resources used" " (currently CPU) approaches this limit", }, { "node-action-limit", "migration-limit", "integer", NULL, "0", &check_number, "The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"}, { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." }, #if SUPPORT_PLUGIN { XML_ATTR_EXPECTED_VOTES, NULL, "integer", NULL, "2", &check_number, "The number of nodes expected to be in the cluster", "Used to calculate quorum in openais based clusters." 
}, #endif }; /* *INDENT-ON* */ void crmd_metadata(void) { config_metadata("CRM Daemon", "1.0", "CRM Daemon Options", "This is a fake resource that details the options that can be configured for the CRM Daemon.", crmd_opts, DIMOF(crmd_opts)); } static void verify_crmd_options(GHashTable * options) { verify_all_options(options, crmd_opts, DIMOF(crmd_opts)); } static const char * crmd_pref(GHashTable * options, const char *name) { return get_cluster_pref(options, crmd_opts, DIMOF(crmd_opts), name); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_dtd_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); set_bit(fsa_input_register, R_STAYDOWN); } goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes(output, output, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now); verify_crmd_options(config_hash); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "node-action-limit"); /* Also checks migration-limit */ throttle_update_job_max(value); value = crmd_pref(config_hash, "load-threshold"); if(value) { throttle_load_target = strtof(value, NULL) / 100; } value = crmd_pref(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_get_msec(value); /* How long to declare an election over - even if not everyone voted */ crm_debug("Shutdown escalation occurs after: %dms", shutdown_escalation_timer->period_ms); value = crmd_pref(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); election_timeout_set_period(fsa_election, crm_get_msec(value)); value = crmd_pref(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_timer->period_ms = crm_get_msec(value); crm_debug("Checking for expired actions every %dms", recheck_timer->period_ms); value = crmd_pref(config_hash, "crmd-transition-delay"); transition_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-integration-timeout"); integration_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-finalization-timeout"); finalization_timer->period_ms = crm_get_msec(value); #if SUPPORT_COROSYNC if (is_classic_ais_cluster()) { value = crmd_pref(config_hash, XML_ATTR_EXPECTED_VOTES); crm_debug("Sending expected-votes=%s to corosync", value); send_cluster_text(crm_class_quorum, value, TRUE, NULL, crm_msg_ais); } #endif set_bit(fsa_input_register, R_READ_CONFIG); crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } gboolean crm_read_options(gpointer user_data) { int call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local); fsa_register_cib_callback(call_id, FALSE, NULL, config_query_callback); crm_trace("Querying the CIB... 
call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); mainloop_set_trigger(config_read); } void crm_shutdown(int nsig) { if (crmd_mainloop != NULL && g_main_is_running(crmd_mainloop)) { if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating the shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); } else { set_bit(fsa_input_register, R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); if (shutdown_escalation_timer->period_ms < 1) { const char *value = crmd_pref(NULL, XML_CONFIG_ATTR_FORCE_QUIT); int msec = crm_get_msec(value); crm_debug("Using default shutdown escalation: %dms", msec); shutdown_escalation_timer->period_ms = msec; } /* cant rely on this... */ crm_notice("Requesting shutdown, upper limit is %dms", shutdown_escalation_timer->period_ms); crm_timer_start(shutdown_escalation_timer); } } else { crm_info("exit from shutdown"); crmd_exit(pcmk_ok); } } diff --git a/crmd/pengine.c b/crmd/pengine.c index 2f3eba85b3..dde1b13692 100644 --- a/crmd/pengine.c +++ b/crmd/pengine.c @@ -1,298 +1,301 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include /* for access */ #include /* for calls to open */ #include /* for calls to open */ #include /* for calls to open */ #include /* for getpwuid */ #include /* for initgroups */ #include /* for getrlimit */ #include /* for getrlimit */ #include #include #include #include #include #include #include #include struct crm_subsystem_s *pe_subsystem = NULL; void do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data); static void save_cib_contents(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { char *id = user_data; register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __FUNCTION__); CRM_CHECK(id != NULL, return); if (rc == pcmk_ok) { int len = 15; char *filename = NULL; len += strlen(id); len += strlen(PE_STATE_DIR); filename = calloc(1, len); CRM_CHECK(filename != NULL, return); sprintf(filename, PE_STATE_DIR "/pe-core-%s.bz2", id); if (write_xml_file(output, filename, TRUE) < 0) { crm_err("Could not save CIB contents after PE crash to %s", filename); } else { crm_notice("Saved CIB contents after PE crash to %s", filename); } free(filename); } free(id); } static void pe_ipc_destroy(gpointer user_data) { - clear_bit(fsa_input_register, pe_subsystem->flag_connected); if (is_set(fsa_input_register, pe_subsystem->flag_required)) { int rc = pcmk_ok; char *uuid_str = crm_generate_uuid(); crm_crit("Connection to the Policy Engine failed (pid=%d, uuid=%s)", pe_subsystem->pid, uuid_str); /* *The PE died... 
* * Save the current CIB so that we have a chance of * figuring out what killed it. * * Delay raising the I_ERROR until the query below completes or * 5s is up, whichever comes first. * */ rc = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local); fsa_register_cib_callback(rc, FALSE, uuid_str, save_cib_contents); } else { + if (is_heartbeat_cluster()) { + stop_subsystem(pe_subsystem, FALSE); + } crm_info("Connection to the Policy Engine released"); } + clear_bit(fsa_input_register, pe_subsystem->flag_connected); pe_subsystem->pid = -1; pe_subsystem->source = NULL; pe_subsystem->client = NULL; mainloop_set_trigger(fsa_source); return; } static int pe_ipc_dispatch(const char *buffer, ssize_t length, gpointer userdata) { xmlNode *msg = string2xml(buffer); if (msg) { route_message(C_IPC_MESSAGE, msg); } free_xml(msg); return 0; } /* A_PE_START, A_PE_STOP, A_TE_RESTART */ void do_pe_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { struct crm_subsystem_s *this_subsys = pe_subsystem; long long stop_actions = A_PE_STOP; long long start_actions = A_PE_START; static struct ipc_client_callbacks pe_callbacks = { .dispatch = pe_ipc_dispatch, .destroy = pe_ipc_destroy }; if (action & stop_actions) { clear_bit(fsa_input_register, pe_subsystem->flag_required); mainloop_del_ipc_client(pe_subsystem->source); pe_subsystem->source = NULL; clear_bit(fsa_input_register, pe_subsystem->flag_connected); } if ((action & start_actions) && (is_set(fsa_input_register, R_PE_CONNECTED) == FALSE)) { if (cur_state != S_STOPPING) { set_bit(fsa_input_register, pe_subsystem->flag_required); pe_subsystem->source = mainloop_add_ipc_client(CRM_SYSTEM_PENGINE, G_PRIORITY_DEFAULT, 5 * 1024 * 1024 /* 5Mb */ , NULL, &pe_callbacks); if (pe_subsystem->source == NULL) { crm_warn("Setup of client connection failed, not adding channel to mainloop"); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); return; } /* if (is_openais_cluster()) { */ /* pe_subsystem->pid = pe_subsystem->ipc->farside_pid; */ /* } */ set_bit(fsa_input_register, pe_subsystem->flag_connected); } else { crm_info("Ignoring request to start %s while shutting down", this_subsys->name); } } } int fsa_pe_query = 0; char *fsa_pe_ref = NULL; /* A_PE_INVOKE */ void do_pe_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (AM_I_DC == FALSE) { crm_err("Not DC: No need to invoke the PE (anymore): %s", fsa_action2string(action)); return; } if (is_set(fsa_input_register, R_PE_CONNECTED) == FALSE) { if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Cannot shut down gracefully without the PE"); register_fsa_input_before(C_FSA_INTERNAL, I_TERMINATE, NULL); } else { crm_info("Waiting for the PE to connect"); crmd_fsa_stall(FALSE); register_fsa_action(A_PE_START); } return; } if (cur_state != S_POLICY_ENGINE) { crm_notice("No need to invoke the PE in state %s", fsa_state2string(cur_state)); return; } if (is_set(fsa_input_register, R_HAVE_CIB) == FALSE) { crm_err("Attempted to invoke the PE without a consistent copy of the CIB!"); /* start the join from scratch */ register_fsa_input_before(C_FSA_INTERNAL, I_ELECTION, NULL); return; } fsa_pe_query = fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local); crm_debug("Query %d: Requesting the current CIB: %s", fsa_pe_query, fsa_state2string(fsa_state)); /* Make sure any queued calculations are discarded */ 
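/* (Assumption about the surrounding logic: fsa_pe_ref holds the reference of the outstanding PECALC request; clearing it here should cause replies to any earlier query to be treated as stale.) */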
free(fsa_pe_ref); fsa_pe_ref = NULL; fsa_register_cib_callback(fsa_pe_query, FALSE, NULL, do_pe_invoke_callback); } void do_pe_invoke_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { int sent; xmlNode *cmd = NULL; if (rc != pcmk_ok) { crm_err("Can't retrieve the CIB: %s (call %d)", pcmk_strerror(rc), call_id); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __FUNCTION__); return; } else if (call_id != fsa_pe_query) { crm_trace("Skipping superseded CIB query: %d (current=%d)", call_id, fsa_pe_query); return; } else if (AM_I_DC == FALSE || is_set(fsa_input_register, R_PE_CONNECTED) == FALSE) { crm_debug("No need to invoke the PE anymore"); return; } else if (fsa_state != S_POLICY_ENGINE) { crm_debug("Discarding PE request in state: %s", fsa_state2string(fsa_state)); return; } else if (last_peer_update != 0) { crm_debug("Re-asking for the CIB: peer update %d still pending", last_peer_update); sleep(1); register_fsa_action(A_PE_INVOKE); return; } else if (fsa_state != S_POLICY_ENGINE) { crm_err("Invoking PE in state: %s", fsa_state2string(fsa_state)); return; } CRM_LOG_ASSERT(output != NULL); /* refresh our remote-node cache when the pengine is invoked */ crm_remote_peer_cache_refresh(output); crm_xml_add(output, XML_ATTR_DC_UUID, fsa_our_uuid); crm_xml_add_int(output, XML_ATTR_HAVE_QUORUM, fsa_has_quorum); if (ever_had_quorum && crm_have_quorum == FALSE) { crm_xml_add_int(output, XML_ATTR_QUORUM_PANIC, 1); } cmd = create_request(CRM_OP_PECALC, output, NULL, CRM_SYSTEM_PENGINE, CRM_SYSTEM_DC, NULL); free(fsa_pe_ref); fsa_pe_ref = crm_element_value_copy(cmd, XML_ATTR_REFERENCE); sent = crm_ipc_send(mainloop_get_ipc_client(pe_subsystem->source), cmd, 0, 0, NULL); if (sent <= 0) { crm_err("Could not contact the pengine: %d", sent); register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __FUNCTION__); } crm_debug("Invoking the PE: query=%d, ref=%s, seq=%llu, quorate=%d", fsa_pe_query, fsa_pe_ref, crm_peer_seq, fsa_has_quorum); free_xml(cmd); }
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt
new file mode 100644
index 0000000000..e33f5cb6a0
--- /dev/null
+++ b/doc/Pacemaker_Remote/en-US/Ch-Baremetal-Tutorial.txt
@@ -0,0 +1,251 @@
+= Baremetal Walk-through =
+
++What this tutorial is:+ This tutorial is an in-depth walk-through of how to get pacemaker to integrate a baremetal remote-node into the cluster as a node capable of running cluster resources.
+
++What this tutorial is not:+ This tutorial is not a realistic deployment scenario. The steps shown here are meant to get users familiar with the concept of remote-nodes as quickly as possible.
+
+== Step 1: Setup ==
+
+This tutorial requires three machines: two to act as cluster-nodes and a third to act as the baremetal remote-node.
+
+This tutorial was tested using Fedora 18 on both the cluster-nodes and the baremetal remote-node, but anything capable of running pacemaker v1.1.11 or greater will do. An installation guide for Fedora 18 can be found at http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/.
+
+The following host preparation steps assume Fedora 18 or a similar distro.
+
+=== SElinux and Firewall Considerations ===
+In order to simplify this tutorial we will disable selinux and the firewall on all the nodes. +WARNING:+ These actions pose a significant security threat to machines exposed to the outside world.
+
+[source,C]
+----
+# setenforce 0
+# sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config
+# firewall-cmd --add-port 3121/tcp --permanent
+# systemctl disable iptables.service
+# systemctl disable ip6tables.service
+# rm '/etc/systemd/system/basic.target.wants/iptables.service'
+# rm '/etc/systemd/system/basic.target.wants/ip6tables.service'
+# systemctl stop iptables.service
+# systemctl stop ip6tables.service
+----
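+
+If you would like to confirm the relaxed settings before continuing, something like the following should do it (a quick sketch assuming firewalld is in use; the port only shows up in the permanent configuration until the firewall is reloaded):
+
+[source,C]
+----
+# getenforce
+Permissive
+# firewall-cmd --permanent --list-ports
+3121/tcp
+----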
+
+=== Setup Pacemaker Remote on Baremetal remote-node ===
+
+On the baremetal remote-node machine run these commands to generate an authkey and copy it to the /etc/pacemaker folder.
+
+[source,C]
+----
+# mkdir /etc/pacemaker
+# dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
+----
+
+Make sure to distribute this key to both of the cluster-nodes as well. All the nodes must have the same /etc/pacemaker/authkey installed for the communication to work correctly.
+
+Now install and start the pacemaker_remote daemon on the baremetal remote-node.
+
+[source,C]
+----
+# yum install -y pacemaker-remote resource-agents pcs
+# systemctl enable pacemaker_remote.service
+# systemctl start pacemaker_remote.service
+----
+
+Verify the start is successful.
+
+[source,C]
+----
+# systemctl status pacemaker_remote
+
+  pacemaker_remote.service - Pacemaker Remote Service
+    Loaded: loaded (/usr/lib/systemd/system/pacemaker_remote.service; enabled)
+    Active: active (running) since Thu 2013-03-14 18:24:04 EDT; 2min 8s ago
+  Main PID: 1233 (pacemaker_remot)
+    CGroup: name=systemd:/system/pacemaker_remote.service
+            └─1233 /usr/sbin/pacemaker_remoted
+
+  Mar 14 18:24:04 remote1 systemd[1]: Starting Pacemaker Remote Service...
+  Mar 14 18:24:04 remote1 systemd[1]: Started Pacemaker Remote Service.
+  Mar 14 18:24:04 remote1 pacemaker_remoted[1233]: notice: lrmd_init_remote_tls_server: Starting a tls listener on port 3121.
+----
+
+=== Verify cluster-node Connection to baremetal-node ===
+
+Before moving forward, it's worth verifying that the cluster-nodes can contact the baremetal remote-node on port 3121. Here's a trick you can use: connect using telnet from each of the cluster-nodes. The connection will get destroyed, but how it is destroyed tells you whether it worked or not.
+
+First add the baremetal remote-node's hostname (we're using remote1 in this tutorial) to the cluster-nodes' /etc/hosts files if you haven't already. This is required unless you have DNS set up so that remote1's address can be resolved.
+
+Execute the following on each cluster-node, replacing the IP address with the actual IP address of the baremetal remote-node.
+[source,C]
+----
+# cat << END >> /etc/hosts
+192.168.122.10    remote1
+END
+----
+
+If running the telnet command on one of the cluster-nodes results in this output before disconnecting, the connection works.
+[source,C]
+----
+# telnet remote1 3121
+ Trying 192.168.122.10...
+ Connected to remote1.
+ Escape character is '^]'.
+ Connection closed by foreign host.
+----
+
+If you see this, the connection is not working.
+[source,C]
+----
+# telnet remote1 3121
+Trying 192.168.122.10...
+telnet: connect to address 192.168.122.10: No route to host
+----
+
+Once you can successfully connect to the baremetal remote-node from both cluster-nodes, move on to setting up pacemaker on the cluster-nodes.
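+
+If telnet is not available, a similar check can be done with nc (a sketch; it assumes an nc build that supports the -z flag, such as OpenBSD netcat):
+
+[source,C]
+----
+# nc -z remote1 3121 && echo "port 3121 reachable"
+port 3121 reachable
+----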
+
+=== Install cluster-node Software ===
+
+On the two cluster-nodes install the following packages.
+
+[source,C]
+----
+# yum install -y pacemaker corosync pcs resource-agents
+----
+
+=== Setup Corosync on cluster-nodes ===
+
+On one of the cluster nodes, execute the following.
+
+[source,C]
+----
+# export corosync_addr=`ip addr | grep "inet " | tail -n 1 | awk '{print $4}' | sed s/255/0/g`
+----
+
+Display the address and verify that it is correct.
+
+[source,C]
+----
+# echo $corosync_addr
+----
+
+In many cases the address will be 192.168.1.0 if you are behind a standard home router.
+
+Now copy over the example corosync.conf. The commands below inject your bind address and enable the votequorum API, which pacemaker requires.
+
+[source,C]
+----
+# cp /etc/corosync/corosync.conf.example /etc/corosync/corosync.conf
+# sed -i.bak "s/.*\tbindnetaddr:.*/bindnetaddr:\ $corosync_addr/g" /etc/corosync/corosync.conf
+# cat << END >> /etc/corosync/corosync.conf
+quorum {
+        provider: corosync_votequorum
+        expected_votes: 2
+        two_node: 1
+}
+END
+----
+
+Make sure to copy the newly created /etc/corosync/corosync.conf file to the second cluster-node before continuing.
+
+=== Start Pacemaker on cluster-nodes ===
+
+Start the cluster stack on both cluster nodes using the following command.
+
+[source,C]
+----
+# pcs cluster start
+----
+
+Verify corosync membership.
+
+[source,C]
+----
+# pcs status corosync
+
+Membership information
+    Nodeid      Votes  Name
+1795270848          1  node1 (local)
+----
+
+Verify pacemaker status. At first the 'pcs status' output will look like this.
+
+[source,C]
+----
+# pcs status
+
+ Last updated: Thu Mar 14 12:26:00 2013
+ Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host
+ Stack: corosync
+ Current DC:
+ Version: 1.1.11
+ 1 Nodes configured, unknown expected votes
+ 0 Resources configured.
+----
+
+After about a minute you should see your two cluster-nodes come online.
+
+[source,C]
+----
+# pcs status
+
+ Last updated: Thu Mar 14 12:28:23 2013
+ Last change: Thu Mar 14 12:25:55 2013 via crmd on node1
+ Stack: corosync
+ Current DC: node1 (1795270848) - partition with quorum
+ Version: 1.1.11
+ 2 Nodes configured, unknown expected votes
+ 0 Resources configured.
+
+ Online: [ node1 node2 ]
+----
+
+For the sake of this tutorial, we are going to disable stonith to avoid having to cover fencing device configuration.
+
+[source,C]
+----
+# pcs property set stonith-enabled=false
+----
+
+=== Integrate Baremetal remote-node into Cluster ===
+
+Integrating a baremetal remote-node into the cluster is achieved through the creation of a remote-node connection resource. The remote-node connection resource both establishes the connection to the remote-node and defines that the remote-node exists. Note that this resource is actually internal to Pacemaker's crmd component. A metadata file describing the available options can be found at /usr/lib/ocf/resource.d/pacemaker/remote, but there is no actual ocf:pacemaker:remote resource agent script that performs any work.
+
+Define the remote-node connection resource for our baremetal remote-node, remote1, using the following command.
+
+[source,C]
+----
+# pcs resource create remote1 ocf:pacemaker:remote
+----
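+
+The connection resource also accepts a few instance attributes, which are described in the metadata file mentioned above. For example, if the remote-node's resolvable address differed from the resource name, or pacemaker_remote listened on a non-default port, they could be given explicitly (the values below are illustrative):
+
+[source,C]
+----
+# pcs resource create remote1 ocf:pacemaker:remote server=192.168.122.10 port=3121
+----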
+
+That's it. After a moment you should see the remote-node come online.
+
+[source,C]
+----
+Last updated: Fri Oct 18 18:47:21 2013
+Last change: Fri Oct 18 18:46:14 2013 via cibadmin on node1
+Stack: corosync
+Current DC: node1 (1) - partition with quorum
+Version: 1.1.11
+3 Nodes configured
+1 Resources configured
+
+Online: [ node1 node2 ]
+RemoteOnline: [ remote1 ]
+
+remote1 (ocf::pacemaker:remote): Started node1
+----
+
+=== Starting Resources on baremetal remote-node ===
+
++"Warning: Never involve a remote-node connection resource in a resource group, colocation, or order constraint"+
+
+Once the baremetal remote-node is integrated into the cluster, starting resources on it works exactly the same way as on cluster-nodes. Refer to the Clusters from Scratch document for examples of resource creation. http://clusterlabs.org/doc/
+
+=== Fencing baremetal remote-nodes ===
+
+The cluster understands how to fence baremetal remote-nodes and can use standard fencing devices to do so. No special considerations are required. Note however that remote-nodes can never initiate a fencing action. Only cluster-nodes are capable of actually executing the fencing operation on another node.
+
+=== Accessing Cluster Tools from a Baremetal remote-node ===
+
+Besides allowing the cluster to manage resources on a remote-node, pacemaker_remote has one other trick. +The pacemaker_remote daemon allows nearly all the pacemaker tools (crm_resource, crm_mon, crm_attribute, crm_master) to work on remote nodes natively.+
+
+Try it: run +crm_mon+ or +pcs status+ on the baremetal node after pacemaker has integrated the remote-node into the cluster. These tools just work. This means resource agents, such as master/slave resources that need access to tools like crm_master, work seamlessly on the remote-nodes.
+
diff --git a/doc/Pacemaker_Remote/en-US/Ch-Example.txt b/doc/Pacemaker_Remote/en-US/Ch-Example.txt
index ca94044945..5db250f551 100644
--- a/doc/Pacemaker_Remote/en-US/Ch-Example.txt
+++ b/doc/Pacemaker_Remote/en-US/Ch-Example.txt
@@ -1,107 +1,107 @@
-= Quick Example =
+= KVM Remote-node Quick Example =

If you already know how to use pacemaker, you'll likely be able to grasp this new concept of remote-nodes by reading through this quick example without having to sort through all the detailed walk-through steps. Here are the key configuration ingredients that make this possible using libvirt and KVM virtual guests. These steps strip everything down to the very basics.

== Mile High View of Configuration Steps ==

* +Put an authkey with this path, /etc/pacemaker/authkey, on every cluster-node and virtual machine+. This secures remote communication and authentication.

Run this command if you want to make a somewhat random authkey.

[source,C]
----
dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1
----

-* +Install pacemaker_remote packages every virtual machine, enable pacemaker_remote on startup, and poke hole in firewall for tcp port 3121.+
+* +Install pacemaker_remote packages on every virtual machine, enable pacemaker_remote on startup, and poke a hole in the firewall for TCP port 3121.+

[source,C]
----
yum install pacemaker-remote resource-agents
systemctl enable pacemaker_remote
# If you just want to see this work, disable iptables and ip6tables on most distros.
# You may have to put selinux in permissive mode as well for the time being.
firewall-cmd --add-port 3121/tcp --permanent
----

* +Give each virtual machine a static network address and unique hostname+

* +Tell pacemaker to launch a virtual machine and that the virtual machine is a remote-node capable of running resources by using the "remote-node" meta-attribute.+

with pcs

[source,C]
----
# pcs resource create vm-guest1 VirtualDomain hypervisor="qemu:///system" config="vm-guest1.xml" meta +remote-node=guest1+
----

raw xml

[source,XML]
----
----

In the example above the meta-attribute 'remote-node=guest1' tells pacemaker that this resource is a remote-node with the hostname 'guest1' that is capable of being integrated into the cluster. The cluster will attempt to contact the virtual machine's pacemaker_remote service at the hostname 'guest1' after it launches.

== What those steps just did ==

Those steps just told pacemaker to launch a virtual machine called vm-guest1 and integrate that virtual machine as a remote-node called 'guest1'.

Example crm_mon output after guest1 is integrated into cluster.

[source,C]
----
Last updated: Wed Mar 13 13:52:39 2013
Last change: Wed Mar 13 13:25:17 2013 via crmd on node1
Stack: corosync
Current DC: node1 (24815808) - partition with quorum
Version: 1.1.10
2 Nodes configured, unknown expected votes
2 Resources configured.

Online: [ node1 guest1]

vm-guest1 (ocf::heartbeat:VirtualDomain): Started node1
----

Now, you could place a resource, such as a webserver, on guest1.

[source,C]
----
# pcs resource create webserver apache params configfile=/etc/httpd/conf/httpd.conf op monitor interval=30s
# pcs constraint location webserver prefers guest1
----

Now the crm_mon output would show a webserver launched on the guest1 remote-node.

[source,C]
----
Last updated: Wed Mar 13 13:52:39 2013
Last change: Wed Mar 13 13:25:17 2013 via crmd on node1
Stack: corosync
Current DC: node1 (24815808) - partition with quorum
Version: 1.1.10
2 Nodes configured, unknown expected votes
2 Resources configured.

Online: [ node1 guest1]

vm-guest1 (ocf::heartbeat:VirtualDomain): Started node1
webserver (ocf::heartbeat:apache): Started guest1
----

== Accessing Cluster from Remote-node ==

It is worth noting that after 'guest1' is integrated into the cluster, all the pacemaker cli tools immediately become available to the remote node. This means things like crm_mon, crm_resource, and crm_attribute will work natively on the remote-node as long as the connection between the remote-node and cluster-node exists. This is particularly important for any master/slave resources executing on the remote-node that need access to crm_master to set the node's transient attributes.
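
For example, a master/slave resource agent running on guest1 can adjust its own promotion preference from within its monitor action exactly as it would on a cluster-node (the score of 100 is illustrative):

[source,C]
----
crm_master -l reboot -v 100
----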

diff --git a/doc/Pacemaker_Remote/en-US/Ch-Future.txt b/doc/Pacemaker_Remote/en-US/Ch-Future.txt
index 93c082f328..2c493067e6 100644
--- a/doc/Pacemaker_Remote/en-US/Ch-Future.txt
+++ b/doc/Pacemaker_Remote/en-US/Ch-Future.txt
@@ -1,15 +1,16 @@
= Future Features =

Basic KVM and Linux container integration was the first phase of development for pacemaker_remote and was completed for Pacemaker v1.1.10. Here are some planned features that expand upon this initial functionality.

== Libvirt Sandbox Support ==

Once the libvirt-sandbox project is integrated with pacemaker_remote, we will gain the ability to perform per-resource Linux container isolation with very little performance impact. This functionality will allow resources living on a single node to be isolated from one another. At that point CPU and memory limits could be set per-resource dynamically just using the cluster config.

== Bare-metal Support ==

++"This feature has already been introduced into Pacemaker's master github branch and is scheduled for Pacemaker v1.1.11"+

The pacemaker_remote daemon already has the ability to run on bare-metal hardware nodes, but the policy engine logic for integrating bare-metal nodes is not complete. There are some complications involved with understanding a bare-metal node's state that virtual nodes don't have. Once this logic is complete, pacemaker will be able to integrate bare-metal nodes in the same way virtual remote-nodes currently are. Some special considerations for fencing will need to be addressed.

== KVM Migration Support ==

-Pacemaker's policy engine is limited in its ability to perform live migrations of KVM resources when resource dependencies are involved. This limitation affects how resources living within a KVM remote-node are handled when a live migration takes place. Currently when a live migration is performed on a KVM remote-node, all the resources within that remote-node have to be stopped before the migration takes place and started once again after migration has finished. This policy engine limitation is fully explained in this bug report, http://bugs.clusterlabs.org/show_bug.cgi?id=5055#c3
+Pacemaker's policy engine is limited in its ability to perform live migrations of KVM resources when resource dependencies are involved. This limitation affects how resources living within a KVM remote-node are handled when a live migration takes place. Currently when a live migration is performed on a KVM remote-node, all the resources within that remote-node have to be stopped before the migration takes place and started once again after migration has finished. This policy engine limitation is fully explained in this bug report, http://bugs.clusterlabs.org/show_bug.cgi?id=5055#c3

diff --git a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt
index c7b3001f38..d8699b3acf 100644
--- a/doc/Pacemaker_Remote/en-US/Ch-Intro.txt
+++ b/doc/Pacemaker_Remote/en-US/Ch-Intro.txt
@@ -1,55 +1,85 @@
= Extending High Availability Cluster into Virtual Nodes =

== Overview ==

The recent addition of the +pacemaker_remote+ service supported by +Pacemaker version 1.1.10 and greater+ allows nodes not running the cluster stack (pacemaker+corosync) to integrate into the cluster and have the cluster manage their resources just as if they were a real cluster node. This means that pacemaker clusters are now capable of both launching virtual environments (KVM/LXC) and managing the resources that live within those virtual environments, without requiring the virtual environments to run pacemaker or corosync.
+
== Terms ==

-+cluster-node+ - A baremetal hardware node running the High Availability stack (pacemaker + corosync)
++cluster-node+ - A node running the High Availability stack (pacemaker + corosync)
+
++remote-node+ - A node running pacemaker_remote without the rest of the High Availability stack. There are two types of remote-nodes: container and baremetal.
+
++container+ - A pacemaker resource that contains additional resources. For example, a KVM virtual machine resource that contains a webserver resource.
+
++container remote-node+ - A virtual guest remote-node running the pacemaker_remote service. This describes a specific remote-node use case where a virtual guest resource managed by the cluster is both started by the cluster and integrated into the cluster as a remote-node.
-+remote-node+ - A virtual guest node running the pacemaker_remote service.
++baremetal+ - Term used to describe an environment that is not virtualized.
+
++baremetal remote-node+ - A baremetal hardware node running pacemaker_remote. This describes a specific remote-node use case where a hardware node not running the High Availability stack is integrated into the cluster as a remote-node through the use of pacemaker_remote.

-+pacemaker_remote+ - A service daemon capable of performing remote application management within virtual guests (kvm and lxc) in both pacemaker cluster environments and standalone (non-cluster) environments. This service is an enhanced version of pacemaker's local resource manage daemon (LRMD) that is capable of managing and monitoring LSB, OCF, upstart, and systemd resources on a guest remotely. It also allows for most of pacemaker's cli tools (crm_mon, crm_resource, crm_master, crm_attribute, ect..) to work natively on remote-nodes.
++pacemaker_remote+ - A service daemon capable of performing remote application management within guest nodes (baremetal, kvm, and lxc) in both pacemaker cluster environments and standalone (non-cluster) environments. This service is an enhanced version of pacemaker's local resource manager daemon (LRMD) that is capable of managing and monitoring LSB, OCF, upstart, and systemd resources on a guest remotely. It also allows for most of pacemaker's cli tools (crm_mon, crm_resource, crm_master, crm_attribute, etc.) to work natively on remote-nodes.

+LXC+ - A Linux Container defined by the libvirt-lxc Linux container driver. http://libvirt.org/drvlxc.html

+== Version Info ==
+
+This feature is in ongoing development.
+
++Pacemaker v1.1.10+
+
+* Initial pacemaker_remote daemon and integration support.
+* Only supports pacemaker in KVM/LXC environments.
+* pacemaker_remote daemon unit test suite.
+* Known bugs (likely resolved if you have received a 1.1.10.x point release) include errors when setting remote-node attributes, failures when stopping orphaned (deleted from the cib while running) remote-nodes, and broken remote-node usage in asymmetric clusters.
+
++Currently in Master github branch and scheduled for Pacemaker v1.1.11+
+
+* Baremetal remote-node support.
+* Improvements to scaling remote-node integration. Performance testing here included 16 cluster nodes running 64 remote-nodes living in LXC containers. As part of this testing, several performance enhancements were introduced into the integration code.
+* CTS tests, RemoteLXC and RemoteBaremetal, which allow us to perform automated verification of pacemaker_remote integration.
+* Fixes for known bugs in the 1.1.10 release.
+
== Virtual Machine Use Case ==

The use of pacemaker_remote in virtual machines solves a deployment scenario that has traditionally been difficult to solve.

+"I want a pacemaker cluster to manage virtual machine resources, but I also want pacemaker to be able to manage the resources that live within those virtual machines."+

In the past, users desiring this deployment had to make a decision. They would either have to sacrifice the ability to monitor resources residing within virtual guests by running the cluster stack on the baremetal nodes, or run another cluster instance on the virtual guests, where they potentially run into corosync scalability issues. There is a third scenario, where the virtual guests run the cluster stack and join the same network as the baremetal nodes, but that can quickly hit issues with scalability as well.
With the pacemaker_remote service we have a new option. * The baremetal cluster-nodes run the cluster stack (pacemaker+corosync). * The virtual remote-nodes run the pacemaker_remote service (nearly zero configuration required on the virtual machine side). * The cluster stack on the cluster-nodes launches the virtual machines and immediately connects to the pacemaker_remote service, allowing the virtual machines to integrate into the cluster just as if they were real cluster-nodes. -The key difference here between the virtual machine remote-nodes and the cluster-nodes is that the remote-nodes are not running the cluster stack. This means the remote nodes will never become the DC, and they do not take place in quorum. On the hand this also means that the remote-nodes are not bound to the scalability limits associated with the cluster stack either. +No 16 node corosync member limits+ to deal with. That isn't to say remote-nodes can scale indefinitely, but the expectation is that remote-nodes scale horizontally much further than cluster-nodes. Other than the quorum limitation, these remote-nodes behave just like cluster nodes in respects to resource management. The cluster is fully capable of managing and monitoring resources on each remote-node. You can build constraints against remote-nodes, put them in standby, or whatever else you'd expect to be able to do with normal cluster-nodes. They even show up in the crm_mon output as you would expect cluster-nodes to. +The key difference here between the virtual machine remote-nodes and the cluster-nodes is that the remote-nodes are not running the cluster stack. This means the remote-nodes will never become the DC, and they do not take part in quorum. On the other hand this also means that remote-nodes are not bound to the scalability limits associated with the cluster stack either. +No 16 node corosync member limits+ to deal with. That isn't to say remote-nodes can scale indefinitely, but it is known that remote-nodes scale horizontally much further than cluster-nodes. Other than the quorum limitation, these remote-nodes behave just like cluster-nodes with respect to resource management. The cluster is fully capable of managing and monitoring resources on each remote-node. You can build constraints against remote-nodes, put them in standby, or whatever else you'd expect to be able to do with normal cluster-nodes. They even show up in the crm_mon output as you would expect cluster-nodes to. -To solidify the concept, an example cluster deployment integrating remote-nodes could look like this. +To solidify the concept, below is an example deployment that is very similar to an actual deployment we test in our developer environment to verify remote-node scalability. * 16 cluster-nodes running the corosync+pacemaker stack. * 64 pacemaker-managed virtual machine resources running pacemaker_remote, configured as remote-nodes. * 64 pacemaker-managed webserver and database resources configured to run on the 64 remote-nodes. -With this deployment you would have 64 webservers and databases running on 64 virtual machines on 16 hardware nodes all of which are managed and monitored by the same pacemaker deployment. +With this deployment you would have 64 webservers and databases running on 64 virtual machines on 16 hardware nodes, all of which are managed and monitored by the same pacemaker deployment. It is known that pacemaker_remote can scale to deployments of this size and possibly much further, depending on the specific scenario.
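+
+To make the integration concrete, here is a minimal sketch (not part of the walk-throughs; the resource name "vm-guest1", the libvirt config path, the remote-node name "guest1", and the "webserver" resource are all hypothetical). A cluster-managed virtual machine becomes a remote-node through a single meta-attribute, and can then be used like any other node, for example as a constraint target:
+
+[source,C]
+----
+# pcs resource create vm-guest1 VirtualDomain hypervisor="qemu:///system" config="/etc/libvirt/qemu/vm-guest1.xml" meta remote-node=guest1
+# pcs constraint location webserver prefers guest1
+----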
+ +== Baremetal remote-node Use Case == + ++"I want my traditional High Availability cluster to scale beyond the limits imposed by the corosync messaging layer."+ + +Ultimately the primary advantage of baremetal remote-nodes over traditional nodes running the Corosync+Pacemaker stack is scalability. There are likely some other use cases related to geographically distributed HA clusters that baremetal remote-nodes may serve a purpose in, but those use cases are not well understood at this point. The only things baremetal remote-nodes cannot do that cluster-nodes can are take part in cluster quorum and execute fencing agents via stonith. That is not to say, however, that fencing of a baremetal node works any differently than that of a normal cluster-node. The Pacemaker policy engine understands how to fence baremetal remote-nodes. As long as a fencing device exists, the cluster is capable of ensuring baremetal nodes are fenced in the exact same way as normal cluster-nodes are fenced. == Linux Container Use Case == +"I want to isolate and limit the system resources (cpu, memory, filesystem) a cluster resource can consume without using virtual machines."+ Using pacemaker_remote with Linux containers (libvirt-lxc) opens up some interesting possibilities for isolating resources in the cluster without the use of a hypervisor. We now have the ability both to define a contained environment with cpu and memory utilization limits and to assign resources to that environment, all managed from within pacemaker. The LXC Walk-through section of this document outlines how pacemaker_remote can be used to bring Linux containers into the cluster as remote-nodes capable of executing resources. == Expanding the Cluster Stack == === Traditional HA Stack === image::images/pcmk-ha-cluster-stack.png["The Traditional Pacemaker Corosync HA Stack.",width="17cm",height="9cm",align="center"] - -=== Remote-Node Enabled HA Stack === - -The stack grows one additional layer vertical so we can go further horizontal. +=== Remote-Node Enabled HA Stack Using Virtual Guest Nodes === image::images/pcmk-ha-remote-stack.png["Placing Pacemaker Remote into the Traditional HA Stack.",width="20cm",height="10cm",align="center"] diff --git a/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt b/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt index c3459c086a..f6be9cec0a 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-LXC-Tutorial.txt @@ -1,328 +1,330 @@ = Linux Container (LXC) Walk-through = ++Warning: Continued development in the VirtualDomain agent, libvirt, and the lxc_autogen script has rendered this tutorial (in its current form) obsolete.+ The high-level approach of this tutorial remains accurate, but many of the specifics related to configuring the lxc environment have changed. This walk-through needs to be updated to reflect the current tested methodology. + +What this tutorial is:+ This tutorial demonstrates how pacemaker_remote can be used with Linux containers (managed by libvirt-lxc) to run cluster resources in an isolated environment. +What this tutorial is not:+ This tutorial is not a realistic deployment scenario. The steps shown here are meant to introduce users to the concept of managing Linux container environments with Pacemaker. == Step 1: Setup LXC Host == This tutorial was tested with Fedora 18, but anything capable of running libvirt and pacemaker v1.1.10 or greater will do.
An installation guide for Fedora 18 can be found here: http://docs.fedoraproject.org/en-US/Fedora/18/html/Installation_Guide/. The following are the Fedora 18 (or similar distro) host preparation steps. === SElinux and Firewall Rules === In order to simplify this tutorial, we will disable SELinux and the firewall on the host. WARNING: These actions pose significant security issues to machines exposed to the outside world. Basically, just don't do this on your production system. [source,C] ---- # setenforce 0 # sed -i.bak "s/SELINUX=enforcing/SELINUX=permissive/g" /etc/selinux/config # firewall-cmd --add-port 3121/tcp --permanent # systemctl disable iptables.service # systemctl disable ip6tables.service # rm '/etc/systemd/system/basic.target.wants/iptables.service' # rm '/etc/systemd/system/basic.target.wants/ip6tables.service' # systemctl stop iptables.service # systemctl stop ip6tables.service ---- === Install Cluster Software on Host === [source,C] ---- # yum install -y pacemaker pacemaker-remote corosync pcs resource-agents ---- === Configure Corosync === Running the command below will attempt to detect the network address corosync should bind to. [source,C] ---- # export corosync_addr=`ip addr | grep "inet " | tail -n 1 | awk '{print $4}' | sed s/255/0/g` ---- Display and verify that the address is correct. [source,C] ---- # echo $corosync_addr ---- In most cases the address will be 192.168.1.0 if you are behind a standard home router. Now copy over the example corosync.conf. This code will inject your bindaddress and enable the votequorum api, which is required by pacemaker. [source,C] ---- # cp /etc/corosync/corosync.conf.example /etc/corosync/corosync.conf # sed -i.bak "s/.*\tbindnetaddr:.*/bindnetaddr:\ $corosync_addr/g" /etc/corosync/corosync.conf # cat << END >> /etc/corosync/corosync.conf quorum { provider: corosync_votequorum expected_votes: 2 } END ---- === Verify Cluster === Start the cluster. [source,C] ---- # pcs cluster start ---- Verify corosync membership. [source,C] ---- # pcs status corosync Membership information Nodeid Votes Name 1795270848 1 example-host (local) ---- Verify pacemaker status. At first the 'pcs status' output will look like this. [source,C] ---- # pcs status Last updated: Thu Mar 14 12:26:00 2013 Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host Stack: corosync Current DC: Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. ---- After about a minute you should see your host as a single node in the cluster. [source,C] ---- # pcs status Last updated: Thu Mar 14 12:28:23 2013 Last change: Thu Mar 14 12:25:55 2013 via crmd on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.8-9b13ea1 1 Nodes configured, unknown expected votes 0 Resources configured. Online: [ example-host ] ---- Go ahead and stop the cluster for now after verifying everything is in order. [source,C] ---- # pcs cluster stop ---- == Step 2: Setup LXC Environment == === Install Libvirt LXC software === [source,C] ---- # yum install -y libvirt libvirt-daemon-lxc wget # systemctl enable libvirtd ---- At this point, restart the host. === Generate Libvirt LXC domains === I've attempted to simplify this tutorial by creating a script to auto-generate the libvirt-lxc xml domain definitions. Download the script to whatever directory you want the containers to live in. In this example I am using the /root/lxc/ directory.
[source,C] ---- # mkdir /root/lxc/ # cd /root/lxc/ # wget https://raw.github.com/davidvossel/pcmk-lxc-autogen/master/lxc-autogen # chmod 755 lxc-autogen ---- Now execute the script. [source,C] ---- # ./lxc-autogen ---- After executing the script you will see that a number of directories and xml files have been generated. Those xml files are the libvirt-lxc domain definitions, and the directories are used as some special mount points for each container. If you open up one of the xml files you'll be able to see how the cpu, memory, and filesystem resources for the container are defined. You can use the libvirt-lxc driver's documentation found here, http://libvirt.org/drvlxc.html, as a reference to help understand all the parts of the xml file. The lxc-autogen script is not complicated and is worth exploring in order to grasp how the environment is generated. It is worth noting that this environment is dependent on use of libvirt's default network interface. Verify that the output of the commands below looks the same in your environment. The default network address 192.168.122.1 should have been generated automatically when you installed the virtualization software. [source,C] ---- # virsh net-list Name State Autostart Persistent ________________________________________________________ default active yes yes # virsh net-dumpxml default | grep -e "ip address=" ---- === Generate the Authkey === Generate the authkey used to secure connections between the host and the lxc guest pacemaker_remote instances. This is sort of a funny case because the lxc guests and the host will share the same key file in the /etc/pacemaker/ directory. In a different deployment, where the lxc guests do not share the host's /etc/pacemaker directory, this key would have to be copied into each lxc guest. [source,C] ---- # dd if=/dev/urandom of=/etc/pacemaker/authkey bs=4096 count=1 ---- == Step 3: Integrate LXC Guests into Cluster == === Start Cluster === On the host, start pacemaker. [source,C] ---- # pcs cluster start ---- Wait for the host to become the DC. The output of 'pcs status' should look similar to this. [source,C] ---- Last updated: Thu Mar 14 16:41:22 2013 Last change: Thu Mar 14 16:41:08 2013 via crmd on example-host Stack: corosync Current DC: example-host (1795270848) - partition WITHOUT quorum Version: 1.1.10 1 Nodes configured, unknown expected votes 0 Resources configured. Online: [ example-host ] ---- Now enable the cluster to work without quorum or stonith. This is required just for the sake of getting this tutorial to work with a single cluster-node. [source,C] ---- # pcs property set stonith-enabled=false # pcs property set no-quorum-policy=ignore ---- === Integrate LXC Guests as remote-nodes === If you ran the 'lxc-autogen' script with default parameters, 3 lxc domain definitions were created as .xml files. If you used the same directory I used for the lxc environment, the config files will be located in /root/lxc. Replace the 'config' parameters in the following pcs commands if yours are different. The pcs commands below each configure an lxc guest as a remote-node in pacemaker. Behind the scenes each lxc guest launches an instance of pacemaker_remote, allowing pacemaker to integrate the lxc guests as remote-nodes. The meta-attribute 'remote-node=' used in each command is what tells pacemaker that the lxc guest is both a resource and a remote-node capable of running resources.
In this case, the 'remote-node' attribute also indicates to pacemaker that it can contact each lxc guest's pacemaker_remote service by using the remote-node name as the hostname. If you look in the /etc/hosts file you will see entries for each lxc guest. These entries were auto-generated earlier by the 'lxc-autogen' script. [source,C] ---- # pcs resource create container1 VirtualDomain force_stop="true" hypervisor="lxc:///" config="/root/lxc/lxc1.xml" meta remote-node=lxc1 # pcs resource create container2 VirtualDomain force_stop="true" hypervisor="lxc:///" config="/root/lxc/lxc2.xml" meta remote-node=lxc2 # pcs resource create container3 VirtualDomain force_stop="true" hypervisor="lxc:///" config="/root/lxc/lxc3.xml" meta remote-node=lxc3 ---- After creating the container resources, your 'pcs status' should look like this. [source,C] ---- Last updated: Mon Mar 18 17:15:46 2013 Last change: Mon Mar 18 17:15:26 2013 via cibadmin on guest1 Stack: corosync Current DC: example-host (175810752) - partition WITHOUT quorum Version: 1.1.10 4 Nodes configured, unknown expected votes 6 Resources configured. Online: [ example-host lxc1 lxc2 lxc3 ] Full list of resources: container3 (ocf::heartbeat:VirtualDomain): Started example-host container1 (ocf::heartbeat:VirtualDomain): Started example-host container2 (ocf::heartbeat:VirtualDomain): Started example-host ---- === Starting Resources on LXC Guests === Now that the lxc guests are integrated into the cluster, let's generate some Dummy resources to run on them. Dummy resources are real resource agents used just for testing purposes. They actually execute on the node they are assigned to just like an apache server or database would, except their execution simply means a file was created. When the resource is stopped, the file it created is removed. [source,C] ---- # pcs resource create FAKE1 ocf:pacemaker:Dummy # pcs resource create FAKE2 ocf:pacemaker:Dummy # pcs resource create FAKE3 ocf:pacemaker:Dummy # pcs resource create FAKE4 ocf:pacemaker:Dummy # pcs resource create FAKE5 ocf:pacemaker:Dummy ---- After creating the Dummy resources you will see that the resources got distributed among all the nodes. The 'pcs status' output should look similar to this. [source,C] ---- Last updated: Mon Mar 18 17:31:54 2013 Last change: Mon Mar 18 17:31:05 2013 via cibadmin on example-host Stack: corosync Current DC: example-host (175810752) - partition WITHOUT quorum Version: 1.1.10 4 Nodes configured, unknown expected votes 11 Resources configured. Online: [ example-host lxc1 lxc2 lxc3 ] Full list of resources: container3 (ocf::heartbeat:VirtualDomain): Started example-host container1 (ocf::heartbeat:VirtualDomain): Started example-host container2 (ocf::heartbeat:VirtualDomain): Started example-host FAKE1 (ocf::pacemaker:Dummy): Started lxc1 FAKE2 (ocf::pacemaker:Dummy): Started lxc2 FAKE3 (ocf::pacemaker:Dummy): Started lxc3 FAKE4 (ocf::pacemaker:Dummy): Started lxc1 FAKE5 (ocf::pacemaker:Dummy): Started lxc2 ---- To verify that the Dummy agents are running within the lxc guests, browse one of the lxc domain's filesystem folders. Each lxc guest has a custom mount point for the '/var/run/' directory, which is the location the Dummy resources write their state files to. [source,C] ---- # ls lxc1-filesystem/var/run/ Dummy-FAKE4.state Dummy-FAKE.state ---- If you are curious, take a look at lxc1.xml to see how the filesystem is mounted. === Testing LXC Guest Failure === You will be able to see each pacemaker_remoted process running in each lxc guest from the host machine.
[source,C] ---- # ps -A | grep -e pacemaker_remote* 9142 pts/2 00:00:00 pacemaker_remot 10148 pts/4 00:00:00 pacemaker_remot 10942 pts/6 00:00:00 pacemaker_remot ---- To see how the cluster reacts to a failed lxc guest, try killing one of the pacemaker_remote instances. [source,C] ---- # kill -9 9142 ---- After a few moments the lxc guest that was running that instance of pacemaker_remote will be recovered along with all the resources running within that container. diff --git a/doc/Pacemaker_Remote/en-US/Ch-Options.txt b/doc/Pacemaker_Remote/en-US/Ch-Options.txt index 9e14b310c4..5d68b4f9d0 100644 --- a/doc/Pacemaker_Remote/en-US/Ch-Options.txt +++ b/doc/Pacemaker_Remote/en-US/Ch-Options.txt @@ -1,51 +1,77 @@ = Configuration Explained = The walk-through examples use some of these options, but don't explain exactly what they mean or do. This section is meant to be the go-to resource for all the options available for configuring remote-nodes. -== Resource Options == +== Container remote-node Resource Options == When configuring a virtual machine or lxc resource to act as a remote-node, these are the metadata options available to both enable the resource as a remote-node and define the connection parameters. .Metadata Options for configuring KVM/LXC resources as remote-nodes [width="95%",cols="1m,1,4<",options="header",align="center"] |========================================================= |Option |Default |Description |+remote-node+ | |The name of the remote-node this resource defines. This both enables the resource as a remote-node and defines the unique name used to identify the remote-node. If no other parameters are set, this value will also be assumed as the hostname to connect to at port 3121. +WARNING+ This value cannot overlap with any resource or node IDs. |+remote-port+ |3121 |Configure a custom port to use for the guest connection to pacemaker_remote. |+remote-addr+ |+remote-node+ value used as hostname |The ip address or hostname to connect to if the remote-node's name is not the hostname of the guest. |+remote-connect-timeout+ |60s |How long before a pending guest connection will time out. |========================================================= +== Baremetal remote-node Options == + +Baremetal remote-nodes are defined by a connection resource. That connection resource has the following instance attributes that define where the baremetal remote-node is located on the network and how to communicate with that remote-node. Descriptions of these options can be retrieved using the following pcs command. + +[source,C] +---- +# pcs resource describe remote + Resource options for: ocf:pacemaker:remote + server: Server location to connect to. This can be an ip address or hostname. + port: tcp port to connect to. +---- + +When defining a baremetal remote-node's connection resource, it is common and recommended to give the connection resource the same name as the baremetal remote-node's hostname. By default, if no "server" option is provided, the cluster will attempt to contact the remote-node using the resource name as the hostname. + +Example: defining a baremetal remote-node with the hostname "remote1". +[source,C] +---- +# pcs resource create remote1 remote +---- + +Example: defining a baremetal remote-node that connects to a specific ip and port.
+[source,C] +---- +# pcs resource create remote1 remote server=192.168.122.200 port=8938 +---- + == Host and Guest Authentication == Authentication and encryption of the connection between cluster-nodes (pacemaker) and remote-nodes (pacemaker_remote) is achieved using TLS with PSK encryption/authentication on +tcp port 3121+. This means both the cluster-node and remote-node must share the same private key. By default this +key must be placed at "/etc/pacemaker/authkey" on both cluster-nodes and remote-nodes+. == Pacemaker and pacemaker_remote Options == If you need to change the default port or authkey location for either pacemaker or pacemaker_remote, there are environment variables you can set that affect both of those daemons. These environment variables can be enabled by placing them in the /etc/sysconfig/pacemaker file. [source,C] ---- #==#==# Pacemaker Remote # Use a custom directory for finding the authkey. PCMK_authkey_location=/etc/pacemaker/authkey # # Specify a custom port for Pacemaker Remote connections PCMK_remote_port=3121 ---- diff --git a/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml b/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml index 9ee710c1c0..9a5e119481 100644 --- a/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml +++ b/doc/Pacemaker_Remote/en-US/Pacemaker_Remote.xml @@ -1,17 +1,18 @@ %BOOK_ENTITIES; ]> + diff --git a/doc/Pacemaker_Remote/en-US/Revision_History.xml b/doc/Pacemaker_Remote/en-US/Revision_History.xml index 257ecbdaa5..c5b33a2bf2 100644 --- a/doc/Pacemaker_Remote/en-US/Revision_History.xml +++ b/doc/Pacemaker_Remote/en-US/Revision_History.xml @@ -1,25 +1,31 @@ %BOOK_ENTITIES; ]> Revision History 1-0 Tue Mar 19 2013 David Vossel dvossel@redhat.com Import from Pages.app 2-0 Tue May 13 2013 David Vossel dvossel@redhat.com Added Future Features Section + + 3-0 + Fri Oct 18 2013 + David Vossel dvossel@redhat.com + Added Baremetal remote-node feature documentation + diff --git a/tools/crm_master b/tools/crm_master index deefa5e050..cd96877c45 100755 --- a/tools/crm_master +++ b/tools/crm_master @@ -1,53 +1,56 @@ #!/bin/bash +target=`crm_node -n` + TEMP=`getopt -o qDGQVN:U:v:i:l:r: --long version,help,resource:,node:,uname:,attr-value:,id:,update:,delete-attr,get-value,attr-id:,lifetime:,quiet \ -n 'crm_master' -- "$@"` if [ $? != 0 ] ; then echo "crm_master - A convenience wrapper for crm_attribute"; echo ""; crm_attribute -?; exit 1 ; fi # Note the quotes around `$TEMP': they are essential!
eval set -- "$TEMP" while true ; do case "$1" in - -N|--node|-U|--uname|-v|--attr-value|--update|-i|--id|--attr-id|-l|--lifetime) options="$options $1 $2"; shift; shift;; + -N|--node|-U|--uname) target="$2"; shift; shift;; + -v|--attr-value|--update|-i|--id|--attr-id|-l|--lifetime) options="$options $1 $2"; shift; shift;; -Q|-q|--quiet|-D|--delete-attr|-G|--get-value|-V) options="$options $1"; shift;; -r|--resource) OCF_RESOURCE_INSTANCE=$2; shift; shift;; --version) crm_attribute --version; exit 0;; --help) echo "crm_master - A convenience wrapper for crm_attribute"; echo ""; echo "Set, update or delete a resource's promotion score"; echo ""; echo "This program should normally only be invoked from inside an OCF resource agent" echo ""; echo "Usage: crm_master command [options]"; echo "Options:" echo " --help This text" echo " --version Version information" echo " -V, --verbose Increase debug output" echo " -q, --quiet Print only the value on stdout" echo "" echo "Commands:" echo " -G, --query Query the current value of the attribute/option" echo " -v, --update=value Update the value of the attribute/option" echo " -D, --delete Delete the attribute/option" echo "" echo "Additional Options:" echo " -N, --node=value Set an attribute for the named node (instead of the current one)." echo " -l, --lifetime=value Until when should the setting take effect." echo " Valid values: reboot, forever" echo " -i, --id=value (Advanced) The ID used to identify the attribute" exit 0;; --) shift ; break ;; *) echo "Unknown option: $1. See --help for details." exit 1;; esac done if [ -z "$OCF_RESOURCE_INSTANCE" ]; then echo "This program should normally only be invoked from inside an OCF resource agent" echo "To set the promotion/master score from the command line, please specify a resource ID with -r" exit 1 fi -crm_attribute -N `crm_node -n` -n master-$OCF_RESOURCE_INSTANCE $options +crm_attribute -N $target -n master-$OCF_RESOURCE_INSTANCE $options
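As an illustration of what the new -N/--node handling enables, here is a hypothetical invocation (the resource name "my-rsc" and node name "node2" are made up). The wrapper now forwards -N to crm_attribute, so a promotion score can be set or queried for a node other than the local one:

[source,C]
----
# crm_master -r my-rsc -N node2 -v 100
# crm_master -r my-rsc -N node2 -G
----

Both commands expand to 'crm_attribute -N node2 -n master-my-rsc ...' instead of always targeting the output of 'crm_node -n' as before.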