diff --git a/crmd/control.c b/crmd/control.c index 31d49f924c..b006d6cc4d 100644 --- a/crmd/control.c +++ b/crmd/control.c @@ -1,1008 +1,999 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include qb_ipcs_service_t *ipcs = NULL; extern gboolean crm_connect_corosync(crm_cluster_t * cluster); extern void crmd_ha_connection_destroy(gpointer user_data); void crm_shutdown(int nsig); gboolean crm_read_options(gpointer user_data); gboolean fsa_has_quorum = FALSE; crm_trigger_t *fsa_source = NULL; crm_trigger_t *config_read = NULL; static gboolean election_timeout_popped(gpointer data) { /* Not everyone voted */ crm_info("Election failed: Declaring ourselves the winner"); register_fsa_input(C_TIMER_POPPED, I_ELECTION_DC, NULL); return FALSE; } /* A_HA_CONNECT */ void do_ha_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { gboolean registered = FALSE; static crm_cluster_t *cluster = NULL; if (cluster == NULL) { cluster = calloc(1, sizeof(crm_cluster_t)); } if (action & A_HA_DISCONNECT) { crm_cluster_disconnect(cluster); crm_info("Disconnected from the cluster"); set_bit(fsa_input_register, R_HA_DISCONNECTED); } if (action & A_HA_CONNECT) { crm_set_status_callback(&peer_update_callback); if (is_openais_cluster()) { #if SUPPORT_COROSYNC registered = crm_connect_corosync(cluster); #endif } else if (is_heartbeat_cluster()) { #if SUPPORT_HEARTBEAT cluster->destroy = crmd_ha_connection_destroy; cluster->hb_dispatch = crmd_ha_msg_callback; registered = crm_cluster_connect(cluster); fsa_cluster_conn = cluster->hb_conn; crm_trace("Be informed of Node Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_nstatus_callback(fsa_cluster_conn, crmd_ha_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set nstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } crm_trace("Be informed of CRM Client Status changes"); if (registered && fsa_cluster_conn->llc_ops->set_cstatus_callback(fsa_cluster_conn, crmd_client_status_callback, fsa_cluster_conn) != HA_OK) { crm_err("Cannot set cstatus callback: %s", fsa_cluster_conn->llc_ops->errmsg(fsa_cluster_conn)); registered = FALSE; } if (registered) { crm_trace("Requesting an initial dump of CRMD client_status"); fsa_cluster_conn->llc_ops->client_status(fsa_cluster_conn, NULL, CRM_SYSTEM_CRMD, -1); } #endif } fsa_election = election_init(NULL, cluster->uname, 60000/*60s*/, election_timeout_popped); fsa_our_uname = cluster->uname; fsa_our_uuid = cluster->uuid; if(cluster->uuid == NULL) { crm_err("Could not obtain local uuid"); registered = FALSE; } if (registered == FALSE) { set_bit(fsa_input_register, R_HA_DISCONNECTED); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } populate_cib_nodes(node_update_none, __FUNCTION__); clear_bit(fsa_input_register, R_HA_DISCONNECTED); crm_info("Connected to the cluster"); } if (action & ~(A_HA_CONNECT | A_HA_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } /* A_SHUTDOWN */ void do_shutdown(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { /* just in case */ set_bit(fsa_input_register, R_SHUTDOWN); if (is_heartbeat_cluster()) { if (is_set(fsa_input_register, pe_subsystem->flag_connected)) { crm_info("Terminating the %s", pe_subsystem->name); if (stop_subsystem(pe_subsystem, TRUE) == FALSE) { /* its gone... */ crm_err("Faking %s exit", pe_subsystem->name); clear_bit(fsa_input_register, pe_subsystem->flag_connected); } else { crm_info("Waiting for subsystems to exit"); crmd_fsa_stall(FALSE); } } crm_info("All subsystems stopped, continuing"); } if (stonith_api) { /* Prevent it from comming up again */ clear_bit(fsa_input_register, R_ST_REQUIRED); crm_info("Disconnecting STONITH..."); stonith_api->cmds->disconnect(stonith_api); } } /* A_SHUTDOWN_REQ */ void do_shutdown_req(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { xmlNode *msg = NULL; crm_info("Sending shutdown request to %s", crm_str(fsa_our_dc)); msg = create_request(CRM_OP_SHUTDOWN_REQ, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL); /* set_bit(fsa_input_register, R_STAYDOWN); */ if (send_cluster_message(NULL, crm_msg_crmd, msg, TRUE) == FALSE) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } free_xml(msg); } extern crm_ipc_t *attrd_ipc; extern char *max_generation_from; extern xmlNode *max_generation_xml; extern GHashTable *resource_history; extern GHashTable *voted; extern GHashTable *reload_hash; extern char *te_client_id; void log_connected_client(gpointer key, gpointer value, gpointer user_data); void log_connected_client(gpointer key, gpointer value, gpointer user_data) { crm_client_t *client = value; crm_err("%s is still connected at exit", crm_client_name(client)); } int crmd_fast_exit(int rc) { if (is_set(fsa_input_register, R_STAYDOWN)) { crm_warn("Inhibiting respawn: %d -> %d", rc, 100); rc = 100; } if (rc == pcmk_ok && is_set(fsa_input_register, R_IN_RECOVERY)) { crm_err("Could not recover from internal error"); rc = pcmk_err_generic; } return crm_exit(rc); } int crmd_exit(int rc) { GListPtr gIter = NULL; GMainLoop *mloop = crmd_mainloop; static bool in_progress = FALSE; if(in_progress && rc == 0) { crm_debug("Exit is already in progress"); return rc; } else if(in_progress) { crm_notice("Error during shutdown process, terminating now: %s (%d)", pcmk_strerror(rc), rc); crm_write_blackbox(SIGTRAP, NULL); crmd_fast_exit(rc); } in_progress = TRUE; crm_trace("Preparing to exit: %d", rc); /* Suppress secondary errors resulting from us disconnecting everything */ set_bit(fsa_input_register, R_HA_DISCONNECTED); /* Close all IPC servers and clients to ensure any and all shared memory files are cleaned up */ if(ipcs) { crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; } if (attrd_ipc) { crm_trace("Closing attrd connection"); crm_ipc_close(attrd_ipc); crm_ipc_destroy(attrd_ipc); attrd_ipc = NULL; } if (pe_subsystem && pe_subsystem->client && pe_subsystem->client->ipcs) { crm_trace("Disconnecting Policy Engine"); qb_ipcs_disconnect(pe_subsystem->client->ipcs); } if(stonith_api) { crm_trace("Disconnecting fencing API"); clear_bit(fsa_input_register, R_ST_REQUIRED); stonith_api->cmds->free(stonith_api); stonith_api = NULL; } if (rc == pcmk_ok && crmd_mainloop == NULL) { crm_debug("No mainloop detected"); rc = EPROTO; } /* On an error, just get out. * * Otherwise, make the effort to have mainloop exit gracefully so * that it (mostly) cleans up after itself and valgrind has less * to report on - allowing real errors stand out */ if(rc != pcmk_ok) { crm_notice("Forcing immediate exit: %s (%d)", pcmk_strerror(rc), rc); crm_write_blackbox(SIGTRAP, NULL); return crmd_fast_exit(rc); } /* Clean up as much memory as possible for valgrind */ #if SUPPORT_HEARTBEAT if (fsa_cluster_conn) { crm_trace("Disconnecting heartbeat"); fsa_cluster_conn->llc_ops->delete(fsa_cluster_conn); fsa_cluster_conn = NULL; } #endif for (gIter = fsa_message_queue; gIter != NULL; gIter = gIter->next) { fsa_data_t *fsa_data = gIter->data; crm_info("Dropping %s: [ state=%s cause=%s origin=%s ]", fsa_input2string(fsa_data->fsa_input), fsa_state2string(fsa_state), fsa_cause2string(fsa_data->fsa_cause), fsa_data->origin); delete_fsa_input(fsa_data); } clear_bit(fsa_input_register, R_MEMBERSHIP); g_list_free(fsa_message_queue); fsa_message_queue = NULL; free(pe_subsystem); pe_subsystem = NULL; free(te_subsystem); te_subsystem = NULL; free(cib_subsystem); cib_subsystem = NULL; if (reload_hash) { crm_trace("Destroying reload cache with %d members", g_hash_table_size(reload_hash)); g_hash_table_destroy(reload_hash); reload_hash = NULL; } election_fini(fsa_election); fsa_election = NULL; cib_delete(fsa_cib_conn); fsa_cib_conn = NULL; verify_stopped(fsa_state, LOG_WARNING); clear_bit(fsa_input_register, R_LRM_CONNECTED); lrm_state_destroy_all(); /* This basically will not work, since mainloop has a reference to it */ mainloop_destroy_trigger(fsa_source); fsa_source = NULL; mainloop_destroy_trigger(config_read); config_read = NULL; mainloop_destroy_trigger(stonith_reconnect); stonith_reconnect = NULL; mainloop_destroy_trigger(transition_trigger); transition_trigger = NULL; crm_client_cleanup(); crm_peer_destroy(); crm_timer_stop(transition_timer); crm_timer_stop(integration_timer); crm_timer_stop(finalization_timer); crm_timer_stop(election_trigger); election_timeout_stop(fsa_election); crm_timer_stop(shutdown_escalation_timer); crm_timer_stop(wait_timer); crm_timer_stop(recheck_timer); free(transition_timer); transition_timer = NULL; free(integration_timer); integration_timer = NULL; free(finalization_timer); finalization_timer = NULL; free(election_trigger); election_trigger = NULL; election_fini(fsa_election); free(shutdown_escalation_timer); shutdown_escalation_timer = NULL; free(wait_timer); wait_timer = NULL; free(recheck_timer); recheck_timer = NULL; free(fsa_our_dc_version); fsa_our_dc_version = NULL; free(fsa_our_uname); fsa_our_uname = NULL; free(fsa_our_uuid); fsa_our_uuid = NULL; free(fsa_our_dc); fsa_our_dc = NULL; free(te_uuid); te_uuid = NULL; free(te_client_id); te_client_id = NULL; free(fsa_pe_ref); fsa_pe_ref = NULL; free(failed_stop_offset); failed_stop_offset = NULL; free(failed_start_offset); failed_start_offset = NULL; free(max_generation_from); max_generation_from = NULL; free_xml(max_generation_xml); max_generation_xml = NULL; mainloop_destroy_signal(SIGUSR1); mainloop_destroy_signal(SIGTERM); mainloop_destroy_signal(SIGTRAP); mainloop_destroy_signal(SIGCHLD); if (mloop) { int lpc = 0; GMainContext *ctx = g_main_loop_get_context(crmd_mainloop); /* Don't re-enter this block */ crmd_mainloop = NULL; crm_trace("Draining mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); while(g_main_context_pending(ctx) && lpc < 10) { lpc++; crm_trace("Iteration %d", lpc); g_main_context_dispatch(ctx); } crm_trace("Closing mainloop %d %d", g_main_loop_is_running(mloop), g_main_context_pending(ctx)); g_main_loop_quit(mloop); /* Won't do anything yet, since we're inside it now */ g_main_loop_unref(mloop); crm_trace("Done %d", rc); } /* Graceful */ return rc; } /* A_EXIT_0, A_EXIT_1 */ void do_exit(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int exit_code = pcmk_ok; int log_level = LOG_INFO; const char *exit_type = "gracefully"; if (action & A_EXIT_1) { /* exit_code = pcmk_err_generic; */ log_level = LOG_ERR; exit_type = "forcefully"; exit_code = pcmk_err_generic; } verify_stopped(cur_state, LOG_ERR); do_crm_log(log_level, "Performing %s - %s exiting the CRMd", fsa_action2string(action), exit_type); crm_info("[%s] stopped (%d)", crm_system_name, exit_code); crmd_exit(exit_code); } /* A_STARTUP */ void do_startup(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { int was_error = 0; int interval = 1; /* seconds between DC heartbeats */ crm_debug("Registering Signal Handlers"); mainloop_add_signal(SIGTERM, crm_shutdown); fsa_source = mainloop_add_trigger(G_PRIORITY_HIGH, crm_fsa_trigger, NULL); config_read = mainloop_add_trigger(G_PRIORITY_HIGH, crm_read_options, NULL); transition_trigger = mainloop_add_trigger(G_PRIORITY_LOW, te_graph_trigger, NULL); crm_debug("Creating CIB and LRM objects"); fsa_cib_conn = cib_new(); lrm_state_init_local(); /* set up the timers */ transition_timer = calloc(1, sizeof(fsa_timer_t)); integration_timer = calloc(1, sizeof(fsa_timer_t)); finalization_timer = calloc(1, sizeof(fsa_timer_t)); election_trigger = calloc(1, sizeof(fsa_timer_t)); shutdown_escalation_timer = calloc(1, sizeof(fsa_timer_t)); wait_timer = calloc(1, sizeof(fsa_timer_t)); recheck_timer = calloc(1, sizeof(fsa_timer_t)); interval = interval * 1000; if (election_trigger != NULL) { election_trigger->source_id = 0; election_trigger->period_ms = -1; election_trigger->fsa_input = I_DC_TIMEOUT; election_trigger->callback = crm_timer_popped; election_trigger->repeat = FALSE; } else { was_error = TRUE; } if (transition_timer != NULL) { transition_timer->source_id = 0; transition_timer->period_ms = -1; transition_timer->fsa_input = I_PE_CALC; transition_timer->callback = crm_timer_popped; transition_timer->repeat = FALSE; } else { was_error = TRUE; } if (integration_timer != NULL) { integration_timer->source_id = 0; integration_timer->period_ms = -1; integration_timer->fsa_input = I_INTEGRATED; integration_timer->callback = crm_timer_popped; integration_timer->repeat = FALSE; } else { was_error = TRUE; } if (finalization_timer != NULL) { finalization_timer->source_id = 0; finalization_timer->period_ms = -1; finalization_timer->fsa_input = I_FINALIZED; finalization_timer->callback = crm_timer_popped; finalization_timer->repeat = FALSE; /* for possible enabling... a bug in the join protocol left * a slave in S_PENDING while we think its in S_NOT_DC * * raising I_FINALIZED put us into a transition loop which is * never resolved. * in this loop we continually send probes which the node * NACK's because its in S_PENDING * * if we have nodes where heartbeat is active but the * CRM is not... then this will be handled in the * integration phase */ finalization_timer->fsa_input = I_ELECTION; } else { was_error = TRUE; } if (shutdown_escalation_timer != NULL) { shutdown_escalation_timer->source_id = 0; shutdown_escalation_timer->period_ms = -1; shutdown_escalation_timer->fsa_input = I_STOP; shutdown_escalation_timer->callback = crm_timer_popped; shutdown_escalation_timer->repeat = FALSE; } else { was_error = TRUE; } if (wait_timer != NULL) { wait_timer->source_id = 0; wait_timer->period_ms = 2000; wait_timer->fsa_input = I_NULL; wait_timer->callback = crm_timer_popped; wait_timer->repeat = FALSE; } else { was_error = TRUE; } if (recheck_timer != NULL) { recheck_timer->source_id = 0; recheck_timer->period_ms = -1; recheck_timer->fsa_input = I_PE_CALC; recheck_timer->callback = crm_timer_popped; recheck_timer->repeat = FALSE; } else { was_error = TRUE; } /* set up the sub systems */ cib_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); te_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); pe_subsystem = calloc(1, sizeof(struct crm_subsystem_s)); if (cib_subsystem != NULL) { cib_subsystem->pid = -1; cib_subsystem->name = CRM_SYSTEM_CIB; cib_subsystem->flag_connected = R_CIB_CONNECTED; cib_subsystem->flag_required = R_CIB_REQUIRED; } else { was_error = TRUE; } if (te_subsystem != NULL) { te_subsystem->pid = -1; te_subsystem->name = CRM_SYSTEM_TENGINE; te_subsystem->flag_connected = R_TE_CONNECTED; te_subsystem->flag_required = R_TE_REQUIRED; } else { was_error = TRUE; } if (pe_subsystem != NULL) { pe_subsystem->pid = -1; pe_subsystem->path = CRM_DAEMON_DIR; pe_subsystem->name = CRM_SYSTEM_PENGINE; pe_subsystem->command = CRM_DAEMON_DIR "/" CRM_SYSTEM_PENGINE; pe_subsystem->args = NULL; pe_subsystem->flag_connected = R_PE_CONNECTED; pe_subsystem->flag_required = R_PE_REQUIRED; } else { was_error = TRUE; } if (was_error == FALSE && is_heartbeat_cluster()) { if (start_subsystem(pe_subsystem) == FALSE) { was_error = TRUE; } } if (was_error) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } static int32_t crmd_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { crm_trace("Connection %p", c); if (crm_client_new(c, uid, gid) == NULL) { return -EIO; } return 0; } static void crmd_ipc_created(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); } static int32_t crmd_ipc_dispatch(qb_ipcs_connection_t * c, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; crm_client_t *client = crm_client_get(c); xmlNode *msg = crm_ipcs_recv(client, data, size, &id, &flags); crm_trace("Invoked: %s", crm_client_name(client)); crm_ipcs_send_ack(client, id, flags, "ack", __FUNCTION__, __LINE__); if (msg == NULL) { return 0; } #if ENABLE_ACL determine_request_user(client->user, msg, F_CRM_USER); #endif crm_trace("Processing msg from %s", crm_client_name(client)); crm_log_xml_trace(msg, "CRMd[inbound]"); crm_xml_add(msg, F_CRM_SYS_FROM, client->id); if (crmd_authorize_message(msg, client, NULL)) { route_message(C_IPC_MESSAGE, msg); } trigger_fsa(fsa_source); free_xml(msg); return 0; } static int32_t crmd_ipc_closed(qb_ipcs_connection_t * c) { crm_client_t *client = crm_client_get(c); struct crm_subsystem_s *the_subsystem = NULL; crm_trace("Connection %p", c); if (client->userdata == NULL) { crm_trace("Client hadn't registered with us yet"); } else if (strcasecmp(CRM_SYSTEM_PENGINE, client->userdata) == 0) { the_subsystem = pe_subsystem; } else if (strcasecmp(CRM_SYSTEM_TENGINE, client->userdata) == 0) { the_subsystem = te_subsystem; } else if (strcasecmp(CRM_SYSTEM_CIB, client->userdata) == 0) { the_subsystem = cib_subsystem; } if (the_subsystem != NULL) { the_subsystem->source = NULL; the_subsystem->client = NULL; crm_info("Received HUP from %s:[%d]", the_subsystem->name, the_subsystem->pid); } else { /* else that was a transient client */ crm_trace("Received HUP from transient client"); } crm_trace("Disconnecting client %s (%p)", crm_client_name(client), client); free(client->userdata); crm_client_destroy(client); trigger_fsa(fsa_source); return 0; } static void crmd_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); } /* A_STOP */ void do_stop(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { if (is_heartbeat_cluster()) { stop_subsystem(pe_subsystem, FALSE); } crm_trace("Closing IPC server"); mainloop_del_ipc_server(ipcs); ipcs = NULL; register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* A_STARTED */ void do_started(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { static struct qb_ipcs_service_handlers crmd_callbacks = { .connection_accept = crmd_ipc_accept, .connection_created = crmd_ipc_created, .msg_process = crmd_ipc_dispatch, .connection_closed = crmd_ipc_closed, .connection_destroyed = crmd_ipc_destroy }; if (cur_state != S_STARTING) { crm_err("Start cancelled... %s", fsa_state2string(cur_state)); return; } else if (is_set(fsa_input_register, R_MEMBERSHIP) == FALSE) { crm_info("Delaying start, no membership data (%.16llx)", R_MEMBERSHIP); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_LRM_CONNECTED) == FALSE) { crm_info("Delaying start, LRM not connected (%.16llx)", R_LRM_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_CIB_CONNECTED) == FALSE) { crm_info("Delaying start, CIB not connected (%.16llx)", R_CIB_CONNECTED); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_READ_CONFIG) == FALSE) { crm_info("Delaying start, Config not read (%.16llx)", R_READ_CONFIG); crmd_fsa_stall(TRUE); return; } else if (is_set(fsa_input_register, R_PEER_DATA) == FALSE) { /* try reading from HA */ crm_info("Delaying start, No peer data (%.16llx)", R_PEER_DATA); #if SUPPORT_HEARTBEAT if (is_heartbeat_cluster()) { HA_Message *msg = NULL; crm_trace("Looking for a HA message"); msg = fsa_cluster_conn->llc_ops->readmsg(fsa_cluster_conn, 0); if (msg != NULL) { crm_trace("There was a HA message"); ha_msg_del(msg); } } #endif crmd_fsa_stall(TRUE); return; } crm_debug("Init server comms"); ipcs = crmd_ipc_server_init(&crmd_callbacks); if (ipcs == NULL) { crm_err("Failed to create IPC server: shutting down and inhibiting respawn"); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } if (stonith_reconnect == NULL) { int dummy; stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW, te_connect_stonith, &dummy); } set_bit(fsa_input_register, R_ST_REQUIRED); mainloop_set_trigger(stonith_reconnect); crm_notice("The local CRM is operational"); clear_bit(fsa_input_register, R_STARTING); register_fsa_input(msg_data->fsa_cause, I_PENDING, NULL); } /* A_RECOVER */ void do_recover(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { set_bit(fsa_input_register, R_IN_RECOVERY); crm_warn("Fast-tracking shutdown in response to errors"); register_fsa_input(C_FSA_INTERNAL, I_TERMINATE, NULL); } /* *INDENT-OFF* */ pe_cluster_option crmd_opts[] = { /* name, old-name, validate, default, description */ { "dc-version", NULL, "string", NULL, "none", NULL, "Version of Pacemaker on the cluster's DC.", "Includes the hash which identifies the exact Mercurial changeset it was built from. Used for diagnostic purposes." }, { "cluster-infrastructure", NULL, "string", NULL, "heartbeat", NULL, "The messaging stack on which Pacemaker is currently running.", "Used for informational and diagnostic purposes." }, { XML_CONFIG_ATTR_DC_DEADTIME, "dc_deadtime", "time", NULL, "20s", &check_time, "How long to wait for a response from other nodes during startup.", "The \"correct\" value will depend on the speed/load of your network and the type of switches used." }, { XML_CONFIG_ATTR_RECHECK, "cluster_recheck_interval", "time", "Zero disables polling. Positive values are an interval in seconds (unless other SI units are specified. eg. 5min)", "15min", &check_timer, "Polling interval for time based changes to options, resource parameters and constraints.", "The Cluster is primarily event driven, however the configuration can have elements that change based on time." " To ensure these changes take effect, we can optionally poll the cluster's status for changes." }, { "load-threshold", NULL, "percentage", NULL, "80%", &check_utilization, "The maximum amount of system resources that should be used by nodes in the cluster", "The cluster will slow down its recovery process when the amount of system resources used" " (currently CPU) approaches this limit", }, { "node-action-limit", "migration-limit", "integer", NULL, "0", &check_number, "The maximum number of jobs that can be scheduled per node. Defaults to 2x cores"}, { XML_CONFIG_ATTR_ELECTION_FAIL, "election_timeout", "time", NULL, "2min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { XML_CONFIG_ATTR_FORCE_QUIT, "shutdown_escalation", "time", NULL, "20min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-integration-timeout", NULL, "time", NULL, "3min", &check_timer, "*** Advanced Use Only ***.", "If need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-finalization-timeout", NULL, "time", NULL, "30min", &check_timer, "*** Advanced Use Only ***.", "If you need to adjust this value, it probably indicates the presence of a bug." }, { "crmd-transition-delay", NULL, "time", NULL, "0s", &check_timer, "*** Advanced Use Only ***\nEnabling this option will slow down cluster recovery under all conditions", "Delay cluster recovery for the configured interval to allow for additional/related events to occur.\nUseful if your configuration is sensitive to the order in which ping updates arrive." }, #if SUPPORT_PLUGIN { XML_ATTR_EXPECTED_VOTES, NULL, "integer", NULL, "2", &check_number, "The number of nodes expected to be in the cluster", "Used to calculate quorum in openais based clusters." }, #endif }; /* *INDENT-ON* */ void crmd_metadata(void) { config_metadata("CRM Daemon", "1.0", "CRM Daemon Options", "This is a fake resource that details the options that can be configured for the CRM Daemon.", crmd_opts, DIMOF(crmd_opts)); } static void verify_crmd_options(GHashTable * options) { verify_all_options(options, crmd_opts, DIMOF(crmd_opts)); } static const char * crmd_pref(GHashTable * options, const char *name) { return get_cluster_pref(options, crmd_opts, DIMOF(crmd_opts), name); } static void config_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { - int value_i = 0; const char *value = NULL; GHashTable *config_hash = NULL; crm_time_t *now = crm_time_new(NULL); if (rc != pcmk_ok) { fsa_data_t *msg_data = NULL; crm_err("Local CIB query resulted in an error: %s", pcmk_strerror(rc)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); if (rc == -EACCES || rc == -pcmk_err_dtd_validation) { crm_err("The cluster is mis-configured - shutting down and staying down"); set_bit(fsa_input_register, R_STAYDOWN); } goto bail; } crm_debug("Call %d : Parsing CIB options", call_id); config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes(output, output, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, now); verify_crmd_options(config_hash); value = crmd_pref(config_hash, XML_CONFIG_ATTR_DC_DEADTIME); election_trigger->period_ms = crm_get_msec(value); - value = crmd_pref(config_hash, "node-action-limit"); /* Also checks migration-limit */ - value_i = crm_int_helper(value, NULL); - if(value_i <= 0) { - value = getenv("LRMD_MAX_CHILDREN"); - value_i = crm_int_helper(value, NULL); - } - if(value_i > 0) { - throttle_job_max = value_i; - } + throttle_update_job_max(value); value = crmd_pref(config_hash, "load-threshold"); if(value) { throttle_load_target = strtof(value, NULL) / 100; } value = crmd_pref(config_hash, XML_CONFIG_ATTR_FORCE_QUIT); shutdown_escalation_timer->period_ms = crm_get_msec(value); /* How long to declare an election over - even if not everyone voted */ crm_debug("Shutdown escalation occurs after: %dms", shutdown_escalation_timer->period_ms); value = crmd_pref(config_hash, XML_CONFIG_ATTR_ELECTION_FAIL); election_timeout_set_period(fsa_election, crm_get_msec(value)); value = crmd_pref(config_hash, XML_CONFIG_ATTR_RECHECK); recheck_timer->period_ms = crm_get_msec(value); crm_debug("Checking for expired actions every %dms", recheck_timer->period_ms); value = crmd_pref(config_hash, "crmd-transition-delay"); transition_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-integration-timeout"); integration_timer->period_ms = crm_get_msec(value); value = crmd_pref(config_hash, "crmd-finalization-timeout"); finalization_timer->period_ms = crm_get_msec(value); #if SUPPORT_COROSYNC if (is_classic_ais_cluster()) { value = crmd_pref(config_hash, XML_ATTR_EXPECTED_VOTES); crm_debug("Sending expected-votes=%s to corosync", value); send_cluster_text(crm_class_quorum, value, TRUE, NULL, crm_msg_ais); } #endif set_bit(fsa_input_register, R_READ_CONFIG); crm_trace("Triggering FSA: %s", __FUNCTION__); mainloop_set_trigger(fsa_source); g_hash_table_destroy(config_hash); bail: crm_time_free(now); } gboolean crm_read_options(gpointer user_data) { int call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, XML_CIB_TAG_CRMCONFIG, NULL, cib_scope_local); fsa_register_cib_callback(call_id, FALSE, NULL, config_query_callback); crm_trace("Querying the CIB... call %d", call_id); return TRUE; } /* A_READCONFIG */ void do_read_config(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t * msg_data) { throttle_init(); mainloop_set_trigger(config_read); } void crm_shutdown(int nsig) { if (crmd_mainloop != NULL && g_main_is_running(crmd_mainloop)) { if (is_set(fsa_input_register, R_SHUTDOWN)) { crm_err("Escalating the shutdown"); register_fsa_input_before(C_SHUTDOWN, I_ERROR, NULL); } else { set_bit(fsa_input_register, R_SHUTDOWN); register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL); if (shutdown_escalation_timer->period_ms < 1) { const char *value = crmd_pref(NULL, XML_CONFIG_ATTR_FORCE_QUIT); int msec = crm_get_msec(value); crm_debug("Using default shutdown escalation: %dms", msec); shutdown_escalation_timer->period_ms = msec; } /* cant rely on this... */ crm_notice("Requesting shutdown, upper limit is %dms", shutdown_escalation_timer->period_ms); crm_timer_start(shutdown_escalation_timer); } } else { crm_info("exit from shutdown"); crmd_exit(pcmk_ok); } } diff --git a/crmd/throttle.c b/crmd/throttle.c index aa28d099ac..33ac974a58 100644 --- a/crmd/throttle.c +++ b/crmd/throttle.c @@ -1,606 +1,635 @@ /* * Copyright (C) 2013 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include #include enum throttle_state_e { throttle_high = 0x0100, throttle_med = 0x0010, throttle_low = 0x0001, throttle_none = 0x0000, }; struct throttle_record_s { - int cores; + int max; enum throttle_state_e mode; char *node; }; int throttle_job_max = 0; float throttle_load_target = 0.0; #define THROTTLE_FACTOR_LOW 1.2 #define THROTTLE_FACTOR_MEDIUM 1.6 #define THROTTLE_FACTOR_HIGH 2.0 GHashTable *throttle_records = NULL; mainloop_timer_t *throttle_timer = NULL; int throttle_num_cores(void) { static int cores = 0; char buffer[256]; FILE *stream = NULL; const char *cpufile = "/proc/cpuinfo"; if(cores) { return cores; } stream = fopen(cpufile, "r"); if(stream == NULL) { int rc = errno; crm_warn("Couldn't read %s, assuming a single processor: %s (%d)", cpufile, pcmk_strerror(rc), rc); return 1; } while (fgets(buffer, sizeof(buffer), stream)) { if(strstr(buffer, "processor") == buffer) { cores++; } } if(cores == 0) { crm_warn("No processors found in %s, assuming 1", cpufile); return 1; } fclose(stream); return cores; } static char *find_cib_loadfile(void) { DIR *dp; struct dirent *entry; struct stat statbuf; char *match = NULL; dp = opendir("/proc"); if (!dp) { /* no proc directory to search through */ crm_notice("Can not read /proc directory to track existing components"); return FALSE; } while ((entry = readdir(dp)) != NULL) { char procpath[128]; char value[64]; char key[16]; FILE *file; int pid; strcpy(procpath, "/proc/"); /* strlen("/proc/") + strlen("/status") + 1 = 14 * 128 - 14 = 114 */ strncat(procpath, entry->d_name, 114); if (lstat(procpath, &statbuf)) { continue; } if (!S_ISDIR(statbuf.st_mode) || !isdigit(entry->d_name[0])) { continue; } strcat(procpath, "/status"); file = fopen(procpath, "r"); if (!file) { continue; } if (fscanf(file, "%15s%63s", key, value) != 2) { fclose(file); continue; } fclose(file); if (safe_str_neq("cib", value)) { continue; } pid = atoi(entry->d_name); if (pid <= 0) { continue; } match = g_strdup_printf("/proc/%d/stat", pid); break; } closedir(dp); return match; } static bool throttle_cib_load(float *load) { /* /proc/[pid]/stat Status information about the process. This is used by ps(1). It is defined in /usr/src/linux/fs/proc/array.c. The fields, in order, with their proper scanf(3) format specifiers, are: pid %d (1) The process ID. comm %s (2) The filename of the executable, in parentheses. This is visible whether or not the executable is swapped out. state %c (3) One character from the string "RSDZTW" where R is running, S is sleeping in an interruptible wait, D is waiting in uninterruptible disk sleep, Z is zombie, T is traced or stopped (on a signal), and W is paging. ppid %d (4) The PID of the parent. pgrp %d (5) The process group ID of the process. session %d (6) The session ID of the process. tty_nr %d (7) The controlling terminal of the process. (The minor device number is contained in the combination of bits 31 to 20 and 7 to 0; the major device number is in bits 15 to 8.) tpgid %d (8) The ID of the foreground process group of the controlling terminal of the process. flags %u (%lu before Linux 2.6.22) (9) The kernel flags word of the process. For bit meanings, see the PF_* defines in the Linux kernel source file include/linux/sched.h. Details depend on the kernel version. minflt %lu (10) The number of minor faults the process has made which have not required loading a memory page from disk. cminflt %lu (11) The number of minor faults that the process's waited-for children have made. majflt %lu (12) The number of major faults the process has made which have required loading a memory page from disk. cmajflt %lu (13) The number of major faults that the process's waited-for children have made. utime %lu (14) Amount of time that this process has been scheduled in user mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)). This includes guest time, guest_time (time spent running a virtual CPU, see below), so that applications that are not aware of the guest time field do not lose that time from their calculations. stime %lu (15) Amount of time that this process has been scheduled in kernel mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)). */ static char *loadfile = NULL; static time_t last_call = 0; static long ticks_per_s = 0; static unsigned long last_utime, last_stime; char buffer[64*1024]; FILE *stream = NULL; time_t now = time(NULL); if(load == NULL) { return FALSE; } else { *load = 0.0; } if(loadfile == NULL) { last_call = 0; last_utime = 0; last_stime = 0; loadfile = find_cib_loadfile(); ticks_per_s = sysconf(_SC_CLK_TCK); crm_trace("Found %s", loadfile); } stream = fopen(loadfile, "r"); if(stream == NULL) { int rc = errno; crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc); free(loadfile); loadfile = NULL; return FALSE; } if(fgets(buffer, sizeof(buffer), stream)) { char *comm = calloc(1, 256); char state = 0; int rc = 0, pid = 0, ppid = 0, pgrp = 0, session = 0, tty_nr = 0, tpgid = 0; unsigned long flags = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0, utime = 0, stime = 0; rc = sscanf(buffer, "%d %[^ ] %c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu", &pid, comm, &state, &ppid, &pgrp, &session, &tty_nr, &tpgid, &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime); if(rc != 15) { crm_err("Only %d of 15 fields found in %s", rc, loadfile); return FALSE; } else if(last_call > 0 && last_call < now && last_utime <= utime && last_stime <= stime) { time_t elapsed = now - last_call; unsigned long delta_utime = utime - last_utime; unsigned long delta_stime = stime - last_stime; *load = (delta_utime + delta_stime); /* Cast to a float before division */ *load /= ticks_per_s; *load /= elapsed; crm_debug("Current CIB load from %lu ticks in %ds is %f (@%lu tps)", delta_utime + delta_stime, elapsed, *load, ticks_per_s); } else { crm_debug("Init %lu + %lu ticks at %d (%lu tps)", utime, stime, now, ticks_per_s); } last_call = now; last_utime = utime; last_stime = stime; fclose(stream); return TRUE; } fclose(stream); return FALSE; } static bool throttle_load_avg(float *load) { char buffer[256]; FILE *stream = NULL; const char *loadfile = "/proc/loadavg"; if(load == NULL) { return FALSE; } stream = fopen(loadfile, "r"); if(stream == NULL) { int rc = errno; crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc); return FALSE; } if(fgets(buffer, sizeof(buffer), stream)) { char *nl = strstr(buffer, "\n"); /* Grab the 1-minute average, ignore the rest */ *load = strtof(buffer, NULL); if(nl) { nl[0] = 0; } crm_debug("Current load is %f (full: %s)", *load, buffer); fclose(stream); return TRUE; } fclose(stream); return FALSE; } static bool throttle_io_load(float *load, unsigned int *blocked) { char buffer[64*1024]; FILE *stream = NULL; const char *loadfile = "/proc/stat"; if(load == NULL) { return FALSE; } stream = fopen(loadfile, "r"); if(stream == NULL) { int rc = errno; crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc); return FALSE; } if(fgets(buffer, sizeof(buffer), stream)) { /* Borrowed from procps-ng's sysinfo.c */ char *b = NULL; long long cpu_use = 0; long long cpu_nic = 0; long long cpu_sys = 0; long long cpu_idl = 0; long long cpu_iow = 0; /* not separated out until the 2.5.41 kernel */ long long cpu_xxx = 0; /* not separated out until the 2.6.0-test4 kernel */ long long cpu_yyy = 0; /* not separated out until the 2.6.0-test4 kernel */ long long cpu_zzz = 0; /* not separated out until the 2.6.11 kernel */ long long divo2 = 0; long long duse = 0; long long dsys = 0; long long didl =0; long long diow =0; long long dstl = 0; long long Div = 0; b = strstr(buffer, "cpu "); if(b) sscanf(b, "cpu %Lu %Lu %Lu %Lu %Lu %Lu %Lu %Lu", &cpu_use, &cpu_nic, &cpu_sys, &cpu_idl, &cpu_iow, &cpu_xxx, &cpu_yyy, &cpu_zzz); if(blocked) { b = strstr(buffer, "procs_blocked "); if(b) sscanf(b, "procs_blocked %u", blocked); } duse = cpu_use + cpu_nic; dsys = cpu_sys + cpu_xxx + cpu_yyy; didl = cpu_idl; diow = cpu_iow; dstl = cpu_zzz; Div = duse + dsys + didl + diow + dstl; if (!Div) Div = 1, didl = 1; divo2 = Div / 2UL; /* vmstat output: * * procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- * r b swpd free buff cache si so bi bo in cs us sy id wa * 1 0 5537800 958592 204180 1737740 1 1 12 15 0 0 2 1 97 0 * * The last four columns are calculated as: * * (unsigned)( (100*duse + divo2) / Div ), * (unsigned)( (100*dsys + divo2) / Div ), * (unsigned)( (100*didl + divo2) / Div ), * (unsigned)( (100*diow + divo2) / Div ) * */ *load = (diow + divo2) / Div; crm_debug("Current IO load is %f", *load); fclose(stream); return TRUE; } fclose(stream); return FALSE; } static enum throttle_state_e throttle_handle_load(float load, const char *desc) { if(load > THROTTLE_FACTOR_HIGH * throttle_load_target) { - crm_notice("High %s detected: %f (limit: %f)", desc, load, throttle_load_target); + crm_notice("High %s detected: %f", desc, load); return throttle_high; } else if(load > THROTTLE_FACTOR_MEDIUM * throttle_load_target) { - crm_info("Moderate %s detected: %f (limit: %f)", desc, load, throttle_load_target); + crm_info("Moderate %s detected: %f", desc, load); return throttle_med; } else if(load > THROTTLE_FACTOR_LOW * throttle_load_target) { - crm_debug("Noticable %s detected: %f (limit: %f)", desc, load, throttle_load_target); + crm_debug("Noticable %s detected: %f", desc, load); return throttle_low; } - crm_trace("Negligable %s detected: %f (limit: %f)", desc, load, throttle_load_target); + crm_trace("Negligable %s detected: %f", desc, load); return throttle_none; } static enum throttle_state_e throttle_mode(void) { float load; unsigned int blocked = 0; int cores = throttle_num_cores(); enum throttle_state_e mode = throttle_none; if(throttle_load_target <= 0) { /* If we ever make this a valid value, the cluster will at least behave as expected */ return mode; } if(throttle_load_avg(&load)) { float simple = load / cores; mode |= throttle_handle_load(simple, "CPU load"); } if(throttle_cib_load(&load)) { const char *desc = "CIB load"; if(load > 0.9) { crm_notice("High %s detected: %f", desc, load); mode |= throttle_high; } else if(load > 0.8) { crm_info("Moderate %s detected: %f", desc, load); mode |= throttle_med; } else if(load > 0.7) { crm_debug("Noticable %s detected: %f", desc, load); mode |= throttle_low; } else { crm_trace("Negligable %s detected: %f", desc, load); } } if(throttle_io_load(&load, &blocked)) { float blocked_ratio = 0.0; mode |= throttle_handle_load(load, "IO load"); if(cores) { blocked_ratio = blocked / cores; } else { blocked_ratio = blocked; } mode |= throttle_handle_load(blocked_ratio, "blocked IO ratio"); } if(mode & throttle_high) { return throttle_high; } else if(mode & throttle_med) { return throttle_med; } else if(mode & throttle_low) { return throttle_low; } return throttle_none; } static void throttle_send_command(enum throttle_state_e mode) { xmlNode *xml = NULL; - int cores = throttle_num_cores(); xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL); crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode); - crm_xml_add_int(xml, F_CRM_THROTTLE_CORES, cores); + crm_xml_add_int(xml, F_CRM_THROTTLE_MAX, throttle_job_max); send_cluster_message(NULL, crm_msg_crmd, xml, TRUE); free_xml(xml); crm_info("Updated throttle state to %.4x", mode); } static gboolean throttle_timer_cb(gpointer data) { static bool send_updates = FALSE; static enum throttle_state_e last = throttle_none; enum throttle_state_e now = throttle_mode(); if(send_updates == FALSE) { /* Optimize for the true case */ if(compare_version(fsa_our_dc_version, "3.0.8") < 0) { crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version); } else { send_updates = TRUE; } } if(send_updates && now != last) { crm_debug("New throttle mode: %.4x (was %.4x)", now, last); throttle_send_command(now); last = now; } return TRUE; } static void throttle_record_free(gpointer p) { struct throttle_record_s *r = p; free(r->node); free(r); } void -throttle_init(void) +throttle_update_job_max(const char *preference) { - float load = 0.0; - throttle_load_avg(&load); + int max = 0; + + throttle_job_max = 2 * throttle_num_cores(); + + if(preference) { + /* Global preference from the CIB */ + max = crm_int_helper(preference, NULL); + if(max > 0) { + throttle_job_max = max; + } + } + + preference = getenv("LRMD_MAX_CHILDREN"); + if(preference) { + /* Legacy env variable */ + max = crm_int_helper(preference, NULL); + if(max > 0) { + throttle_job_max = max; + } + } + + preference = getenv("PCMK_node_action_limit"); + if(preference) { + /* Per-node override */ + max = crm_int_helper(preference, NULL); + if(max > 0) { + throttle_job_max = max; + } + } +} - throttle_records = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, throttle_record_free); + +void +throttle_init(void) +{ + throttle_records = g_hash_table_new_full( + crm_str_hash, g_str_equal, NULL, throttle_record_free); throttle_timer = mainloop_timer_add("throttle", 30* 1000, TRUE, throttle_timer_cb, NULL); + + throttle_update_job_max(NULL); mainloop_timer_start(throttle_timer); } void throttle_fini(void) { mainloop_timer_del(throttle_timer); throttle_timer = NULL; g_hash_table_destroy(throttle_records); throttle_records = NULL; } int throttle_get_job_limit(const char *node) { int jobs = 1; - int job_max = throttle_job_max; struct throttle_record_s *r = NULL; r = g_hash_table_lookup(throttle_records, node); if(r == NULL) { r = calloc(1, sizeof(struct throttle_record_s)); r->node = strdup(node); r->mode = throttle_mode(); - r->cores = throttle_num_cores(); + r->max = throttle_job_max; crm_trace("Defaulting to local values for unknown node %s", node); g_hash_table_insert(throttle_records, r->node, r); } - if(job_max <= 0) { - job_max = r->cores * 2; - } - switch(r->mode) { case throttle_high: jobs = 1; /* At least one job must always be allowed */ break; case throttle_med: - jobs = QB_MAX(1, job_max / 4); + jobs = QB_MAX(1, r->max / 4); break; case throttle_low: - jobs = QB_MAX(1, job_max / 2); + jobs = QB_MAX(1, r->max / 2); break; case throttle_none: - jobs = QB_MAX(1, job_max); + jobs = QB_MAX(1, r->max); break; default: crm_err("Unknown throttle mode %.4x on %s", r->mode, node); break; } return jobs; } void throttle_update(xmlNode *xml) { - int cores = 0; + int max = 0; enum throttle_state_e mode = 0; struct throttle_record_s *r = NULL; const char *from = crm_element_value(xml, F_CRM_HOST_FROM); crm_element_value_int(xml, F_CRM_THROTTLE_MODE, (int*)&mode); - crm_element_value_int(xml, F_CRM_THROTTLE_CORES, &cores); + crm_element_value_int(xml, F_CRM_THROTTLE_MAX, &max); r = g_hash_table_lookup(throttle_records, from); if(r == NULL) { r = calloc(1, sizeof(struct throttle_record_s)); r->node = strdup(from); g_hash_table_insert(throttle_records, r->node, r); } - r->cores = cores; + r->max = max; r->mode = mode; - crm_debug("Host %s has %d cores and throttle mode %.4x. New job limit is %d", - from, cores, mode, throttle_get_job_limit(from)); + crm_debug("Host %s supports a maximum of %d jobs and throttle mode %.4x. New job limit is %d", + from, max, mode, throttle_get_job_limit(from)); } diff --git a/crmd/throttle.h b/crmd/throttle.h index a00393cf01..4fc9f1b6fa 100644 --- a/crmd/throttle.h +++ b/crmd/throttle.h @@ -1,27 +1,28 @@ /* * Copyright (C) 2013 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -extern int throttle_job_max; + extern float throttle_load_target; void throttle_init(void); void throttle_fini(void); int throttle_num_cores(void); void throttle_update(xmlNode *xml); +void throttle_update_job_max(const char *preference); int throttle_get_job_limit(const char *node); diff --git a/include/crm/msg_xml.h b/include/crm/msg_xml.h index ce18e8346c..03ea63fb9d 100644 --- a/include/crm/msg_xml.h +++ b/include/crm/msg_xml.h @@ -1,368 +1,368 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef XML_TAGS__H # define XML_TAGS__H # ifndef F_ORIG # define F_ORIG "src" # endif # ifndef F_SEQ # define F_SEQ "seq" # endif # ifndef F_SUBTYPE # define F_SUBTYPE "subt" # endif # ifndef F_TYPE # define F_TYPE "t" # endif # ifndef F_CLIENTNAME # define F_CLIENTNAME "cn" # endif # ifndef F_XML_TAGNAME # define F_XML_TAGNAME "__name__" # endif # ifndef T_CRM # define T_CRM "crmd" # endif # ifndef T_ATTRD # define T_ATTRD "attrd" # endif # define CIB_OPTIONS_FIRST "cib-bootstrap-options" # define F_CRM_DATA "crm_xml" # define F_CRM_TASK "crm_task" # define F_CRM_HOST_TO "crm_host_to" # define F_CRM_MSG_TYPE F_SUBTYPE # define F_CRM_SYS_TO "crm_sys_to" # define F_CRM_SYS_FROM "crm_sys_from" # define F_CRM_HOST_FROM F_ORIG # define F_CRM_REFERENCE XML_ATTR_REFERENCE # define F_CRM_VERSION XML_ATTR_VERSION # define F_CRM_ORIGIN "origin" # define F_CRM_USER "crm_user" # define F_CRM_JOIN_ID "join_id" # define F_CRM_ELECTION_ID "election-id" # define F_CRM_ELECTION_AGE_S "election-age-sec" # define F_CRM_ELECTION_AGE_US "election-age-nano-sec" # define F_CRM_ELECTION_OWNER "election-owner" # define F_CRM_TGRAPH "crm-tgraph" # define F_CRM_TGRAPH_INPUT "crm-tgraph-in" # define F_CRM_THROTTLE_MODE "crm-limit-mode" -# define F_CRM_THROTTLE_CORES "crm-limit-cores" +# define F_CRM_THROTTLE_MAX "crm-limit-max" /*---- Common tags/attrs */ # define XML_DIFF_MARKER "__crm_diff_marker__" # define XML_ATTR_TAGNAME F_XML_TAGNAME # define XML_TAG_CIB "cib" # define XML_TAG_FAILED "failed" # define XML_ATTR_CRM_VERSION "crm_feature_set" # define XML_ATTR_DIGEST "digest" # define XML_ATTR_VALIDATION "validate-with" # define XML_ATTR_QUORUM_PANIC "no-quorum-panic" # define XML_ATTR_HAVE_QUORUM "have-quorum" # define XML_ATTR_EXPECTED_VOTES "expected-quorum-votes" # define XML_ATTR_GENERATION "epoch" # define XML_ATTR_GENERATION_ADMIN "admin_epoch" # define XML_ATTR_NUMUPDATES "num_updates" # define XML_ATTR_TIMEOUT "timeout" # define XML_ATTR_ORIGIN "crm-debug-origin" # define XML_ATTR_TSTAMP "crm-timestamp" # define XML_CIB_ATTR_WRITTEN "cib-last-written" # define XML_ATTR_VERSION "version" # define XML_ATTR_DESC "description" # define XML_ATTR_ID "id" # define XML_ATTR_IDREF "id-ref" # define XML_ATTR_ID_LONG "long-id" # define XML_ATTR_TYPE "type" # define XML_ATTR_FILTER_TYPE "type-filter" # define XML_ATTR_FILTER_ID "id-filter" # define XML_ATTR_FILTER_PRIORITY "priority-filter" # define XML_ATTR_VERBOSE "verbose" # define XML_ATTR_OP "op" # define XML_ATTR_DC "is_dc" # define XML_ATTR_DC_UUID "dc-uuid" # define XML_ATTR_UPDATE_ORIG "update-origin" # define XML_ATTR_UPDATE_CLIENT "update-client" # define XML_ATTR_UPDATE_USER "update-user" # define XML_BOOLEAN_TRUE "true" # define XML_BOOLEAN_FALSE "false" # define XML_BOOLEAN_YES XML_BOOLEAN_TRUE # define XML_BOOLEAN_NO XML_BOOLEAN_FALSE # define XML_TAG_OPTIONS "options" /*---- top level tags/attrs */ # define XML_MSG_TAG "crm_message" # define XML_MSG_TAG_DATA "msg_data" # define XML_ATTR_REQUEST "request" # define XML_ATTR_RESPONSE "response" # define XML_ATTR_UNAME "uname" # define XML_ATTR_UUID "id" # define XML_ATTR_REFERENCE "reference" # define XML_FAIL_TAG_RESOURCE "failed_resource" # define XML_FAILRES_ATTR_RESID "resource_id" # define XML_FAILRES_ATTR_REASON "reason" # define XML_FAILRES_ATTR_RESSTATUS "resource_status" # define XML_CRM_TAG_PING "ping_response" # define XML_PING_ATTR_STATUS "result" # define XML_PING_ATTR_SYSFROM "crm_subsystem" # define XML_TAG_FRAGMENT "cib_fragment" # define XML_ATTR_RESULT "result" # define XML_ATTR_SECTION "section" # define XML_FAIL_TAG_CIB "failed_update" # define XML_FAILCIB_ATTR_ID "id" # define XML_FAILCIB_ATTR_OBJTYPE "object_type" # define XML_FAILCIB_ATTR_OP "operation" # define XML_FAILCIB_ATTR_REASON "reason" /*---- CIB specific tags/attrs */ # define XML_CIB_TAG_SECTION_ALL "all" # define XML_CIB_TAG_CONFIGURATION "configuration" # define XML_CIB_TAG_STATUS "status" # define XML_CIB_TAG_RESOURCES "resources" # define XML_CIB_TAG_NODES "nodes" # define XML_CIB_TAG_DOMAINS "domains" # define XML_CIB_TAG_CONSTRAINTS "constraints" # define XML_CIB_TAG_CRMCONFIG "crm_config" # define XML_CIB_TAG_OPCONFIG "op_defaults" # define XML_CIB_TAG_RSCCONFIG "rsc_defaults" # define XML_CIB_TAG_ACLS "acls" # define XML_CIB_TAG_STATE "node_state" # define XML_CIB_TAG_NODE "node" # define XML_CIB_TAG_DOMAIN "domain" # define XML_CIB_TAG_CONSTRAINT "constraint" # define XML_CIB_TAG_NVPAIR "nvpair" # define XML_CIB_TAG_PROPSET "cluster_property_set" # define XML_TAG_ATTR_SETS "instance_attributes" # define XML_TAG_META_SETS "meta_attributes" # define XML_TAG_ATTRS "attributes" # define XML_TAG_PARAMS "parameters" # define XML_TAG_PARAM "param" # define XML_TAG_UTILIZATION "utilization" # define XML_TAG_RESOURCE_REF "resource_ref" # define XML_CIB_TAG_RESOURCE "primitive" # define XML_CIB_TAG_GROUP "group" # define XML_CIB_TAG_INCARNATION "clone" # define XML_CIB_TAG_MASTER "master" # define XML_CIB_TAG_RSC_TEMPLATE "template" # define XML_RSC_ATTR_RESTART "restart-type" # define XML_RSC_ATTR_ORDERED "ordered" # define XML_RSC_ATTR_INTERLEAVE "interleave" # define XML_RSC_ATTR_INCARNATION "clone" # define XML_RSC_ATTR_INCARNATION_MAX "clone-max" # define XML_RSC_ATTR_INCARNATION_NODEMAX "clone-node-max" # define XML_RSC_ATTR_MASTER_MAX "master-max" # define XML_RSC_ATTR_MASTER_NODEMAX "master-node-max" # define XML_RSC_ATTR_STATE "clone-state" # define XML_RSC_ATTR_MANAGED "is-managed" # define XML_RSC_ATTR_TARGET_ROLE "target-role" # define XML_RSC_ATTR_UNIQUE "globally-unique" # define XML_RSC_ATTR_NOTIFY "notify" # define XML_RSC_ATTR_STICKINESS "resource-stickiness" # define XML_RSC_ATTR_FAIL_STICKINESS "migration-threshold" # define XML_RSC_ATTR_FAIL_TIMEOUT "failure-timeout" # define XML_RSC_ATTR_MULTIPLE "multiple-active" # define XML_RSC_ATTR_PRIORITY "priority" # define XML_RSC_ATTR_REQUIRES "requires" # define XML_RSC_ATTR_CONTAINER "container" # define XML_RSC_ATTR_INTERNAL_RSC "internal_rsc" # define XML_OP_ATTR_ON_FAIL "on-fail" # define XML_OP_ATTR_START_DELAY "start-delay" # define XML_OP_ATTR_ALLOW_MIGRATE "allow-migrate" # define XML_OP_ATTR_DEPENDENT "dependent-on" # define XML_OP_ATTR_ORIGIN "interval-origin" # define XML_OP_ATTR_PENDING "record-pending" # define XML_CIB_TAG_LRM "lrm" # define XML_LRM_TAG_RESOURCES "lrm_resources" # define XML_LRM_TAG_RESOURCE "lrm_resource" # define XML_LRM_TAG_AGENTS "lrm_agents" # define XML_LRM_TAG_AGENT "lrm_agent" # define XML_LRM_TAG_RSC_OP "lrm_rsc_op" # define XML_AGENT_ATTR_CLASS "class" # define XML_AGENT_ATTR_PROVIDER "provider" # define XML_LRM_TAG_ATTRIBUTES "attributes" # define XML_CIB_ATTR_REPLACE "replace" # define XML_CIB_ATTR_SOURCE "source" # define XML_CIB_ATTR_HEALTH "health" # define XML_CIB_ATTR_WEIGHT "weight" # define XML_CIB_ATTR_PRIORITY "priority" # define XML_CIB_ATTR_CLEAR "clear_on" # define XML_CIB_ATTR_SOURCE "source" # define XML_NODE_JOIN_STATE "join" # define XML_NODE_EXPECTED "expected" # define XML_NODE_IN_CLUSTER "in_ccm" # define XML_NODE_IS_PEER "crmd" # define XML_NODE_IS_REMOTE "remote_node" # define XML_CIB_ATTR_SHUTDOWN "shutdown" # define XML_CIB_ATTR_STONITH "stonith" # define XML_LRM_ATTR_INTERVAL "interval" # define XML_LRM_ATTR_TASK "operation" # define XML_LRM_ATTR_TASK_KEY "operation_key" # define XML_LRM_ATTR_TARGET "on_node" /*! used for remote nodes. * For remote nodes the action is routed to the 'on_node' * node location, and then from there if 'exec_on' is set * the host will execute the action on the remote node * it controls. */ # define XML_LRM_ATTR_ROUTER_NODE "router_node" # define XML_LRM_ATTR_TARGET_UUID "on_node_uuid" # define XML_LRM_ATTR_RSCID "rsc-id" # define XML_LRM_ATTR_OPSTATUS "op-status" # define XML_LRM_ATTR_RC "rc-code" # define XML_LRM_ATTR_CALLID "call-id" # define XML_LRM_ATTR_OP_DIGEST "op-digest" # define XML_LRM_ATTR_OP_RESTART "op-force-restart" # define XML_LRM_ATTR_RESTART_DIGEST "op-restart-digest" # define XML_RSC_OP_LAST_CHANGE "last-rc-change" # define XML_RSC_OP_LAST_RUN "last-run" # define XML_RSC_OP_T_EXEC "exec-time" # define XML_RSC_OP_T_QUEUE "queue-time" # define XML_LRM_ATTR_MIGRATE_SOURCE "migrate_source" # define XML_LRM_ATTR_MIGRATE_TARGET "migrate_target" # define XML_TAG_GRAPH "transition_graph" # define XML_GRAPH_TAG_RSC_OP "rsc_op" # define XML_GRAPH_TAG_PSEUDO_EVENT "pseudo_event" # define XML_GRAPH_TAG_CRM_EVENT "crm_event" # define XML_TAG_RULE "rule" # define XML_RULE_ATTR_SCORE "score" # define XML_RULE_ATTR_SCORE_ATTRIBUTE "score-attribute" # define XML_RULE_ATTR_SCORE_MANGLED "score-attribute-mangled" # define XML_RULE_ATTR_ROLE "role" # define XML_RULE_ATTR_RESULT "result" # define XML_RULE_ATTR_BOOLEAN_OP "boolean-op" # define XML_TAG_EXPRESSION "expression" # define XML_EXPR_ATTR_ATTRIBUTE "attribute" # define XML_EXPR_ATTR_OPERATION "operation" # define XML_EXPR_ATTR_VALUE "value" # define XML_EXPR_ATTR_TYPE "type" # define XML_CONS_TAG_RSC_DEPEND "rsc_colocation" # define XML_CONS_TAG_RSC_ORDER "rsc_order" # define XML_CONS_TAG_RSC_LOCATION "rsc_location" # define XML_CONS_TAG_RSC_TICKET "rsc_ticket" # define XML_CONS_TAG_RSC_SET "resource_set" # define XML_CONS_ATTR_SYMMETRICAL "symmetrical" # define XML_COLOC_ATTR_SOURCE "rsc" # define XML_COLOC_ATTR_SOURCE_ROLE "rsc-role" # define XML_COLOC_ATTR_TARGET "with-rsc" # define XML_COLOC_ATTR_TARGET_ROLE "with-rsc-role" # define XML_COLOC_ATTR_NODE_ATTR "node-attribute" # define XML_COLOC_ATTR_SOURCE_INSTANCE "rsc-instance" # define XML_COLOC_ATTR_TARGET_INSTANCE "with-rsc-instance" # define XML_ORDER_ATTR_FIRST "first" # define XML_ORDER_ATTR_THEN "then" # define XML_ORDER_ATTR_FIRST_ACTION "first-action" # define XML_ORDER_ATTR_THEN_ACTION "then-action" # define XML_ORDER_ATTR_FIRST_INSTANCE "first-instance" # define XML_ORDER_ATTR_THEN_INSTANCE "then-instance" # define XML_ORDER_ATTR_KIND "kind" # define XML_TICKET_ATTR_TICKET "ticket" # define XML_TICKET_ATTR_LOSS_POLICY "loss-policy" # define XML_NVPAIR_ATTR_NAME "name" # define XML_NVPAIR_ATTR_VALUE "value" # define XML_NODE_ATTR_STATE "state" # define XML_CONFIG_ATTR_DC_DEADTIME "dc-deadtime" # define XML_CONFIG_ATTR_ELECTION_FAIL "election-timeout" # define XML_CONFIG_ATTR_FORCE_QUIT "shutdown-escalation" # define XML_CONFIG_ATTR_RECHECK "cluster-recheck-interval" # define XML_CIB_TAG_GENERATION_TUPPLE "generation_tuple" # define XML_ATTR_TRANSITION_MAGIC "transition-magic" # define XML_ATTR_TRANSITION_KEY "transition-key" # define XML_ATTR_TE_NOWAIT "op_no_wait" # define XML_ATTR_TE_TARGET_RC "op_target_rc" # define XML_ATTR_TE_ALLOWFAIL "op_allow_fail" # define XML_ATTR_LRM_PROBE "lrm-is-probe" # define XML_TAG_TRANSIENT_NODEATTRS "transient_attributes" # define XML_TAG_DIFF_ADDED "diff-added" # define XML_TAG_DIFF_REMOVED "diff-removed" # define XML_ACL_TAG_USER "acl_user" # define XML_ACL_TAG_ROLE "acl_role" # define XML_ACL_TAG_ROLE_REF "role_ref" # define XML_ACL_TAG_READ "read" # define XML_ACL_TAG_WRITE "write" # define XML_ACL_TAG_DENY "deny" # define XML_ACL_ATTR_REF "ref" # define XML_ACL_ATTR_TAG "tag" # define XML_ACL_ATTR_XPATH "xpath" # define XML_ACL_ATTR_ATTRIBUTE "attribute" # define XML_CIB_TAG_TICKETS "tickets" # define XML_CIB_TAG_TICKET_STATE "ticket_state" # define XML_TAG_FENCING_TOPOLOGY "fencing-topology" # define XML_TAG_FENCING_LEVEL "fencing-level" # define XML_ATTR_STONITH_INDEX "index" # define XML_ATTR_STONITH_TARGET "target" # define XML_ATTR_STONITH_DEVICES "devices" # include # define ID(x) crm_element_value(x, XML_ATTR_ID) # define INSTANCE(x) crm_element_value(x, XML_CIB_ATTR_INSTANCE) # define TSTAMP(x) crm_element_value(x, XML_ATTR_TSTAMP) # define TYPE(x) crm_element_name(x) # define NAME(x) crm_element_value(x, XML_NVPAIR_ATTR_NAME) # define VALUE(x) crm_element_value(x, XML_NVPAIR_ATTR_VALUE) #endif