diff --git a/crm/tengine/actions.c b/crm/tengine/actions.c index 1279c074b3..1e1dfbe76a 100644 --- a/crm/tengine/actions.c +++ b/crm/tengine/actions.c @@ -1,500 +1,506 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include char *te_uuid = NULL; IPC_Channel *crm_ch = NULL; void send_rsc_command(crm_action_t *action); extern crm_action_timer_t *transition_timer; static void te_start_action_timer(crm_action_t *action) { crm_malloc0(action->timer, sizeof(crm_action_timer_t)); action->timer->timeout = action->timeout; action->timer->reason = timeout_action_warn; action->timer->action = action; action->timer->source_id = Gmain_timeout_add( action->timer->timeout, action_timer_callback, (void*)action->timer); CRM_ASSERT(action->timer->source_id != 0); } static gboolean te_pseudo_action(crm_graph_t *graph, crm_action_t *pseudo) { crm_info("Pseudo action %d fired and confirmed", pseudo->id); pseudo->confirmed = TRUE; update_graph(graph, pseudo); trigger_graph(); return TRUE; } void send_stonith_update(stonith_ops_t * op) { enum cib_errors rc = cib_ok; const char *target = op->node_name; const char *uuid = op->node_uuid; /* zero out the node-status & remove all LRM status info */ crm_data_t *node_state = create_xml_node(NULL, XML_CIB_TAG_STATE); CRM_CHECK(op->node_name != NULL, return); CRM_CHECK(op->node_uuid != NULL, return); crm_xml_add(node_state, XML_ATTR_UUID, uuid); crm_xml_add(node_state, XML_ATTR_UNAME, target); crm_xml_add(node_state, XML_CIB_ATTR_HASTATE, DEADSTATUS); crm_xml_add(node_state, XML_CIB_ATTR_INCCM, XML_BOOLEAN_NO); crm_xml_add(node_state, XML_CIB_ATTR_CRMDSTATE, OFFLINESTATUS); crm_xml_add(node_state, XML_CIB_ATTR_JOINSTATE, CRMD_JOINSTATE_DOWN); crm_xml_add(node_state, XML_CIB_ATTR_EXPSTATE, CRMD_JOINSTATE_DOWN); crm_xml_add(node_state, XML_CIB_ATTR_REPLACE, XML_CIB_TAG_LRM); crm_xml_add(node_state, XML_ATTR_ORIGIN, __FUNCTION__); rc = te_cib_conn->cmds->update( te_cib_conn, XML_CIB_TAG_STATUS, node_state, NULL, cib_quorum_override|cib_scope_local); if(rc < cib_ok) { crm_err("CIB update failed: %s", cib_error2string(rc)); abort_transition( INFINITY, tg_shutdown, "CIB update failed", node_state); } else { /* delay processing the trigger until the update completes */ add_cib_op_callback(rc, FALSE, NULL, cib_fencing_updated); } free_xml(node_state); return; } static gboolean te_fence_node(crm_graph_t *graph, crm_action_t *action) { const char *id = NULL; const char *uuid = NULL; const char *target = NULL; const char *type = NULL; stonith_ops_t * st_op = NULL; id = ID(action->xml); target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID); type = g_hash_table_lookup(action->params, crm_meta_name("stonith_action")); CRM_CHECK(id != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); CRM_CHECK(uuid != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); CRM_CHECK(type != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); CRM_CHECK(target != NULL, crm_log_xml_warn(action->xml, "BadAction"); return FALSE); te_log_action(LOG_INFO, "Executing %s fencing operation (%s) on %s (timeout=%d)", type, id, target, transition_graph->transition_timeout / 2); + /* Block until we can connect... */ + while(stonith_src == NULL) { + te_connect_stonith(); + sleep(1); + } + crm_malloc0(st_op, sizeof(stonith_ops_t)); if(safe_str_eq(type, "poweroff")) { st_op->optype = POWEROFF; } else { st_op->optype = RESET; } st_op->timeout = transition_graph->transition_timeout / 2; st_op->node_name = crm_strdup(target); st_op->node_uuid = crm_strdup(uuid); st_op->private_data = generate_transition_key( transition_graph->id, action->id, te_uuid); CRM_ASSERT(stonithd_input_IPC_channel() != NULL); if (ST_OK != stonithd_node_fence( st_op )) { crm_err("Cannot fence %s: stonithd_node_fence() call failed ", target); } return TRUE; } static gboolean te_crm_command(crm_graph_t *graph, crm_action_t *action) { char *value = NULL; char *counter = NULL; HA_Message *cmd = NULL; const char *id = NULL; const char *task = NULL; const char *on_node = NULL; gboolean ret = TRUE; id = ID(action->xml); task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, te_log_action(LOG_ERR, "Corrupted command (id=%s) %s: no node", crm_str(id), crm_str(task)); return FALSE); te_log_action(LOG_INFO, "Executing crm-event (%s): %s on %s", crm_str(id), crm_str(task), on_node); cmd = create_request(task, NULL, on_node, CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL); counter = generate_transition_key( transition_graph->id, action->id, te_uuid); crm_xml_add(cmd, XML_ATTR_TRANSITION_KEY, counter); ret = send_ipc_message(crm_ch, cmd); crm_free(counter); crm_msg_del(cmd); value = g_hash_table_lookup(action->params, crm_meta_name(XML_ATTR_TE_NOWAIT)); if(ret == FALSE) { crm_err("Action %d failed: send", action->id); return FALSE; } else if(crm_is_true(value)) { crm_info("Skipping wait for %d", action->id); action->confirmed = TRUE; update_graph(graph, action); trigger_graph(); } else if(ret && action->timeout > 0) { crm_debug("Setting timer for action %d",action->id); action->timer->reason = timeout_action_warn; te_start_action_timer(action); } return TRUE; } static gboolean te_rsc_command(crm_graph_t *graph, crm_action_t *action) { /* never overwrite stop actions in the CIB with * anything other than completed results * * Writing pending stops makes it look like the * resource is running again */ const char *task = NULL; const char *on_node = NULL; action->executed = FALSE; on_node = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); CRM_CHECK(on_node != NULL && strlen(on_node) != 0, te_log_action(LOG_ERR, "Corrupted command(id=%s) %s: no node", ID(action->xml), crm_str(task)); return FALSE); send_rsc_command(action); return TRUE; } gboolean cib_action_update(crm_action_t *action, int status) { char *code = NULL; char *digest = NULL; crm_data_t *params = NULL; crm_data_t *state = NULL; crm_data_t *rsc = NULL; crm_data_t *xml_op = NULL; crm_data_t *action_rsc = NULL; char *op_id = NULL; enum cib_errors rc = cib_ok; const char *name = NULL; const char *value = NULL; const char *rsc_id = NULL; const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); const char *task_uuid = crm_element_value( action->xml, XML_LRM_ATTR_TASK_KEY); const char *target_uuid = crm_element_value( action->xml, XML_LRM_ATTR_TARGET_UUID); int call_options = cib_quorum_override|cib_scope_local; crm_warn("%s %d: %s on %s timed out", crm_element_name(action->xml), action->id, task_uuid, target); action_rsc = find_xml_node(action->xml, XML_CIB_TAG_RESOURCE, TRUE); if(action_rsc == NULL) { return FALSE; } rsc_id = ID(action_rsc); CRM_CHECK(rsc_id != NULL, crm_log_xml_err(action->xml, "Bad:action"); return FALSE); code = crm_itoa(status); /* update the CIB */ state = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(state, XML_ATTR_UUID, target_uuid); crm_xml_add(state, XML_ATTR_UNAME, target); rsc = create_xml_node(state, XML_CIB_TAG_LRM); crm_xml_add(rsc, XML_ATTR_ID, target_uuid); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCES); rsc = create_xml_node(rsc, XML_LRM_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, rsc_id); name = XML_ATTR_TYPE; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_CLASS; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); name = XML_AGENT_ATTR_PROVIDER; value = crm_element_value(action_rsc, name); crm_xml_add(rsc, name, value); xml_op = create_xml_node(rsc, XML_LRM_TAG_RSC_OP); crm_xml_add(xml_op, XML_ATTR_ID, task); op_id = generate_op_key(rsc_id, task, action->interval); crm_xml_add(xml_op, XML_ATTR_ID, op_id); crm_free(op_id); crm_xml_add(xml_op, XML_LRM_ATTR_TASK, task); crm_xml_add(xml_op, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET); crm_xml_add(xml_op, XML_LRM_ATTR_OPSTATUS, code); crm_xml_add(xml_op, XML_LRM_ATTR_CALLID, "-1"); crm_xml_add_int(xml_op, XML_LRM_ATTR_INTERVAL, action->interval); crm_xml_add(xml_op, XML_LRM_ATTR_RC, code); crm_xml_add(xml_op, XML_ATTR_ORIGIN, __FUNCTION__); crm_free(code); code = generate_transition_key(transition_graph->id, action->id,te_uuid); crm_xml_add(xml_op, XML_ATTR_TRANSITION_KEY, code); crm_free(code); code = generate_transition_magic( crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY), status, status); crm_xml_add(xml_op, XML_ATTR_TRANSITION_MAGIC, code); crm_free(code); params = find_xml_node(action->xml, "attributes", TRUE); params = copy_xml(params); filter_action_parameters(params, CRM_FEATURE_SET); digest = calculate_xml_digest(params, TRUE); crm_xml_add(xml_op, XML_LRM_ATTR_OP_DIGEST, digest); crm_free(digest); free_xml(params); crm_debug_3("Updating CIB with \"%s\" (%s): %s %s on %s", status<0?"new action":XML_ATTR_TIMEOUT, crm_element_name(action->xml), crm_str(task), rsc_id, target); rc = te_cib_conn->cmds->update( te_cib_conn, XML_CIB_TAG_STATUS, state, NULL, call_options); crm_debug("Updating CIB with %s action %d: %s %s on %s (call_id=%d)", op_status2text(status), action->id, task_uuid, rsc_id, target, rc); add_cib_op_callback(rc, FALSE, NULL, cib_action_updated); free_xml(state); action->sent_update = TRUE; if(rc < cib_ok) { return FALSE; } return TRUE; } void send_rsc_command(crm_action_t *action) { HA_Message *cmd = NULL; crm_data_t *rsc_op = NULL; char *counter = NULL; const char *task = NULL; const char *value = NULL; const char *on_node = NULL; const char *task_uuid = NULL; CRM_ASSERT(action != NULL); CRM_ASSERT(action->xml != NULL); rsc_op = action->xml; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); task_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY); on_node = crm_element_value(rsc_op, XML_LRM_ATTR_TARGET); counter = generate_transition_key( transition_graph->id, action->id, te_uuid); crm_xml_add(rsc_op, XML_ATTR_TRANSITION_KEY, counter); crm_info("Initiating action %d: %s on %s", action->id, task_uuid, on_node); crm_free(counter); if(rsc_op != NULL) { crm_log_xml_debug_2(rsc_op, "Performing"); } cmd = create_request(CRM_OP_INVOKE_LRM, rsc_op, on_node, CRM_SYSTEM_LRMD, CRM_SYSTEM_TENGINE, NULL); #if 1 send_ipc_message(crm_ch, cmd); #else /* test the TE timer/recovery code */ if((action->id % 11) == 0) { crm_err("Faking lost action %d: %s", action->id, task_uuid); } else { send_ipc_message(crm_ch, cmd); } #endif crm_msg_del(cmd); action->executed = TRUE; value = g_hash_table_lookup(action->params, crm_meta_name(XML_ATTR_TE_NOWAIT)); if(crm_is_true(value)) { crm_debug("Skipping wait for %d", action->id); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); } else if(action->timeout > 0) { int action_timeout = (2 * action->timeout) + transition_graph->network_delay; crm_debug_3("Setting timer for action %s", task_uuid); if(transition_graph->transition_timeout < action_timeout) { crm_debug("Action %d:" " Increasing transition %d timeout to %d (2*%d + %d)", action->id, transition_graph->id, action_timeout, action->timeout, transition_graph->network_delay); transition_graph->transition_timeout = action_timeout; } te_start_action_timer(action); } } crm_graph_functions_t te_graph_fns = { te_pseudo_action, te_rsc_command, te_crm_command, te_fence_node }; extern GMainLoop* mainloop; void notify_crmd(crm_graph_t *graph) { HA_Message *cmd = NULL; int log_level = LOG_DEBUG; const char *op = CRM_OP_TEABORT; int pending_callbacks = num_cib_op_callbacks(); stop_te_timer(transition_timer); if(pending_callbacks != 0) { crm_warn("Delaying completion until all CIB updates complete"); return; } CRM_CHECK(graph->complete, graph->complete = TRUE); switch(graph->completion_action) { case tg_stop: op = CRM_OP_TECOMPLETE; log_level = LOG_INFO; break; case tg_abort: case tg_restart: op = CRM_OP_TEABORT; break; case tg_shutdown: crm_info("Exiting after transition"); if (mainloop != NULL && g_main_is_running(mainloop)) { g_main_quit(mainloop); return; } exit(LSB_EXIT_OK); } te_log_action(log_level, "Transition %d status: %s - %s", graph->id, op, crm_str(graph->abort_reason)); print_graph(LOG_DEBUG_3, graph); cmd = create_request( op, NULL, NULL, CRM_SYSTEM_DC, CRM_SYSTEM_TENGINE, NULL); if(graph->abort_reason != NULL) { ha_msg_add(cmd, "message", graph->abort_reason); } send_ipc_message(crm_ch, cmd); crm_msg_del(cmd); graph->abort_reason = NULL; graph->completion_action = tg_restart; } diff --git a/crm/tengine/callbacks.c b/crm/tengine/callbacks.c index a2d8fba43f..1fdcf3794e 100644 --- a/crm/tengine/callbacks.c +++ b/crm/tengine/callbacks.c @@ -1,585 +1,590 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include void te_update_confirm(const char *event, HA_Message *msg); void te_update_diff(const char *event, HA_Message *msg); crm_data_t *need_abort(crm_data_t *update); void cib_fencing_updated(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data); extern char *te_uuid; gboolean shuttingdown = FALSE; crm_graph_t *transition_graph; GTRIGSource *transition_trigger = NULL; crm_action_timer_t *transition_timer = NULL; static gboolean start_global_timer(crm_action_timer_t *timer, int timeout) { CRM_ASSERT(timer != NULL); CRM_CHECK(timer > 0, return FALSE); CRM_CHECK(timer->source_id == 0, return FALSE); if(timeout <= 0) { crm_err("Tried to start timer with period: %d", timeout); } else if(timer->source_id == 0) { crm_debug_2("Starting abort timer: %dms", timeout); timer->timeout = timeout; timer->source_id = Gmain_timeout_add( timeout, global_timer_callback, (void*)timer); CRM_ASSERT(timer->source_id != 0); return TRUE; } else { crm_err("Timer is already active with period: %d", timer->timeout); } return FALSE; } void te_update_diff(const char *event, HA_Message *msg) { int rc = -1; const char *op = NULL; crm_data_t *diff = NULL; crm_data_t *aborted = NULL; const char *set_name = NULL; int diff_add_updates = 0; int diff_add_epoch = 0; int diff_add_admin_epoch = 0; int diff_del_updates = 0; int diff_del_epoch = 0; int diff_del_admin_epoch = 0; if(msg == NULL) { crm_err("NULL update"); return; } ha_msg_value_int(msg, F_CIB_RC, &rc); op = cl_get_string(msg, F_CIB_OPERATION); if(rc < cib_ok) { crm_debug_2("Ignoring failed %s operation: %s", op, cib_error2string(rc)); return; } diff = get_message_xml(msg, F_CIB_UPDATE_RESULT); cib_diff_version_details( diff, &diff_add_admin_epoch, &diff_add_epoch, &diff_add_updates, &diff_del_admin_epoch, &diff_del_epoch, &diff_del_updates); crm_debug("Processing diff (%s): %d.%d.%d -> %d.%d.%d", op, diff_del_admin_epoch,diff_del_epoch,diff_del_updates, diff_add_admin_epoch,diff_add_epoch,diff_add_updates); log_cib_diff(LOG_DEBUG_2, diff, op); set_name = "diff-added"; if(diff != NULL) { crm_data_t *section = NULL; crm_data_t *change_set = find_xml_node(diff, set_name, FALSE); change_set = find_xml_node(change_set, XML_TAG_CIB, FALSE); if(change_set != NULL) { crm_debug_2("Checking status changes"); section=get_object_root(XML_CIB_TAG_STATUS,change_set); } if(section != NULL) { extract_event(section); } crm_debug_2("Checking change set: %s", set_name); aborted = need_abort(change_set); } set_name = "diff-removed"; if(diff != NULL && aborted == NULL) { crm_data_t *attrs = NULL; crm_data_t *status = NULL; crm_data_t *change_set = find_xml_node(diff, set_name, FALSE); change_set = find_xml_node(change_set, XML_TAG_CIB, FALSE); crm_debug_2("Checking change set: %s", set_name); aborted = need_abort(change_set); if(aborted == NULL && change_set != NULL) { status = get_object_root(XML_CIB_TAG_STATUS, change_set); xml_child_iter_filter( status, node_state, XML_CIB_TAG_STATE, attrs = find_xml_node( node_state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); if(attrs != NULL) { crm_info("Aborting on "XML_TAG_TRANSIENT_NODEATTRS" deletions"); abort_transition(INFINITY, tg_restart, XML_TAG_TRANSIENT_NODEATTRS, attrs); } ); } } if(aborted != NULL) { abort_transition( INFINITY, tg_restart, "Non-status change", NULL); } free_xml(diff); return; } gboolean process_te_message(HA_Message *msg, crm_data_t *xml_data, IPC_Channel *sender) { crm_data_t *xml_obj = NULL; const char *from = cl_get_string(msg, F_ORIG); const char *sys_to = cl_get_string(msg, F_CRM_SYS_TO); const char *sys_from = cl_get_string(msg, F_CRM_SYS_FROM); const char *ref = cl_get_string(msg, XML_ATTR_REFERENCE); const char *op = cl_get_string(msg, F_CRM_TASK); const char *type = cl_get_string(msg, F_CRM_MSG_TYPE); crm_debug_2("Processing %s (%s) message", op, ref); crm_log_message(LOG_DEBUG_3, msg); + if(stonith_src == NULL) { + te_connect_stonith(); + } if(op == NULL){ /* error */ } else if(strcasecmp(op, CRM_OP_HELLO) == 0) { /* ignore */ } else if(sys_to == NULL || strcasecmp(sys_to, CRM_SYSTEM_TENGINE) != 0) { crm_debug_2("Bad sys-to %s", crm_str(sys_to)); return FALSE; } else if(safe_str_eq(op, CRM_OP_INVOKE_LRM) && safe_str_eq(sys_from, CRM_SYSTEM_LRMD) /* && safe_str_eq(type, XML_ATTR_RESPONSE) */ ){ #if CRM_DEPRECATED_SINCE_2_0_4 if(safe_str_eq(crm_element_name(xml_data), XML_TAG_CIB)) { xml_obj = xml_data; } else { xml_obj = find_xml_node(xml_data, XML_TAG_CIB, TRUE); } #else xml_obj = xml_data; CRM_CHECK(xml_obj != NULL, crm_log_message_adv(LOG_ERR, "Invalid (N)ACK", msg); return FALSE); #endif CRM_CHECK(xml_obj != NULL, crm_log_message_adv(LOG_ERR, "Invalid (N)ACK", msg); return FALSE); xml_obj = get_object_root(XML_CIB_TAG_STATUS, xml_obj); CRM_CHECK(xml_obj != NULL, crm_log_message_adv(LOG_ERR, "Invalid (N)ACK", msg); return FALSE); crm_log_message_adv(LOG_DEBUG_2, "Processing (N)ACK", msg); crm_info("Processing (N)ACK %s from %s", cl_get_string(msg, XML_ATTR_REFERENCE), from); extract_event(xml_obj); } else if(safe_str_eq(type, XML_ATTR_RESPONSE)) { crm_err("Message was a response not a request. Discarding"); return TRUE; } else if(strcasecmp(op, CRM_OP_TRANSITION) == 0) { const char *graph_file = cl_get_string(msg, F_CRM_TGRAPH); const char *graph_input = cl_get_string(msg, F_CRM_TGRAPH_INPUT); CRM_CHECK(graph_file != NULL || xml_data != NULL, crm_err("No graph provided"); crm_log_message(LOG_WARNING, msg); return TRUE); if(transition_graph->complete == FALSE) { crm_info("Another transition is already active"); abort_transition( INFINITY, tg_restart, "Transition Active", NULL); } else { destroy_graph(transition_graph); crm_debug("Processing graph derived from %s", graph_input); if(graph_file == NULL) { transition_graph = unpack_graph(xml_data); } else { crm_data_t *graph_data = NULL; FILE *graph_fd = fopen(graph_file, "r"); CRM_CHECK(graph_fd != NULL, cl_perror("Could not open graph file %s", graph_file); return TRUE); graph_data = file2xml(graph_fd, FALSE); transition_graph = unpack_graph(graph_data); free_xml(graph_data); unlink(graph_file); fclose(graph_fd); } start_global_timer(transition_timer, transition_graph->transition_timeout); trigger_graph(); print_graph(LOG_DEBUG_2, transition_graph); } } else if(strcasecmp(op, CRM_OP_TE_HALT) == 0) { abort_transition(INFINITY, tg_stop, "Peer Halt", NULL); } else if(strcasecmp(op, CRM_OP_TEABORT) == 0) { abort_transition(INFINITY, tg_restart, "Peer Cancelled", NULL); } else { crm_err("Unknown command: %s::%s from %s", type, op, sys_from); } crm_debug_3("finished processing message"); return TRUE; } void tengine_stonith_callback(stonith_ops_t * op) { const char *allow_fail = NULL; int stonith_id = -1; int transition_id = -1; char *uuid = NULL; crm_action_t *stonith_action = NULL; if(op == NULL) { crm_err("Called with a NULL op!"); return; } crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", op->call_id, op->optype, op->node_name, op->op_result, (char *)op->node_list, op->private_data); /* this will mark the event complete if a match is found */ CRM_CHECK(op->private_data != NULL, return); /* filter out old STONITH actions */ CRM_CHECK(decode_transition_key( op->private_data, &uuid, &transition_id, &stonith_id), crm_err("Invalid event detected"); goto bail; ); if(transition_graph->complete || stonith_id < 0 || safe_str_neq(uuid, te_uuid) || transition_graph->id != transition_id) { crm_info("Ignoring STONITH action initiated outside" " of the current transition"); } stonith_action = get_action(stonith_id, TRUE); if(stonith_action == NULL) { crm_err("Stonith action not matched"); goto bail; } switch(op->op_result) { case STONITH_SUCCEEDED: send_stonith_update(op); break; case STONITH_CANNOT: case STONITH_TIMEOUT: case STONITH_GENERIC: stonith_action->failed = TRUE; allow_fail = g_hash_table_lookup( stonith_action->params, crm_meta_name(XML_ATTR_TE_ALLOWFAIL)); if(FALSE == crm_is_true(allow_fail)) { crm_err("Stonith of %s failed (%d)..." " aborting transition.", op->node_name, op->op_result); abort_transition(INFINITY, tg_restart, "Stonith failed", NULL); } break; default: crm_err("Unsupported action result: %d", op->op_result); abort_transition(INFINITY, tg_restart, "Unsupport Stonith result", NULL); } update_graph(transition_graph, stonith_action); trigger_graph(); bail: crm_free(uuid); return; } + void tengine_stonith_connection_destroy(gpointer user_data) { -#if 0 - crm_err("Fencing daemon has left us: Shutting down...NOW"); - /* shutdown properly later */ - CRM_CHECK(FALSE/* fencing daemon died */); -#else crm_err("Fencing daemon has left us"); -#endif + stonith_src = NULL; + if(stonith_src == NULL) { + sleep(2); + if(te_connect_stonith() == FALSE) { + crm_err("Fencing daemon is still not connected..."); + } + } return; } gboolean tengine_stonith_dispatch(IPC_Channel *sender, void *user_data) { int lpc = 0; while(stonithd_op_result_ready()) { if (sender->ch_status == IPC_DISCONNECT) { /* The message which was pending for us is that * the IPC status is now IPC_DISCONNECT */ break; } if(ST_FAIL == stonithd_receive_ops_result(FALSE)) { crm_err("stonithd_receive_ops_result() failed"); } else { lpc++; } } crm_debug_2("Processed %d messages", lpc); if (sender->ch_status == IPC_DISCONNECT) { return FALSE; } return TRUE; } void cib_fencing_updated(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { trigger_graph(); if(rc < cib_ok) { crm_err("CIB update failed: %s", cib_error2string(rc)); crm_log_xml_warn(msg, "[Failed Update]"); } } void cib_action_updated(const HA_Message *msg, int call_id, int rc, crm_data_t *output, void *user_data) { trigger_graph(); if(rc < cib_ok) { crm_err("Update %d FAILED: %s", call_id, cib_error2string(rc)); } } gboolean action_timer_callback(gpointer data) { crm_action_timer_t *timer = NULL; if(data == NULL) { crm_err("Timer popped with no data"); return FALSE; } timer = (crm_action_timer_t*)data; stop_te_timer(timer); crm_warn("Timer popped (abort_level=%d, complete=%s)", transition_graph->abort_priority, transition_graph->complete?"true":"false"); CRM_CHECK(timer->action != NULL, return FALSE); if(transition_graph->complete) { crm_err("Ignoring timeout while not in transition"); } else if(timer->reason == timeout_action_warn) { print_action( LOG_WARNING,"Action missed its timeout", timer->action); } else { /* fail the action */ cib_action_update(timer->action, LRM_OP_TIMEOUT); } return FALSE; } static int unconfirmed_actions(gboolean send_updates) { int unconfirmed = 0; const char *key = NULL; const char *task = NULL; crm_debug_2("Unconfirmed actions..."); slist_iter( synapse, synapse_t, transition_graph->synapses, lpc, /* lookup event */ slist_iter( action, crm_action_t, synapse->actions, lpc2, if(action->executed == FALSE) { continue; } else if(action->confirmed) { continue; } unconfirmed++; task = crm_element_value(action->xml,XML_LRM_ATTR_TASK); key = crm_element_value( action->xml,XML_LRM_ATTR_TASK_KEY); crm_info("Action %s %d unconfirmed from peer", key, action->id); if(action->type != action_type_rsc) { continue; } else if(send_updates == FALSE) { continue; } else if(safe_str_eq(task, "cancel")) { /* we dont need to update the CIB with these */ continue; } else if(safe_str_eq(task, "stop")) { /* *never* update the CIB with these */ continue; } cib_action_update(action, LRM_OP_PENDING); ); ); if(unconfirmed > 0) { crm_err("Waiting on %d unconfirmed actions", unconfirmed); } return unconfirmed; } gboolean global_timer_callback(gpointer data) { crm_action_timer_t *timer = NULL; if(data == NULL) { crm_err("Timer popped with no data"); return FALSE; } timer = (crm_action_timer_t*)data; stop_te_timer(timer); crm_warn("Timer popped (abort_level=%d, complete=%s)", transition_graph->abort_priority, transition_graph->complete?"true":"false"); CRM_CHECK(timer->action == NULL, return FALSE); if(transition_graph->complete) { crm_err("Ignoring timeout while not in transition"); } else if(timer->reason == timeout_abort) { int unconfirmed = unconfirmed_actions(FALSE); crm_warn("Transition abort timeout reached..." " marking transition complete."); transition_graph->complete = TRUE; abort_transition(INFINITY, tg_restart, "Global Timeout", NULL); if(unconfirmed != 0) { crm_warn("Writing %d unconfirmed actions to the CIB", unconfirmed); unconfirmed_actions(TRUE); } } return FALSE; } gboolean te_graph_trigger(gpointer user_data) { int timeout = 0; enum transition_status graph_rc = -1; if(transition_graph->complete) { notify_crmd(transition_graph); return TRUE; } graph_rc = run_graph(transition_graph); timeout = transition_graph->transition_timeout; print_graph(LOG_DEBUG_3, transition_graph); if(graph_rc == transition_active) { crm_debug_3("Transition not yet complete"); stop_te_timer(transition_timer); start_global_timer(transition_timer, timeout); return TRUE; } else if(graph_rc == transition_pending) { crm_debug_3("Transition not yet complete - no actions fired"); return TRUE; } if(graph_rc != transition_complete) { crm_err("Transition failed: %s", transition_status(graph_rc)); print_graph(LOG_WARNING, transition_graph); } transition_graph->complete = TRUE; notify_crmd(transition_graph); return TRUE; } diff --git a/crm/tengine/main.c b/crm/tengine/main.c index f9ad278c7e..c6f3180b19 100644 --- a/crm/tengine/main.c +++ b/crm/tengine/main.c @@ -1,259 +1,237 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define SYS_NAME CRM_SYSTEM_TENGINE #define OPTARGS "hVc" GMainLoop* mainloop = NULL; const char* crm_system_name = SYS_NAME; cib_t *te_cib_conn = NULL; extern GTRIGSource *transition_trigger; extern crm_action_timer_t *transition_timer; void usage(const char* cmd, int exit_status); int te_init(void); gboolean tengine_shutdown(int nsig, gpointer unused); extern void te_update_confirm(const char *event, HA_Message *msg); extern void te_update_diff(const char *event, HA_Message *msg); extern crm_graph_functions_t te_graph_fns; int main(int argc, char ** argv) { int flag; int rc = 0; int argerr = 0; gboolean allow_cores = TRUE; crm_log_init(crm_system_name, LOG_INFO, TRUE, FALSE, 0, NULL); G_main_add_SignalHandler( G_PRIORITY_HIGH, SIGTERM, tengine_shutdown, NULL, NULL); transition_trigger = G_main_add_TriggerHandler( G_PRIORITY_LOW, te_graph_trigger, NULL, NULL); crm_debug_3("Begining option processing"); while ((flag = getopt(argc, argv, OPTARGS)) != EOF) { switch(flag) { case 'V': alter_debug(DEBUG_INC); break; case 'h': /* Help message */ usage(crm_system_name, LSB_EXIT_OK); break; case 'c': allow_cores = TRUE; break; default: ++argerr; break; } } crm_debug_3("Option processing complete"); if (optind > argc) { ++argerr; } if (argerr) { usage(crm_system_name,LSB_EXIT_GENERIC); } /* read local config file */ crm_debug_3("Starting..."); rc = te_init(); return rc; } - int te_init(void) { int init_ok = TRUE; init_client_ipc_comms( CRM_SYSTEM_CRMD, subsystem_msg_dispatch, (void*)process_te_message, &crm_ch); if(crm_ch != NULL) { send_hello_message(crm_ch, "1234", CRM_SYSTEM_TENGINE, "0", "1"); } else { init_ok = FALSE; crm_err("Could not connect to the CRMd"); } if(init_ok) { crm_debug_4("Creating CIB connection"); te_cib_conn = cib_new(); if(te_cib_conn == NULL) { init_ok = FALSE; } } if(init_ok) { crm_debug_4("Connecting to the CIB"); if(cib_ok != te_cib_conn->cmds->signon( te_cib_conn, crm_system_name, cib_command)) { crm_err("Could not connect to the CIB"); init_ok = FALSE; } } if(init_ok) { crm_debug_4("Setting CIB notification callback"); if(cib_ok != te_cib_conn->cmds->add_notify_callback( te_cib_conn, T_CIB_DIFF_NOTIFY, te_update_diff)) { crm_err("Could not set CIB notification callback"); init_ok = FALSE; } } - if(init_ok && ST_OK != stonithd_signon(crm_system_name)) { - crm_err("Could not sign up to stonithd"); -/* init_ok = FALSE; */ - } - - if(init_ok && ST_OK != stonithd_set_stonith_ops_callback( - tengine_stonith_callback)) { - crm_err("Could not set stonith callback"); - stonithd_signoff(); -/* init_ok = FALSE; */ - } - if(init_ok) { - IPC_Channel *fence_ch = stonithd_input_IPC_channel(); - - if(fence_ch == NULL) { - } else if(NULL == G_main_add_IPC_Channel( - G_PRIORITY_LOW, fence_ch, FALSE, - tengine_stonith_dispatch, NULL, - tengine_stonith_connection_destroy)) { - crm_err("Failed to add Fencing channel to our mainloop"); - init_ok = FALSE; - } + init_ok = te_connect_stonith(); } if(init_ok) { cl_uuid_t new_uuid; char uuid_str[UU_UNPARSE_SIZEOF]; cl_uuid_generate(&new_uuid); cl_uuid_unparse(&new_uuid, uuid_str); te_uuid = crm_strdup(uuid_str); crm_info("Registering TE UUID: %s", te_uuid); set_graph_functions(&te_graph_fns); /* create a blank one */ transition_graph = unpack_graph(NULL); transition_graph->complete = TRUE; transition_graph->abort_reason = "DC Takeover"; transition_graph->completion_action = tg_restart; crm_malloc0(transition_timer, sizeof(crm_action_timer_t)); transition_timer->source_id = 0; transition_timer->reason = timeout_abort; transition_timer->action = NULL; } if(init_ok) { /* Create the mainloop and run it... */ crm_info("Starting %s", crm_system_name); mainloop = g_main_new(FALSE); g_main_run(mainloop); return_to_orig_privs(); crm_info("Exiting %s", crm_system_name); } else { crm_warn("Initialization errors, %s not starting.", crm_system_name); } destroy_graph(transition_graph); crm_free(transition_timer); te_cib_conn->cmds->signoff(te_cib_conn); cib_delete(te_cib_conn); te_cib_conn = NULL; stonithd_signoff(); crm_free(te_uuid); if(init_ok) { return 0; } return 1; } void usage(const char* cmd, int exit_status) { FILE* stream; stream = exit_status ? stderr : stdout; fprintf(stream, "usage: %s [-srkh]" "[-c configure file]\n", cmd); /* fprintf(stream, "\t-d\tsets debug level\n"); */ /* fprintf(stream, "\t-s\tgets daemon status\n"); */ /* fprintf(stream, "\t-r\trestarts daemon\n"); */ /* fprintf(stream, "\t-k\tstops daemon\n"); */ /* fprintf(stream, "\t-h\thelp message\n"); */ fflush(stream); exit(exit_status); } gboolean shuttingdown; gboolean tengine_shutdown(int nsig, gpointer unused) { shuttingdown = TRUE; abort_transition(INFINITY, tg_shutdown, "Shutdown", NULL); return TRUE; } diff --git a/crm/tengine/tengine.h b/crm/tengine/tengine.h index 28bcb58cbc..d5ec78df73 100644 --- a/crm/tengine/tengine.h +++ b/crm/tengine/tengine.h @@ -1,65 +1,68 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef TENGINE__H #define TENGINE__H #include #include #include extern IPC_Channel *crm_ch; extern GMainLoop* mainloop; /* tengine */ extern crm_action_t *match_down_event( int rc, const char *target, const char *filter); extern void send_stonith_update(stonith_ops_t * op); extern gboolean cib_action_update(crm_action_t *action, int status); /* utils */ extern crm_action_t *get_action(int id, gboolean confirmed); extern gboolean stop_te_timer(crm_action_timer_t *timer); extern const char *get_rsc_state(const char *task, op_status_t status); /* unpack */ extern gboolean extract_event(crm_data_t *msg); extern gboolean process_te_message( HA_Message * msg, crm_data_t *xml_data, IPC_Channel *sender); extern crm_graph_t *transition_graph; extern GTRIGSource *transition_trigger; extern char *te_uuid; extern cib_t *te_cib_conn; extern void notify_crmd(crm_graph_t *graph); #include extern void trigger_graph_processing(const char *fn, int line); extern void abort_transition_graph( int abort_priority, enum transition_action abort_action, const char *abort_text, crm_data_t *reason, const char *fn, int line); #define trigger_graph() trigger_graph_processing(__FUNCTION__, __LINE__) #define abort_transition(pri, action, text, reason) \ abort_transition_graph(pri, action, text, reason,__FUNCTION__,__LINE__); +extern gboolean te_connect_stonith(void); +extern GCHSource *stonith_src; + #endif diff --git a/crm/tengine/utils.c b/crm/tengine/utils.c index 5f271ba92c..e6ec768b41 100644 --- a/crm/tengine/utils.c +++ b/crm/tengine/utils.c @@ -1,100 +1,128 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include extern cib_t *te_cib_conn; +GCHSource *stonith_src = NULL; + +gboolean +te_connect_stonith(void) +{ + IPC_Channel *fence_ch = NULL; + CRM_CHECK(stonith_src == NULL, return TRUE); + + crm_info("Attempting connection to fencing daemon..."); + + if(ST_OK != stonithd_signon("tengine")) { + crm_err("Could not sign in"); + return FALSE; + } + + CRM_ASSERT(stonithd_set_stonith_ops_callback( + tengine_stonith_callback) == ST_OK); + + fence_ch = stonithd_input_IPC_channel(); + CRM_ASSERT(fence_ch != NULL); + + stonith_src = G_main_add_IPC_Channel( + G_PRIORITY_LOW, fence_ch, FALSE, tengine_stonith_dispatch, NULL, + tengine_stonith_connection_destroy); + + CRM_ASSERT(stonith_src != NULL); + return TRUE; +} gboolean stop_te_timer(crm_action_timer_t *timer) { const char *timer_desc = "action timer"; if(timer == NULL) { return FALSE; } if(timer->reason == timeout_abort) { timer_desc = "global timer"; } if(timer->source_id != 0) { crm_debug_2("Stopping %s", timer_desc); Gmain_timeout_remove(timer->source_id); timer->source_id = 0; } else { return FALSE; } return TRUE; } void trigger_graph_processing(const char *fn, int line) { G_main_set_trigger(transition_trigger); crm_debug_2("%s:%d - Triggered graph processing", fn, line); } void abort_transition_graph( int abort_priority, enum transition_action abort_action, const char *abort_text, crm_data_t *reason, const char *fn, int line) { int log_level = LOG_DEBUG; /* if(abort_priority >= INFINITY) { log_level = LOG_INFO; } */ update_abort_priority( transition_graph, abort_priority, abort_action, abort_text); do_crm_log(log_level, "%s:%d - Triggered graph processing : %s", fn, line, abort_text); if(reason != NULL) { const char *magic = crm_element_value( reason, XML_ATTR_TRANSITION_MAGIC); if(magic) { do_crm_log(log_level, "Caused by update to %s: %s", ID(reason), magic); } else { crm_log_xml(log_level, "Cause", reason); } } if(transition_graph->complete) { notify_crmd(transition_graph); } else { G_main_set_trigger(transition_trigger); } } diff --git a/cts/CM_LinuxHAv2.py.in b/cts/CM_LinuxHAv2.py.in index ec1cd8a2c8..767de204d7 100755 --- a/cts/CM_LinuxHAv2.py.in +++ b/cts/CM_LinuxHAv2.py.in @@ -1,690 +1,693 @@ #!@PYTHON@ '''CTS: Cluster Testing System: LinuxHA v2 dependent modules... ''' __copyright__=''' Author: Huang Zhen Copyright (C) 2004 International Business Machines Additional Audits, Revised Start action, Default Configuration: Copyright (C) 2004 Andrew Beekhof ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import os,sys,CTS,CTSaudits,CTStests, warnings from CTS import * from CM_hb import HeartbeatCM from CTSaudits import ClusterAudit from CTStests import * from CIB import * try: from xml.dom.minidom import * except ImportError: sys.__stdout__.write("Python module xml.dom.minidom not found\n") sys.__stdout__.write("Please install python-xml or similar before continuing\n") sys.__stdout__.flush() sys.exit(1) ####################################################################### # # LinuxHA v2 dependent modules # ####################################################################### class LinuxHAv2(HeartbeatCM): ''' The linux-ha version 2 cluster manager class. It implements the things we need to talk to and manipulate linux-ha version 2 clusters ''' def __init__(self, Environment, randseed=None): HeartbeatCM.__init__(self, Environment, randseed=randseed) self.clear_cache = 0 self.cib_installed = 0 self.config = None self.cluster_monitor = 0 self.use_short_names = 1 self.update({ "Name" : "linux-ha-v2", "DeadTime" : 300, "StartTime" : 300, # Max time to start up "StableTime" : 30, "StartCmd" : "@INITDIR@/heartbeat@INIT_EXT@ start > /dev/null 2>&1", "StopCmd" : "@INITDIR@/heartbeat@INIT_EXT@ stop > /dev/null 2>&1", "ElectionCmd" : "@sbindir@/crmadmin -E %s", "StatusCmd" : "@sbindir@/crmadmin -S %s 2>/dev/null", "EpocheCmd" : "@sbindir@/ccm_tool -e", "QuorumCmd" : "@sbindir@/ccm_tool -q", "CibQuery" : "@sbindir@/cibadmin -Ql", "ParitionCmd" : "@sbindir@/ccm_tool -p", "IsRscRunning" : "@libdir@/heartbeat/lrmadmin -E %s monitor 0 0 EVERYTIME 2>/dev/null|grep return", "ExecuteRscOp" : "@libdir@/heartbeat/lrmadmin -n %s -E %s %s 0 %d EVERYTIME 2>/dev/null", "CIBfile" : "%s:@HA_VARLIBDIR@/heartbeat/crm/cib.xml", "TmpDir" : "/tmp", "BreakCommCmd2" : "@HA_NOARCHDATAHBDIR@/TestHeartbeatComm break-communication %s>/dev/null 2>&1", "IsIPAddrRscRunning" : "", "StandbyCmd" : "@sbindir@/crm_standby -U %s -v %s 2>/dev/null", "UUIDQueryCmd" : "@sbindir@/crmadmin -N", "StandbyQueryCmd" : "@sbindir@/crm_standby -GQ -U %s 2>/dev/null", # Patterns to look for in the log files for various occasions... "Pat:DC_IDLE" : "crmd.*State transition.*-> S_IDLE", # This wont work if we have multiple partitions # Use: "Pat:They_started" : "%s crmd:.*State transition.*-> S_NOT_DC", "Pat:They_started" : "Updating node state to member for %s", "Pat:We_started" : "%s crmd:.* State transition.*-> S_IDLE", "Pat:We_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete", "Pat:Logd_stopped" : "%s logd:.*Exiting write process", "Pat:They_stopped" : "%s crmd:.*LOST:.* %s ", "Pat:All_stopped" : "heartbeat.*%s.*Heartbeat shutdown complete", "Pat:They_dead" : "node %s.*: is dead", "Pat:TransitionComplete" : "Transition status: Complete: complete", # Bad news Regexes. Should never occur. "BadRegexes" : ( r"ERROR:", r"CRIT:", r"Shutting down\.", r"Forcing shutdown\.", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r", exiting\.", r"WARN.*Ignoring HA message.*vote.*not in our membership list", r"pengine.*Attempting recovery of resource", r"tengine.*is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r"No need to invoke the TE", r":global_timer_callback", r"Updating failcount for ", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"apply_xml_diff: Diff application failed!", ), }) del self["Standby"] if self.Env["DoBSC"]: del self["Pat:They_stopped"] del self["Pat:Logd_stopped"] self.Env["use_logd"] = 0 self.check_transitions = 0 self.check_elections = 0 self.CIBsync = {} self.default_cts_cib=CIB(self).cib() self.debug(self.default_cts_cib) def errorstoignore(self): # At some point implement a more elegant solution that # also produces a report at the end '''Return list of errors which are known and very noisey should be ignored''' if 1: return [ "crmadmin:", "ERROR: Message hist queue is filling up" ] return [] def install_config(self, node): if not self.ns.WaitForNodeToComeUp(node): self.log("Node %s is not up." % node) return None if not self.CIBsync.has_key(node) and self.Env["ClobberCIB"] == 1: self.CIBsync[node] = 1 self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml") self.rsh.remote_py(node, "os", "system", "rm -f @HA_VARLIBDIR@/heartbeat/crm/cib.xml.sig") # Only install the CIB on the first node, all the other ones will pick it up from there if self.cib_installed == 1: return None self.cib_installed = 1 if self.Env["CIBfilename"] == None: self.debug("Installing Generated CIB on node %s" %(node)) warnings.filterwarnings("ignore") cib_file=os.tmpnam() warnings.resetwarnings() os.system("rm -f "+cib_file) self.debug("Creating new CIB for " + node + " in: " + cib_file) os.system("echo \'" + self.default_cts_cib + "\' > " + cib_file) if 0!=self.rsh.echo_cp(None, cib_file, node, "@HA_VARLIBDIR@/heartbeat/crm/cib.xml"): raise ValueError("Can not create CIB on %s "%node) os.system("rm -f "+cib_file) else: self.debug("Installing CIB (%s) on node %s" %(self.Env["CIBfilename"], node)) if 0!=self.rsh.cp(self.Env["CIBfilename"], "root@" + (self["CIBfile"]%node)): raise ValueError("Can not scp file to %s "%node) self.rsh.remote_py(node, "os", "system", "chown @HA_CCMUSER@ @HA_VARLIBDIR@/heartbeat/crm/cib.xml") def prepare(self): '''Finish the Initialization process. Prepare to test...''' for node in self.Env["nodes"]: self.ShouldBeStatus[node] = "" self.StataCM(node) def test_node_CM(self, node): '''Report the status of the cluster manager on a given node''' watchpats = [ ] watchpats.append("Current ping state: (S_IDLE|S_NOT_DC)") watchpats.append(self["Pat:They_started"]%node) idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats) idle_watch.setwatch() out=self.rsh.readaline(node, self["StatusCmd"]%node) ret= (string.find(out, 'ok') != -1) self.debug("Node %s status: %s" %(node, out)) if not ret: if self.ShouldBeStatus[node] == self["up"]: self.log( "Node status for %s is %s but we think it should be %s" %(node, self["down"], self.ShouldBeStatus[node])) self.ShouldBeStatus[node]=self["down"] return 0 if self.ShouldBeStatus[node] == self["down"]: self.log( "Node status for %s is %s but we think it should be %s: %s" %(node, self["up"], self.ShouldBeStatus[node], out)) self.ShouldBeStatus[node]=self["up"] # check the output first - because syslog-ng looses messages if string.find(out, 'S_NOT_DC') != -1: # Up and stable return 2 if string.find(out, 'S_IDLE') != -1: # Up and stable return 2 # fall back to syslog-ng and wait if not idle_watch.look(): # just up self.debug("Warn: Node %s is unstable: %s" %(node, out)) return 1 # Up and stable return 2 # Is the node up or is the node down def StataCM(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) > 0: return 1 return None # Being up and being stable is not the same question... def node_stable(self, node): '''Report the status of the cluster manager on a given node''' if self.test_node_CM(node) == 2: return 1 self.log("Warn: Node %s not stable" %(node)) return None def cluster_stable(self, timeout=None): watchpats = [ ] watchpats.append("Current ping state: S_IDLE") watchpats.append(self["Pat:DC_IDLE"]) if timeout == None: timeout = self["DeadTime"] idle_watch = CTS.LogWatcher(self["LogFileName"], watchpats, timeout) idle_watch.setwatch() any_up = 0 for node in self.Env["nodes"]: # have each node dump its current state if self.ShouldBeStatus[node] == self["up"]: self.rsh.readaline(node, (self["StatusCmd"] %node) ) any_up = 1 if any_up == 0: self.debug("Cluster is inactive") return 1 ret = idle_watch.look() if ret: self.debug(ret) return 1 self.log("Warn: Cluster Master not IDLE after %ds" % timeout) return None def is_node_dc(self, node, status_line=None): rc = 0 if not status_line: status_line = self.rsh.readaline(node, self["StatusCmd"]%node) if not status_line: rc = 0 elif string.find(status_line, 'S_IDLE') != -1: rc = 1 elif string.find(status_line, 'S_INTEGRATION') != -1: rc = 1 elif string.find(status_line, 'S_FINALIZE_JOIN') != -1: rc = 1 elif string.find(status_line, 'S_POLICY_ENGINE') != -1: rc = 1 elif string.find(status_line, 'S_TRANSITION_ENGINE') != -1: rc = 1 if rc == 1: self.debug("%s _is_ the DC" % node) return rc def active_resources(self, node): (rc, output) = self.rsh.remote_py( node, "os", "system", """@sbindir@/crm_mon -1 | grep "Started %s" """ % node) resources = [] for line in output: fields = line.split() resources.append(fields[0]) return resources def ResourceOp(self, resource, op, node, interval=0, app="lrmadmin"): ''' Execute an operation on a resource ''' self.rsh.readaline(node, self["ExecuteRscOp"] % (app, resource, op, interval)) return self.rsh.lastrc def ResourceLocation(self, rid): ResourceNodes = [] for node in self.Env["nodes"]: if self.ShouldBeStatus[node] == self["up"]: if self.ResourceOp(rid, "monitor", node) == 0: ResourceNodes.append(node) return ResourceNodes def isolate_node(self, node, allowlist): '''isolate the communication between the nodes''' rc = self.rsh(node, self["BreakCommCmd2"]%allowlist) if rc == 0: return 1 else: self.log("Could not break the communication from node: %s",node) return None def Configuration(self): if self.config: return self.config.getElementsByTagName('configuration')[0] warnings.filterwarnings("ignore") cib_file=os.tmpnam() warnings.resetwarnings() os.system("rm -f "+cib_file) if self.Env["ClobberCIB"] == 1: if self.Env["CIBfilename"] == None: self.debug("Creating new CIB in: " + cib_file) os.system("echo \'"+ self.default_cts_cib +"\' > "+ cib_file) else: os.system("cp "+self.Env["CIBfilename"]+" "+cib_file) else: if 0 != self.rsh.echo_cp( self.Env["nodes"][0], "@HA_VARLIBDIR@/heartbeat/crm/cib.xml", None, cib_file): raise ValueError("Can not copy file to %s, maybe permission denied"%cib_file) self.config = parse(cib_file) os.remove(cib_file) return self.config.getElementsByTagName('configuration')[0] def Resources(self): ResourceList = [] #read resources in cib configuration = self.Configuration() resources = configuration.getElementsByTagName('resources')[0] rscs = configuration.getElementsByTagName('primitive') incs = configuration.getElementsByTagName('clone') groups = configuration.getElementsByTagName('group') for rsc in rscs: if rsc in resources.childNodes: ResourceList.append(HAResource(self,rsc)) for grp in groups: for rsc in rscs: if rsc in grp.childNodes: if self.use_short_names: resource = HAResource(self,rsc) else: resource = HAResource(self,rsc,grp.getAttribute('id')) ResourceList.append(resource) for inc in incs: max = 0 inc_name = inc.getAttribute("id") instance_attributes = inc.getElementsByTagName('instance_attributes')[0] attributes = instance_attributes.getElementsByTagName('attributes')[0] nvpairs = attributes.getElementsByTagName('nvpair') for nvpair in nvpairs: if nvpair.getAttribute("name") == "clone_max": max = int(nvpair.getAttribute("value")) inc_rsc = inc.getElementsByTagName('primitive')[0] for i in range(0,max): rsc = HAResource(self,inc_rsc) rsc.inc_no = i rsc.inc_name = inc_name rsc.inc_max = max if self.use_short_names: rsc.rid = rsc.rid + ":%d"%i else: rsc.rid = inc_name+":"+rsc.rid + ":%d"%i rsc.Instance = rsc.rid ResourceList.append(rsc) return ResourceList def ResourceGroups(self): GroupList = [] #read resources in cib configuration = self.Configuration() groups = configuration.getElementsByTagName('group') rscs = configuration.getElementsByTagName('primitive') for grp in groups: group = [] GroupList.append(group) for rsc in rscs: if rsc in grp.childNodes: if self.use_short_names: resource = HAResource(self,rsc) else: resource = HAResource(self,rsc,grp.getAttribute('id')) group.append(resource) return GroupList def Dependencies(self): DependencyList = [] #read dependency in cib configuration=self.Configuration() constraints=configuration.getElementsByTagName('constraints')[0] rsc_to_rscs=configuration.getElementsByTagName('rsc_to_rsc') for node in rsc_to_rscs: dependency = {} dependency["id"]=node.getAttribute('id') dependency["from"]=node.getAttribute('from') dependency["to"]=node.getAttribute('to') dependency["type"]=node.getAttribute('type') dependency["strength"]=node.getAttribute('strength') DependencyList.append(dependency) return DependencyList def find_partitions(self): ccm_partitions = [] for node in self.Env["nodes"]: self.debug("Retrieving partition details for %s" %node) if self.ShouldBeStatus[node] == self["up"]: partition = self.rsh.readaline(node, self["ParitionCmd"]) if not partition: self.log("no partition details for %s" %node) elif len(partition) > 2: partition = partition[:-1] found=0 for a_partition in ccm_partitions: if partition == a_partition: found = 1 if found == 0: self.debug("Adding partition from %s: %s" %(node, partition)) ccm_partitions.append(partition) else: self.log("bad partition details for %s" %node) return ccm_partitions def HasQuorum(self, node_list): # If we are auditing a partition, then one side will # have quorum and the other not. # So the caller needs to tell us which we are checking # If no value for node_list is specified... assume all nodes if not node_list: node_list = self.Env["nodes"] for node in node_list: if self.ShouldBeStatus[node] == self["up"]: quorum = self.rsh.readaline(node, self["QuorumCmd"]) if string.find(quorum, "1") != -1: return 1 elif string.find(quorum, "0") != -1: return 0 else: self.log("WARN: Unexpected quorum test result from "+ node +":"+ quorum) return 0 def Components(self): complist = [] common_ignore = [ "Pending action:", "ERROR: crm_log_message_adv:", "ERROR: MSG: No message to dump", "pending LRM operations at shutdown", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Sending message to CIB service FAILED", "apply_xml_diff: Diff application failed!", "crmd: .*Action A_RECOVER .* not supported", "send_ipc_message: IPC Channel to .* is not connected", "ERROR: cib_native_msgready: Message pending on command channel", ] stonith_ignore = [ "update_failcount: Updating failcount for child_DoFencing", + "ERROR: stonithd_signon: to fetch reply failed.", + "ERROR: te_connect_stonith: Could not sign in", ] stonith_ignore.extend(common_ignore) complist.append(Process("ccm", 0, [ "Respawning client .*crmd", "Respawning client .*cib", "CCM connection appears to have failed", "Client .*cib exited with return code 2.", "Client .*crmd exited with return code 2.", "crmd: .*Action A_RECOVER .* not supported", "crmd: .*Input I_TERMINATE from do_recover", "Exiting to recover from CCM connection failure", "crmd:.*do_exit: Could not recover from internal error", "crmd: .*I_ERROR.*(ccm_dispatch|crmd_cib_connection_destroy)", # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "State transition S_STARTING -> S_PENDING", ], [], common_ignore, self)) complist.append(Process("cib", 0, [ "Respawning client .*crmd", "Lost connection to the CIB service", "Connection to the CIB terminated...", "Client .*crmd exited with return code 2.", "State transition S_STARTING -> S_PENDING", "crmd: .*Input I_TERMINATE from do_recover", "crmd: .*I_ERROR.*crmd_cib_connection_destroy", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, self)) complist.append(Process("lrmd", 0, [ "LRM Connection failed", "Respawning client .*crmd", "crmd: .*I_ERROR.*lrm_dispatch", "State transition S_STARTING -> S_PENDING", "Client .*crmd exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, self)) complist.append(Process("crmd", 0, [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling Node .* for STONITH", # "Executing .* fencing operation", # "tengine_stonith_callback: .*result=0", "State transition S_STARTING -> S_PENDING", ], [ "tengine: .*ERROR: subsystem_msg_dispatch: The server .* has left us: Shutting down...NOW", "pengine: .*ERROR: subsystem_msg_dispatch: The server .* has left us: Shutting down...NOW", ], common_ignore, self)) complist.append(Process("pengine", 1, [ "Respawning client .*crmd", "Client .*crmd exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, self)) complist.append(Process("tengine", 1, [ "Respawning client .*crmd", "Client .*crmd exited with return code 2.", "crmd: .*Input I_TERMINATE from do_recover", "crmd:.*do_exit: Could not recover from internal error", ], [], common_ignore, self)) if self.Env["DoFencing"] == 1 : complist.append(Process("stonithd", 0, [], [ "tengine_stonith_connection_destroy: Fencing daemon has left us", + "Attempting connection to fencing daemon", ], stonith_ignore, self)) # complist.append(Process("heartbeat", 0, [], [], [], self)) return complist def NodeUUID(self, node): lines = self.rsh.readlines(node, self["UUIDQueryCmd"]) for line in lines: self.debug("UUIDLine:"+ line) m = re.search(r'%s.+\((.+)\)' % node, line) if m: return m.group(1) return "" def StandbyStatus(self, node): out=self.rsh.readaline(node, self["StandbyQueryCmd"]%node) if not out: return "off" out = out[:-1] self.debug("Standby result: "+out) return out # status == "on" : Enter Standby mode # status == "off": Enter Active mode def SetStandbyMode(self, node, status): current_status = self.StandbyStatus(node) cmd = self["StandbyCmd"] % (node, status) ret = self.rsh(node, cmd) return True class HAResource(Resource): def __init__(self, cm, node, group=None): ''' Get information from xml node ''' if group == None : self.rid = str(node.getAttribute('id')) else : self.rid = group + ":" + str(node.getAttribute('id')) self.rclass = str(node.getAttribute('class')) self.rtype = str(node.getAttribute('type')) self.inc_name = None self.inc_no = -1 self.inc_max = -1 self.rparameters = {} nvpairs = [] list = node.getElementsByTagName('instance_attributes') if len(list) > 0: attributes = list[0] list = attributes.getElementsByTagName('attributes') if len(list) > 0: parameters = list[0] nvpairs = parameters.getElementsByTagName('nvpair') for nvpair in nvpairs: name=nvpair.getAttribute('name') value=nvpair.getAttribute('value') self.rparameters[name]=value # This should normally be called first... FIXME! Resource.__init__(self, cm, self.rtype, self.rid) # resources that dont need quorum will have: # ops = node.getElementsByTagName('op') for op in ops: if op.getAttribute('name') == "start" and op.getAttribute('prereq') == "nothing": self.needs_quorum = 0 def IsRunningOn(self, nodename): ''' This member function returns true if our resource is running on the given node in the cluster. We call the status operation for the resource script. ''' rc = self.CM.ResourceOp(self.rid, "monitor", nodename) return (rc == 0) def RunningNodes(self): return self.CM.ResourceLocation(self.rid) def Start(self, nodename): ''' This member function starts or activates the resource. ''' return self.CM.ResourceOp(self.rid, "start", nodename) def Stop(self, nodename): ''' This member function stops or deactivates the resource. ''' return self.CM.ResourceOp(self.rid, "stop", nodename) def IsWorkingCorrectly(self, nodename): return self.IsRunningOn(nodename) ####################################################################### # # A little test code... # # Which you are advised to completely ignore... # ####################################################################### if __name__ == '__main__': pass