diff --git a/crmd/lrm.c b/crmd/lrm.c index 5c01bd91d5..2e929456bc 100644 --- a/crmd/lrm.c +++ b/crmd/lrm.c @@ -1,1866 +1,1896 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include /* for access */ #include #include #include #include #include #include #include #include #include #include #include struct recurring_op_s { char *rsc_id; char *op_key; int call_id; int interval; + gboolean remove; + gboolean cancelled; }; char *make_stop_id(const char *rsc, int call_id); gboolean build_operation_update( xmlNode *rsc_list, lrm_op_t *op, const char *src, int lpc); gboolean build_active_RAs(xmlNode *rsc_list); gboolean is_rsc_active(const char *rsc_id); void do_update_resource(lrm_op_t *op); gboolean process_lrm_event(lrm_op_t *op); void do_lrm_rsc_op(lrm_rsc_t *rsc, const char *operation, xmlNode *msg, xmlNode *request); lrm_op_t *construct_op( xmlNode *rsc_op, const char *rsc_id, const char *operation); void send_direct_ack(const char *to_host, const char *to_sys, lrm_op_t* op, const char *rsc_id); void free_recurring_op(gpointer value); GHashTable *meta_hash = NULL; GHashTable *resources = NULL; GHashTable *pending_ops = NULL; GCHSource *lrm_source = NULL; int num_lrm_register_fails = 0; int max_lrm_register_fails = 30; /* A_LRM_CONNECT */ void do_lrm_control(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { int ret = HA_OK; if(action & A_LRM_DISCONNECT) { if(verify_stopped(cur_state, LOG_INFO) == FALSE) { crmd_fsa_stall(NULL); return; } if(lrm_source) { crm_debug("Removing LRM connection from MainLoop"); if(G_main_del_IPC_Channel(lrm_source) == FALSE) { crm_err("Could not remove LRM connection" " from MainLoop"); } lrm_source = NULL; } if(fsa_lrm_conn) { fsa_lrm_conn->lrm_ops->signoff(fsa_lrm_conn); crm_info("Disconnected from the LRM"); clear_bit_inplace(fsa_input_register, R_LRM_CONNECTED); } /* TODO: Clean up the hashtable */ } if(action & A_LRM_CONNECT) { ret = HA_OK; pending_ops = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, free_recurring_op); resources = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if(NULL == fsa_lrm_conn) { register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); ret = HA_FAIL; } if(ret == HA_OK) { crm_debug("Connecting to the LRM"); ret = fsa_lrm_conn->lrm_ops->signon( fsa_lrm_conn, CRM_SYSTEM_CRMD); } if(ret != HA_OK) { if(++num_lrm_register_fails < max_lrm_register_fails) { crm_warn("Failed to sign on to the LRM %d" " (%d max) times", num_lrm_register_fails, max_lrm_register_fails); crm_timer_start(wait_timer); crmd_fsa_stall(NULL); return; } } if(ret == HA_OK) { crm_debug_4("LRM: set_lrm_callback..."); ret = fsa_lrm_conn->lrm_ops->set_lrm_callback( fsa_lrm_conn, lrm_op_callback); if(ret != HA_OK) { crm_err("Failed to set LRM callbacks"); } } if(ret != HA_OK) { crm_err("Failed to sign on to the LRM %d" " (max) times", num_lrm_register_fails); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); return; } /* TODO: create a destroy handler that causes * some recovery to happen */ lrm_source = G_main_add_IPC_Channel( G_PRIORITY_LOW, fsa_lrm_conn->lrm_ops->ipcchan(fsa_lrm_conn), FALSE, lrm_dispatch, fsa_lrm_conn, default_ipc_connection_destroy); set_bit_inplace(fsa_input_register, R_LRM_CONNECTED); crm_debug("LRM connection established"); } if(action & ~(A_LRM_CONNECT|A_LRM_DISCONNECT)) { crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__); } } static void ghash_print_pending(gpointer key, gpointer value, gpointer user_data) { const char *stop_id = key; int *log_level = user_data; struct recurring_op_s *pending = value; do_crm_log(*log_level, "Pending action: %s (%s)", stop_id, pending->op_key); } static void ghash_print_pending_for_rsc(gpointer key, gpointer value, gpointer user_data) { const char *stop_id = key; char *rsc = user_data; struct recurring_op_s *pending = value; if(safe_str_eq(rsc, pending->rsc_id)) { do_crm_log(LOG_NOTICE, "%sction %s (%s) incomplete at shutdown", pending->interval==0?"A":"Recurring a", stop_id, pending->op_key); } } static void ghash_count_pending(gpointer key, gpointer value, gpointer user_data) { int *counter = user_data; struct recurring_op_s *pending = value; if(pending->interval > 0) { /* Ignore recurring actions in the shutdown calculations */ return; } (*counter)++; } gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level) { int counter = 0; gboolean rc = TRUE; GListPtr lrm_list = NULL; crm_debug("Checking for active resources before exit"); if(cur_state == S_TERMINATE) { log_level = LOG_ERR; } g_hash_table_foreach(pending_ops, ghash_count_pending, &counter); if(counter > 0) { rc = FALSE; do_crm_log(log_level, "%d pending LRM operations at shutdown%s", g_hash_table_size(pending_ops), cur_state == S_TERMINATE?"":"... waiting"); if(cur_state == S_TERMINATE || !is_set(fsa_input_register, R_SENT_RSC_STOP)) { g_hash_table_foreach( pending_ops, ghash_print_pending, &log_level); } goto bail; } if(lrm_source != NULL && fsa_lrm_conn != NULL) { lrm_list = fsa_lrm_conn->lrm_ops->get_all_rscs(fsa_lrm_conn); } slist_iter( rsc_id, char, lrm_list, lpc, if(is_rsc_active(rsc_id) == FALSE) { continue; } crm_err("Resource %s was active at shutdown." " You may ignore this error if it is unmanaged.", rsc_id); g_hash_table_foreach( pending_ops, ghash_print_pending_for_rsc, rsc_id); ); bail: set_bit_inplace(fsa_input_register, R_SENT_RSC_STOP); if(cur_state == S_TERMINATE) { rc = TRUE; } return rc; } static const char * get_rsc_metadata(const char *type, const char *class, const char *provider) { int len = 0; char *key = NULL; char *metadata = NULL; if(meta_hash == NULL) { meta_hash = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); } CRM_CHECK(type != NULL, return NULL); CRM_CHECK(class != NULL, return NULL); if(provider == NULL) { provider = "heartbeat"; } len = strlen(type) + strlen(class) + strlen(provider) + 4; crm_malloc0(key, len); sprintf(key, "%s::%s:%s", type, class, provider); key[len-1] = 0; metadata = g_hash_table_lookup(meta_hash, key); if(metadata) { crm_debug_2("Returning cached metadata for %s", key); goto out; } crm_debug("Retreiving metadata for %s", key); metadata = fsa_lrm_conn->lrm_ops->get_rsc_type_metadata( fsa_lrm_conn, class, type, provider); if(metadata) { /* copy the metadata because the LRM likes using * g_alloc instead of cl_malloc */ char *m_copy = crm_strdup(metadata); g_hash_table_insert(meta_hash, key, m_copy); key = NULL; /* prevent it from being free'd */ g_free(metadata); metadata = m_copy; } else { crm_warn("No metadata found for %s", key); } out: crm_free(key); return metadata; } static GListPtr get_rsc_restart_list(lrm_rsc_t *rsc, lrm_op_t *op) { gboolean supported = FALSE; GListPtr restart_list = NULL; const char *value = NULL; const char *metadata_str = get_rsc_metadata( rsc->type, rsc->class, rsc->provider); xmlNode *params = NULL; xmlNode *actions = NULL; xmlNode *metadata = NULL; if(metadata_str == NULL) { return NULL; } metadata = string2xml(metadata_str); if(metadata == NULL) { crm_err("Metadata for %s::%s:%s is not valid XML", rsc->provider, rsc->class, rsc->type); return NULL; } actions = find_xml_node(metadata, "actions", TRUE); xml_child_iter_filter( actions, action, "action", value = crm_element_value(action, "name"); if(safe_str_eq("reload", value)) { supported = TRUE; break; } ); if(supported == FALSE) { goto cleanup; } params = find_xml_node(metadata, "parameters", TRUE); xml_child_iter_filter( params, param, "parameter", value = crm_element_value(param, "unique"); if(crm_is_true(value)) { value = crm_element_value(param, "name"); crm_debug("Attr %s is not reloadable", value); restart_list = g_list_append( restart_list, crm_strdup(value)); } ); cleanup: free_xml(metadata); return restart_list; } static void append_restart_list(xmlNode *update, lrm_op_t *op, const char *version) { int len = 0; char *list = NULL; char *digest = NULL; lrm_rsc_t *rsc = NULL; const char *value = NULL; gboolean non_empty = FALSE; xmlNode *restart = NULL; GListPtr restart_list = NULL; if(op->interval > 0) { /* monitors are not reloadable */ return; } else if(safe_str_neq(CRMD_ACTION_START, op->op_type)) { /* only starts are potentially reloadable */ return; } else if(compare_version("1.0.8", version) > 0) { crm_debug("Caller version %s does not support reloads", version); return; } rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, op->rsc_id); if(rsc == NULL) { crm_info("Resource %s no longer in the LRM", op->rsc_id); return; } restart_list = get_rsc_restart_list(rsc, op); if(restart_list == NULL) { crm_debug("Resource %s does not support reloads", op->rsc_id); return; } restart = create_xml_node(NULL, "parameters"); slist_iter(param, const char, restart_list, lpc, int start = len; value = g_hash_table_lookup(op->params, param); if(value != NULL) { non_empty = TRUE; crm_xml_add(restart, param, value); } len += strlen(param) + 2; crm_realloc(list, len+1); sprintf(list+start, " %s ", param); ); digest = calculate_xml_digest(restart, TRUE, FALSE); crm_xml_add(update, XML_LRM_ATTR_OP_RESTART, list); crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest); crm_debug("%s : %s", digest, list); if(non_empty) { crm_log_xml_debug(restart, "restart digest source"); } slist_destroy(char, child, restart_list, crm_free(child)); free_xml(restart); crm_free(digest); crm_free(list); } gboolean build_operation_update( xmlNode *xml_rsc, lrm_op_t *op, const char *src, int lpc) { char *magic = NULL; const char *task = NULL; xmlNode *xml_op = NULL; char *op_id = NULL; char *local_user_data = NULL; const char *caller_version = NULL; char *digest = NULL; xmlNode *args_xml = NULL; xmlNode *args_parent = NULL; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return FALSE; } crm_debug_2("%s: Updating resouce %s after %s %s op", src, op->rsc_id, op_status2text(op->op_status), op->op_type); if(op->op_status == LRM_OP_CANCELLED) { crm_debug_3("Ignoring cancelled op"); return TRUE; } if(AM_I_DC) { caller_version = CRM_FEATURE_SET; } else if(fsa_our_dc_version != NULL) { caller_version = fsa_our_dc_version; } else { /* there is a small risk in formerly mixed clusters that * it will be sub-optimal. * however with our upgrade policy, the update we send * should still be completely supported anyway */ caller_version = g_hash_table_lookup( op->params, XML_ATTR_CRM_VERSION); crm_warn("Falling back to operation originator version: %s", caller_version); } crm_debug_3("DC version: %s", caller_version); task = op->op_type; /* remap the task name under various scenarios * this makes life easier for the PE when its trying determin the current state */ if(crm_str_eq(task, "reload", TRUE)) { if(op->op_status == LRM_OP_DONE) { task = CRMD_ACTION_START; } else { task = CRMD_ACTION_STATUS; } } else if(crm_str_eq(task, CRMD_ACTION_MIGRATE, TRUE)) { /* if the migrate_from fails it will have enough info to do the right thing */ if(op->op_status == LRM_OP_DONE) { task = CRMD_ACTION_STOP; } else { task = CRMD_ACTION_STATUS; } } else if(op->op_status == LRM_OP_DONE && crm_str_eq(task, CRMD_ACTION_MIGRATED, TRUE)) { task = CRMD_ACTION_START; } if(safe_str_eq(task, CRMD_ACTION_NOTIFY)) { const char *n_type = g_hash_table_lookup( op->params, crm_meta_name("notify_type")); const char *n_task = g_hash_table_lookup( op->params, crm_meta_name("notify_operation")); #if CRM_DEPRECATED_SINCE_2_0_5 if(n_type == NULL) { n_type = g_hash_table_lookup(op->params, "notify_type"); } if(n_task == NULL) { n_task = g_hash_table_lookup(op->params, "notify_operation"); } #endif CRM_DEV_ASSERT(n_type != NULL); CRM_DEV_ASSERT(n_task != NULL); op_id = generate_notify_key(op->rsc_id, n_type, n_task); /* these are not yet allowed to fail */ op->op_status = LRM_OP_DONE; op->rc = 0; } else { op_id = generate_op_key(op->rsc_id, task, op->interval); } xml_op = find_entity(xml_rsc, XML_LRM_TAG_RSC_OP, op_id); if(xml_op != NULL) { crm_log_xml(LOG_DEBUG, "Replacing existing entry", xml_op); } else { xml_op = create_xml_node(xml_rsc, XML_LRM_TAG_RSC_OP); } crm_xml_add(xml_op, XML_ATTR_ID, op_id); crm_free(op_id); crm_xml_add(xml_op, XML_LRM_ATTR_TASK, task); crm_xml_add(xml_op, XML_ATTR_ORIGIN, src); if(op->user_data == NULL) { char *id = crm_itoa(op->call_id); crm_debug("Generating fake transition key for:" " %s_%s_%d %d from %s", op->rsc_id, op->op_type, op->interval, op->call_id, op->app_name); local_user_data = generate_transition_key(-1, 0, id); op->user_data = local_user_data; crm_free(id); } if(compare_version("1.0.3", caller_version) > 0) { magic = generate_transition_magic_v202( op->user_data, op->op_status); } else { magic = generate_transition_magic( op->user_data, op->op_status, op->rc); } crm_xml_add(xml_op, XML_ATTR_TRANSITION_KEY, op->user_data); crm_xml_add(xml_op, XML_ATTR_TRANSITION_MAGIC, magic); crm_free(magic); switch(op->op_status) { case LRM_OP_PENDING: break; case LRM_OP_CANCELLED: crm_err("What to do here"); break; case LRM_OP_ERROR: case LRM_OP_TIMEOUT: case LRM_OP_NOTSUPPORTED: crm_debug_2("Resource action %s/%s %s: %d", op->rsc_id, task, op_status2text(op->op_status), op->rc); break; case LRM_OP_DONE: break; } crm_xml_add_int(xml_op, XML_LRM_ATTR_CALLID, op->call_id); /* set these on 'xml_rsc' too to make life easy for the PE */ crm_xml_add(xml_op, XML_ATTR_CRM_VERSION, caller_version); crm_xml_add_int(xml_op, XML_LRM_ATTR_RC, op->rc); crm_xml_add_int(xml_op, XML_LRM_ATTR_OPSTATUS, op->op_status); crm_xml_add_int(xml_op, XML_LRM_ATTR_INTERVAL, op->interval); if(compare_version("2.1", caller_version) <= 0) { if(op->t_run || op->t_rcchange || op->exec_time || op->queue_time) { crm_debug("Timing data (%s_%s_%d): last=%lu change=%lu exec=%lu queue=%lu", op->rsc_id, op->op_type, op->interval, op->t_run, op->t_rcchange, op->exec_time, op->queue_time); crm_xml_add_int(xml_op, "last_run", op->t_run); crm_xml_add_int(xml_op, "last_rc_change", op->t_rcchange); crm_xml_add_int(xml_op, "exec_time", op->exec_time); crm_xml_add_int(xml_op, "queue_time", op->queue_time); } } /* this will enable us to later determin that the * resource's parameters have changed and we should force * a restart */ args_parent = NULL; #if CRM_DEPRECATED_SINCE_2_0_4 if(compare_version("1.0.4", caller_version) > 0) { args_parent = xml_op; } #endif args_xml = create_xml_node(args_parent, XML_TAG_PARAMS); g_hash_table_foreach(op->params, hash2field, args_xml); filter_action_parameters(args_xml, caller_version); digest = calculate_xml_digest(args_xml, TRUE, FALSE); if(op->interval == 0 && safe_str_neq(task, CRMD_ACTION_STOP)) { crm_debug("Calculated digest %s for %s (%s)\n", digest, ID(xml_op), crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); crm_log_xml(LOG_DEBUG, "digest:source", args_xml); } crm_xml_add(xml_op, XML_LRM_ATTR_OP_DIGEST, digest); crm_free(digest); if(args_parent == NULL) { free_xml(args_xml); } append_restart_list(xml_op, op, caller_version); if(op->op_status != LRM_OP_DONE && crm_str_eq(op->op_type, CRMD_ACTION_MIGRATED, TRUE)) { const char *host = g_hash_table_lookup( op->params, crm_meta_name("migrate_source_uuid")); crm_xml_add(xml_op, CRMD_ACTION_MIGRATED, host); } if(local_user_data) { crm_free(local_user_data); op->user_data = NULL; } return TRUE; } gboolean is_rsc_active(const char *rsc_id) { GList *op_list = NULL; gboolean active = FALSE; lrm_rsc_t *the_rsc = NULL; state_flag_t cur_state = 0; int max_call_id = -1; if(fsa_lrm_conn == NULL) { return FALSE; } the_rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rsc_id); crm_debug_3("Processing lrm_rsc_t entry %s", rsc_id); if(the_rsc == NULL) { crm_err("NULL resource returned from the LRM"); return FALSE; } op_list = the_rsc->ops->get_cur_state(the_rsc, &cur_state); crm_debug_3("\tcurrent state:%s",cur_state==LRM_RSC_IDLE?"Idle":"Busy"); slist_iter( op, lrm_op_t, op_list, llpc, crm_debug_2("Processing op %s_%d (%d) for %s (status=%d, rc=%d)", op->op_type, op->interval, op->call_id, the_rsc->id, op->op_status, op->rc); CRM_ASSERT(max_call_id <= op->call_id); if(op->rc == EXECRA_OK && safe_str_eq(op->op_type, CRMD_ACTION_STOP)) { active = FALSE; } else if(op->rc == EXECRA_OK && safe_str_eq(op->op_type, CRMD_ACTION_MIGRATE)) { /* a stricter check is too complex... * leave that to the PE */ active = FALSE; } else if(op->rc == EXECRA_NOT_RUNNING) { active = FALSE; } else { active = TRUE; } max_call_id = op->call_id; lrm_free_op(op); ); g_list_free(op_list); lrm_free_rsc(the_rsc); return active; } gboolean build_active_RAs(xmlNode *rsc_list) { GList *op_list = NULL; GList *lrm_list = NULL; gboolean found_op = FALSE; state_flag_t cur_state = 0; if(fsa_lrm_conn == NULL) { return FALSE; } lrm_list = fsa_lrm_conn->lrm_ops->get_all_rscs(fsa_lrm_conn); slist_iter( rid, char, lrm_list, lpc, lrm_rsc_t *the_rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); xmlNode *xml_rsc = create_xml_node( rsc_list, XML_LRM_TAG_RESOURCE); int max_call_id = -1; crm_debug_2("Processing lrm_rsc_t entry %s", rid); if(the_rsc == NULL) { crm_err("NULL resource returned from the LRM"); continue; } crm_xml_add(xml_rsc, XML_ATTR_ID, the_rsc->id); crm_xml_add(xml_rsc, XML_ATTR_TYPE, the_rsc->type); crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, the_rsc->class); crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER,the_rsc->provider); op_list = the_rsc->ops->get_cur_state(the_rsc, &cur_state); crm_debug_2("\tcurrent state:%s", cur_state==LRM_RSC_IDLE?"Idle":"Busy"); slist_iter( op, lrm_op_t, op_list, llpc, crm_debug_2("Processing op %s for %s (status=%d, rc=%d)", op->op_type, the_rsc->id, op->op_status, op->rc); if(max_call_id < op->call_id) { build_operation_update( xml_rsc, op, __FUNCTION__, llpc); } else if(max_call_id > op->call_id) { crm_err("Bad call_id in list=%d. Previous call_id=%d", op->call_id, max_call_id); } else { crm_warn("lrm->get_cur_state() returned" " duplicate entries for call_id=%d", op->call_id); } max_call_id = op->call_id; found_op = TRUE; lrm_free_op(op); ); if(found_op == FALSE && g_list_length(op_list) != 0) { crm_err("Could not properly determin last op" " for %s from %d entries", the_rsc->id, g_list_length(op_list)); } g_list_free(op_list); lrm_free_rsc(the_rsc); ); g_list_free(lrm_list); return TRUE; } xmlNode* do_lrm_query(gboolean is_replace) { gboolean shut_down = FALSE; xmlNode *xml_result= NULL; xmlNode *xml_state = NULL; xmlNode *xml_data = NULL; xmlNode *rsc_list = NULL; const char *exp_state = CRMD_STATE_ACTIVE; if(is_set(fsa_input_register, R_SHUTDOWN)) { exp_state = CRMD_STATE_INACTIVE; shut_down = TRUE; } xml_state = create_node_state( fsa_our_uname, ACTIVESTATUS, XML_BOOLEAN_TRUE, ONLINESTATUS, CRMD_JOINSTATE_MEMBER, exp_state, !shut_down, __FUNCTION__); xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM); crm_xml_add(xml_data, XML_ATTR_ID, fsa_our_uuid); rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES); /* Build a list of active (not always running) resources */ build_active_RAs(rsc_list); if(is_replace) { crm_xml_add(xml_state, XML_CIB_ATTR_REPLACE, XML_CIB_TAG_LRM); } xml_result = create_cib_fragment(xml_state, XML_CIB_TAG_STATUS); free_xml(xml_state); crm_log_xml_debug_3(xml_state, "Current state of the LRM"); return xml_result; } static void delete_rsc_entry(const char *rsc_id) { xmlNode *xml_top = NULL; xmlNode *xml_tmp = NULL; /* * Remove the rsc from the CIB * * Avoids refreshing the entire LRM section of this host */ CRM_CHECK(rsc_id != NULL, return); xml_top = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(xml_top, XML_ATTR_ID, fsa_our_uuid); xml_tmp = create_xml_node(xml_top, XML_CIB_TAG_LRM); crm_xml_add(xml_tmp, XML_ATTR_ID, fsa_our_uuid); xml_tmp = create_xml_node(xml_tmp, XML_LRM_TAG_RESOURCES); xml_tmp = create_xml_node(xml_tmp, XML_LRM_TAG_RESOURCE); crm_xml_add(xml_tmp, XML_ATTR_ID, rsc_id); crm_debug("sync: Sending delete op for %s", rsc_id); fsa_cib_conn->cmds->delete_absolute(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top, NULL, cib_quorum_override); /* crm_log_xml_err(xml_top, "op:cancel"); */ free_xml(xml_top); } static void delete_op_entry(lrm_op_t *op, const char *rsc_id, const char *key, int call_id) { xmlNode *xml_top = NULL; /* * Remove the op from the CIB * * Avoids refreshing the entire LRM section of this host */ + if(op != NULL) { xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP); crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id); crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data); crm_debug("async: Sending delete op for %s_%s_%d (call=%d)", op->rsc_id, op->op_type, op->interval, op->call_id); fsa_cib_conn->cmds->delete(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top, NULL, cib_quorum_override); } else if (rsc_id != NULL && key != NULL) { xmlNode *xml_tmp = NULL; xml_top = create_xml_node(NULL, XML_CIB_TAG_STATE); crm_xml_add(xml_top, XML_ATTR_ID, fsa_our_uuid); xml_tmp = create_xml_node(xml_top, XML_CIB_TAG_LRM); crm_xml_add(xml_tmp, XML_ATTR_ID, fsa_our_uuid); xml_tmp = create_xml_node(xml_tmp, XML_LRM_TAG_RESOURCES); xml_tmp = create_xml_node(xml_tmp, XML_LRM_TAG_RESOURCE); crm_xml_add(xml_tmp, XML_ATTR_ID, rsc_id); xml_tmp = create_xml_node(xml_tmp, XML_LRM_TAG_RSC_OP); crm_xml_add(xml_tmp, XML_ATTR_ID, key); if(call_id > 0) { crm_xml_add_int(xml_tmp, XML_LRM_ATTR_CALLID, call_id); } crm_debug("sync: Sending delete op for %s (call=%d)", key, call_id); fsa_cib_conn->cmds->delete_absolute(fsa_cib_conn, XML_CIB_TAG_STATUS, xml_top, NULL, cib_quorum_override); } else { crm_err("Not enough information to delete op entry: rsc=%p key=%p", rsc_id, key); return; } crm_log_xml_debug_2(xml_top, "op:cancel"); - free_xml(xml_top); } static gboolean cancel_op(lrm_rsc_t *rsc, const char *key, int op, gboolean remove) { int rc = HA_OK; + struct recurring_op_s *pending = NULL; CRM_CHECK(op != 0, return FALSE); CRM_CHECK(rsc != NULL, return FALSE); if(key == NULL) { - key = "unknown"; + key = make_stop_id(rsc->id, op); } - + pending = g_hash_table_lookup(pending_ops, key); + + if(pending) { + if(remove && pending->remove == FALSE) { + pending->remove = TRUE; + crm_debug("Scheduling %s for removal", key); + } + + if(pending->cancelled) { + crm_debug("Operation %s already cancelled", key); + return TRUE; + } + + pending->cancelled = TRUE; + } + crm_debug("Cancelling op %d for %s (%s)", op, rsc->id, key); rc = rsc->ops->cancel_op(rsc, op); if(rc != HA_OK) { crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc->id, key); - if(key && remove) { - delete_op_entry(NULL, rsc->id, key, op); - } /* The caller needs to make sure the entry is * removed from the pending_ops list * * Usually by returning TRUE inside the worker function * supplied to g_hash_table_foreach_remove() * * Not removing the entry from pending_ops will block * the node from shutting down */ return FALSE; } return TRUE; } struct cancel_data { gboolean done; + gboolean remove; const char *key; lrm_rsc_t *rsc; }; static gboolean cancel_action_by_key(gpointer key, gpointer value, gpointer user_data) { struct cancel_data *data = user_data; struct recurring_op_s *op = (struct recurring_op_s*)value; if(safe_str_eq(op->op_key, data->key)) { data->done = TRUE; - if (cancel_op(data->rsc, key, op->call_id, TRUE) == FALSE) { + if (cancel_op(data->rsc, key, op->call_id, data->remove) == FALSE) { return TRUE; } } return FALSE; } static gboolean cancel_op_key(lrm_rsc_t *rsc, const char *key, gboolean remove) { struct cancel_data data; CRM_CHECK(rsc != NULL, return FALSE); CRM_CHECK(key != NULL, return FALSE); data.key = key; data.rsc = rsc; data.done = FALSE; + data.remove = remove; g_hash_table_foreach_remove(pending_ops, cancel_action_by_key, &data); - - if(data.done == FALSE && remove) { - crm_err("No known %s operation to cancel", key); - delete_op_entry(NULL, rsc->id, key, 0); - } - return data.done; } static lrm_rsc_t * get_lrm_resource(xmlNode *resource, xmlNode *op_msg, gboolean do_create) { char rid[64]; lrm_rsc_t *rsc = NULL; const char *short_id = ID(resource); const char *long_id = crm_element_value(resource, XML_ATTR_ID_LONG); crm_debug_2("Retrieving %s from the LRM.", short_id); CRM_CHECK(short_id != NULL, return NULL); if(rsc == NULL) { /* check if its already there (short name) */ strncpy(rid, short_id, 64); rid[63] = 0; rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); } if(rsc == NULL && long_id != NULL) { /* try the long name instead */ strncpy(rid, long_id, 64); rid[63] = 0; rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); } if(rsc == NULL && do_create) { /* add it to the LRM */ const char *type = crm_element_value(resource, XML_ATTR_TYPE); const char *class = crm_element_value(resource, XML_AGENT_ATTR_CLASS); const char *provider = crm_element_value(resource, XML_AGENT_ATTR_PROVIDER); GHashTable *params = xml2list(op_msg); CRM_CHECK(class != NULL, return NULL); CRM_CHECK(type != NULL, return NULL); crm_debug("Adding rsc %s before operation", short_id); strncpy(rid, short_id, 64); rid[63] = 0; #if CRM_DEPRECATED_SINCE_2_0_3 if(op_msg != NULL) { if(g_hash_table_lookup( params, XML_ATTR_CRM_VERSION) == NULL) { g_hash_table_destroy(params); params = xml2list_202(op_msg); } } #endif if(g_hash_table_size(params) == 0) { crm_log_xml_warn(op_msg, "EmptyParams"); } fsa_lrm_conn->lrm_ops->add_rsc( fsa_lrm_conn, rid, class, type, provider, params); rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, rid); g_hash_table_destroy(params); if(rsc == NULL) { fsa_data_t *msg_data = NULL; crm_err("Could not add resource %s to LRM", rid); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } } return rsc; } static gboolean lrm_remove_deleted_op( gpointer key, gpointer value, gpointer user_data) { const char *rsc = user_data; struct recurring_op_s *pending = value; if(safe_str_eq(rsc, pending->rsc_id)) { crm_info("Removing op %s:%d for deleted resource %s", pending->op_key, pending->call_id, rsc); return TRUE; } return FALSE; } /* A_LRM_INVOKE */ void do_lrm_invoke(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input current_input, fsa_data_t *msg_data) { + gboolean done = FALSE; gboolean create_rsc = TRUE; const char *crm_op = NULL; const char *from_sys = NULL; const char *from_host = NULL; const char *operation = NULL; ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg); crm_op = crm_element_value(input->msg, F_CRM_TASK); from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM); if(safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { from_host = crm_element_value(input->msg, F_CRM_HOST_FROM); } crm_debug_2("LRM command from: %s", from_sys); if(safe_str_eq(crm_op, CRM_OP_LRM_DELETE)) { operation = CRMD_ACTION_DELETE; } else if(safe_str_eq(operation, CRM_OP_LRM_REFRESH)) { crm_op = CRM_OP_LRM_REFRESH; } else if(safe_str_eq(crm_op, CRM_OP_LRM_FAIL)) { crm_info("Failing resource..."); operation = "fail"; } else if(input->xml != NULL) { operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK); } if(safe_str_eq(crm_op, CRM_OP_LRM_REFRESH)) { enum cib_errors rc = cib_ok; xmlNode *fragment = do_lrm_query(TRUE); crm_info("Forcing a local LRM refresh"); fsa_cib_update(XML_CIB_TAG_STATUS, fragment, cib_quorum_override, rc); free_xml(fragment); } else if(safe_str_eq(crm_op, CRM_OP_LRM_QUERY)) { xmlNode *data = do_lrm_query(FALSE); xmlNode *reply = create_reply(input->msg, data); if(relay_message(reply, TRUE) == FALSE) { crm_err("Unable to route reply"); crm_log_xml(LOG_ERR, "reply", reply); free_xml(reply); } free_xml(data); } else if(safe_str_eq(operation, CRM_OP_PROBED) || safe_str_eq(crm_op, CRM_OP_REPROBE)) { const char *probed = XML_BOOLEAN_TRUE; if(safe_str_eq(crm_op, CRM_OP_REPROBE)) { probed = XML_BOOLEAN_FALSE; } update_attr(fsa_cib_conn, cib_none, XML_CIB_TAG_STATUS, fsa_our_uuid, NULL, NULL, CRM_OP_PROBED, probed, FALSE); } else if(operation != NULL) { lrm_rsc_t *rsc = NULL; xmlNode *params = NULL; xmlNode *xml_rsc = find_xml_node( input->xml, XML_CIB_TAG_RESOURCE, TRUE); CRM_CHECK(xml_rsc != NULL, return); /* only the first 16 chars are used by the LRM */ params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE); if(safe_str_eq(operation, CRMD_ACTION_DELETE)) { create_rsc = FALSE; } rsc = get_lrm_resource(xml_rsc, input->xml, create_rsc); if(rsc == NULL && create_rsc) { crm_err("Invalid resource definition"); crm_log_xml_warn(input->msg, "bad input"); } else if(rsc == NULL) { lrm_op_t* op = NULL; crm_err("Not creating resource for a %s event: %s", operation, ID(input->xml)); crm_log_xml_warn(input->msg, "bad input"); op = construct_op(input->xml, ID(xml_rsc), operation); op->op_status = LRM_OP_DONE; op->rc = EXECRA_OK; CRM_ASSERT(op != NULL); send_direct_ack(from_host, from_sys, op, ID(xml_rsc)); free_lrm_op(op); } else if(safe_str_eq(operation, CRMD_ACTION_CANCEL)) { lrm_op_t* op = NULL; char *op_key = NULL; int call = 0; const char *call_id = NULL; const char *op_task = NULL; const char *op_interval = NULL; CRM_CHECK(params != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); op_interval = crm_element_value(params, crm_meta_name("interval")); op_task = crm_element_value(params, crm_meta_name(XML_LRM_ATTR_TASK)); call_id = crm_element_value(params, crm_meta_name(XML_LRM_ATTR_CALLID)); #if CRM_DEPRECATED_SINCE_2_0_5 if(op_interval == NULL) { op_interval = crm_element_value(params, "interval"); } if(op_task == NULL) { op_task = crm_element_value(params, XML_LRM_ATTR_TASK); if(op_task == NULL) { op_task = crm_element_value(params, "task"); } } #endif CRM_CHECK(op_task != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); CRM_CHECK(op_interval != NULL, crm_log_xml_warn(input->xml, "Bad command"); return); op = construct_op(input->xml, rsc->id, op_task); CRM_ASSERT(op != NULL); op_key = generate_op_key( rsc->id,op_task,crm_parse_int(op_interval,"0")); - crm_debug("PE requested op %s be cancelled", op_key); + crm_debug("PE requested op %s (call=%s) be cancelled", + op_key, call_id?call_id:"NA"); call = crm_parse_int(call_id, "0"); if(call == 0) { - cancel_op_key(rsc, op_key, TRUE); + /* the normal case when the PE cancels a recurring op */ + done = cancel_op_key(rsc, op_key, TRUE); + } else { - cancel_op(rsc, op_key, call, TRUE); - g_hash_table_remove(pending_ops, op_key); + /* the normal case when the PE cancels an orphan op */ + done = cancel_op(rsc, op_key, call, TRUE); + } + + if(done == FALSE) { + crm_debug("Nothing known about operation %d for %s", call, op_key); + delete_op_entry(NULL, rsc->id, op_key, call); + + /* needed?? surely not otherwise the cancel_op_(_key) wouldn't + * have failed in the first place + */ + g_hash_table_remove(pending_ops, op_key); } + op->op_status = LRM_OP_DONE; op->rc = EXECRA_OK; send_direct_ack(from_host, from_sys, op, rsc->id); crm_free(op_key); free_lrm_op(op); } else if(safe_str_eq(operation, CRMD_ACTION_DELETE)) { int rc = HA_OK; lrm_op_t* op = NULL; CRM_ASSERT(rsc != NULL); op = construct_op(input->xml, rsc->id, operation); CRM_ASSERT(op != NULL); op->op_status = LRM_OP_DONE; op->rc = EXECRA_OK; crm_info("Removing resource %s from the LRM", rsc->id); rc = fsa_lrm_conn->lrm_ops->delete_rsc(fsa_lrm_conn, rsc->id); if(rc != HA_OK) { crm_err("Failed to remove resource %s", rsc->id); op->op_status = LRM_OP_ERROR; op->rc = EXECRA_UNKNOWN_ERROR; } delete_rsc_entry(rsc->id); send_direct_ack(from_host, from_sys, op, rsc->id); free_lrm_op(op); g_hash_table_foreach_remove(pending_ops, lrm_remove_deleted_op, rsc->id); if(safe_str_neq(from_sys, CRM_SYSTEM_TENGINE)) { /* this isn't expected - trigger a new transition */ time_t now = time(NULL); char *now_s = crm_itoa(now); crm_debug("Triggering a refresh after %s deleted %s from the LRM", from_sys, rsc->id); update_attr(fsa_cib_conn, cib_none, XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, "last-lrm-refresh", now_s, FALSE); crm_free(now_s); } } else if(rsc != NULL) { do_lrm_rsc_op(rsc, operation, input->xml, input->msg); } lrm_free_rsc(rsc); } else { crm_err("Operation was neither a lrm_query, nor a rsc op. %s", crm_str(crm_op)); register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL); } } lrm_op_t * construct_op(xmlNode *rsc_op, const char *rsc_id, const char *operation) { lrm_op_t *op = NULL; const char *op_delay = NULL; const char *op_timeout = NULL; const char *op_interval = NULL; const char *transition = NULL; CRM_DEV_ASSERT(rsc_id != NULL); crm_malloc0(op, sizeof(lrm_op_t)); op->op_type = crm_strdup(operation); op->op_status = LRM_OP_PENDING; op->rc = -1; op->rsc_id = crm_strdup(rsc_id); op->interval = 0; op->timeout = 0; op->start_delay = 0; op->copyparams = 0; op->app_name = crm_strdup(CRM_SYSTEM_CRMD); if(rsc_op == NULL) { CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); op->user_data = NULL; op->user_data_len = 0; /* the stop_all_resources() case * by definition there is no DC (or they'd be shutting * us down). * So we should put our version here. */ op->params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); g_hash_table_insert(op->params, crm_strdup(XML_ATTR_CRM_VERSION), crm_strdup(CRM_FEATURE_SET)); crm_debug_2("Constructed %s op for %s", operation, rsc_id); return op; } op->params = xml2list(rsc_op); #if CRM_DEPRECATED_SINCE_2_0_3 if(g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION) == NULL) { g_hash_table_destroy(op->params); op->params = xml2list_202(rsc_op); } #endif if(op->params == NULL) { CRM_DEV_ASSERT(safe_str_eq(CRMD_ACTION_STOP, operation)); } op_delay = g_hash_table_lookup(op->params, crm_meta_name("start_delay")); op_timeout = g_hash_table_lookup(op->params, crm_meta_name("timeout")); op_interval = g_hash_table_lookup(op->params, crm_meta_name("interval")); #if CRM_DEPRECATED_SINCE_2_0_5 if(op_delay == NULL) { op_delay = g_hash_table_lookup(op->params, "start_delay"); } if(op_timeout == NULL) { op_timeout = g_hash_table_lookup(op->params, "timeout"); } if(op_interval == NULL) { op_interval = g_hash_table_lookup(op->params, "interval"); } #endif op->interval = crm_parse_int(op_interval, "0"); op->timeout = crm_parse_int(op_timeout, "0"); op->start_delay = crm_parse_int(op_delay, "0"); /* sanity */ if(op->interval < 0) { op->interval = 0; } if(op->timeout < 0) { op->timeout = 0; } if(op->start_delay < 0) { op->start_delay = 0; } transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY); CRM_CHECK(transition != NULL, return op); op->user_data = crm_strdup(transition); op->user_data_len = 1+strlen(op->user_data); if(op->interval != 0) { if(safe_str_eq(operation, CRMD_ACTION_START) || safe_str_eq(operation, CRMD_ACTION_STOP)) { crm_err("Start and Stop actions cannot have an interval"); op->interval = 0; } } /* reset the resource's parameters? */ if(op->interval == 0) { if(safe_str_eq(CRMD_ACTION_START, operation) || safe_str_eq(CRMD_ACTION_STATUS, operation)) { op->copyparams = 1; } } crm_debug_2("Constructed %s op for %s: interval=%d", operation, rsc_id, op->interval); return op; } void send_direct_ack(const char *to_host, const char *to_sys, lrm_op_t* op, const char *rsc_id) { xmlNode *reply = NULL; xmlNode *update, *iter; xmlNode *fragment; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return; } if(op->rsc_id == NULL) { CRM_DEV_ASSERT(rsc_id != NULL); op->rsc_id = crm_strdup(rsc_id); } if(to_sys == NULL) { to_sys = CRM_SYSTEM_TENGINE; } update = create_node_state( fsa_our_uname, NULL, NULL, NULL, NULL, NULL, FALSE, __FUNCTION__); iter = create_xml_node(update, XML_CIB_TAG_LRM); crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); build_operation_update(iter, op, __FUNCTION__, 0); fragment = create_cib_fragment(update, XML_CIB_TAG_STATUS); reply = create_request(CRM_OP_INVOKE_LRM, fragment, to_host, to_sys, CRM_SYSTEM_LRMD, NULL); crm_log_xml_debug_2(update, "ACK Update"); crm_info("ACK'ing resource op %s_%s_%d from %s: %s", op->rsc_id, op->op_type, op->interval, op->user_data, crm_element_value(reply, XML_ATTR_REFERENCE)); if(relay_message(reply, TRUE) == FALSE) { crm_log_xml(LOG_ERR, "Unable to route reply", reply); free_xml(reply); } free_xml(fragment); free_xml(update); } static gboolean stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data) { lrm_rsc_t *rsc = user_data; struct recurring_op_s *op = (struct recurring_op_s*)value; if(op->interval != 0 && safe_str_eq(op->rsc_id, rsc->id)) { if (cancel_op(rsc, key, op->call_id, FALSE) == FALSE) { return TRUE; } } return FALSE; } void do_lrm_rsc_op(lrm_rsc_t *rsc, const char *operation, xmlNode *msg, xmlNode *request) { int call_id = 0; char *op_id = NULL; lrm_op_t* op = NULL; fsa_data_t *msg_data = NULL; const char *transition = NULL; CRM_CHECK(rsc != NULL, return); if(msg != NULL) { transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY); if(transition == NULL) { crm_log_xml_err(msg, "Missing transition number"); } } op = construct_op(msg, rsc->id, operation); /* stop the monitor before stopping the resource */ if(crm_str_eq(operation, CRMD_ACTION_STOP, TRUE) || crm_str_eq(operation, CRMD_ACTION_DEMOTE, TRUE) || crm_str_eq(operation, CRMD_ACTION_PROMOTE, TRUE) || crm_str_eq(operation, CRMD_ACTION_MIGRATE, TRUE)) { g_hash_table_foreach_remove(pending_ops, stop_recurring_action_by_rsc, rsc); } /* now do the op */ crm_info("Performing op=%s_%s_%d key=%s)", rsc->id, operation, op->interval, transition); if(fsa_state != S_NOT_DC && fsa_state != S_TRANSITION_ENGINE) { if(safe_str_neq(operation, "fail") && safe_str_neq(operation, CRMD_ACTION_STOP)) { crm_info("Discarding attempt to perform action %s on %s" " in state %s", operation, rsc->id, fsa_state2string(fsa_state)); op->rc = 99; op->op_status = LRM_OP_ERROR; send_direct_ack(NULL, NULL, op, rsc->id); free_lrm_op(op); crm_free(op_id); return; } } op_id = generate_op_key(rsc->id, op->op_type, op->interval); if(op->interval > 0) { /* cancel it so we can then restart it without conflict */ cancel_op_key(rsc, op_id, FALSE); op->target_rc = CHANGED; } else { op->target_rc = EVERYTIME; } g_hash_table_replace(resources,crm_strdup(rsc->id), crm_strdup(op_id)); call_id = rsc->ops->perform_op(rsc, op); if(call_id <= 0) { crm_err("Operation %s on %s failed: %d", operation, rsc->id, call_id); register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL); } else { /* record all operations so we can wait * for them to complete during shutdown */ char *call_id_s = make_stop_id(rsc->id, call_id); struct recurring_op_s *pending = NULL; crm_malloc0(pending, sizeof(struct recurring_op_s)); crm_debug("Recording pending op: %d - %s %s", call_id, op_id, call_id_s); pending->call_id = call_id; pending->interval = op->interval; pending->op_key = crm_strdup(op_id); pending->rsc_id = crm_strdup(rsc->id); g_hash_table_replace(pending_ops, call_id_s, pending); } crm_free(op_id); free_lrm_op(op); return; } void free_recurring_op(gpointer value) { struct recurring_op_s *op = (struct recurring_op_s*)value; crm_free(op->rsc_id); crm_free(op->op_key); crm_free(op); } void free_lrm_op(lrm_op_t *op) { g_hash_table_destroy(op->params); crm_free(op->user_data); crm_free(op->output); crm_free(op->rsc_id); crm_free(op->op_type); crm_free(op->app_name); crm_free(op); } static void dup_attr(gpointer key, gpointer value, gpointer user_data) { g_hash_table_replace(user_data, crm_strdup(key), crm_strdup(value)); } lrm_op_t * copy_lrm_op(const lrm_op_t *op) { lrm_op_t *op_copy = NULL; CRM_DEV_ASSERT(op != NULL); if(crm_assert_failed) { return NULL; } CRM_ASSERT(op->rsc_id != NULL); crm_malloc0(op_copy, sizeof(lrm_op_t)); op_copy->op_type = crm_strdup(op->op_type); /* input fields */ op_copy->params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if(op->params != NULL) { g_hash_table_foreach(op->params, dup_attr, op_copy->params); } op_copy->timeout = op->timeout; op_copy->interval = op->interval; op_copy->target_rc = op->target_rc; /* in the CRM, this is always a string */ if(op->user_data != NULL) { op_copy->user_data = crm_strdup(op->user_data); } /* output fields */ op_copy->op_status = op->op_status; op_copy->rc = op->rc; op_copy->call_id = op->call_id; op_copy->output = NULL; op_copy->rsc_id = crm_strdup(op->rsc_id); if(op->app_name != NULL) { op_copy->app_name = crm_strdup(op->app_name); } if(op->output != NULL) { op_copy->output = crm_strdup(op->output); } return op_copy; } lrm_rsc_t * copy_lrm_rsc(const lrm_rsc_t *rsc) { lrm_rsc_t *rsc_copy = NULL; if(rsc == NULL) { return NULL; } crm_malloc0(rsc_copy, sizeof(lrm_rsc_t)); rsc_copy->id = crm_strdup(rsc->id); rsc_copy->type = crm_strdup(rsc->type); rsc_copy->class = NULL; rsc_copy->provider = NULL; if(rsc->class != NULL) { rsc_copy->class = crm_strdup(rsc->class); } if(rsc->provider != NULL) { rsc_copy->provider = crm_strdup(rsc->provider); } /* GHashTable* params; */ rsc_copy->params = NULL; rsc_copy->ops = NULL; return rsc_copy; } static void cib_rsc_callback(xmlNode *msg, int call_id, int rc, xmlNode *output, void *user_data) { switch(rc) { case cib_ok: case cib_diff_failed: case cib_diff_resync: crm_debug("Resource update %d complete: rc=%d", call_id, rc); break; default: crm_err("Resource update %d failed: (rc=%d) %s", call_id, rc, cib_error2string(rc)); } } void do_update_resource(lrm_op_t* op) { /* */ int rc = cib_ok; lrm_rsc_t *rsc = NULL; xmlNode *update, *iter; CRM_CHECK(op != NULL, return); update = create_node_state( fsa_our_uname, NULL, NULL, NULL, NULL, NULL, FALSE, __FUNCTION__); iter = create_xml_node(update, XML_CIB_TAG_LRM); crm_xml_add(iter, XML_ATTR_ID, fsa_our_uuid); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES); iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE); crm_xml_add(iter, XML_ATTR_ID, op->rsc_id); rsc = fsa_lrm_conn->lrm_ops->get_rsc(fsa_lrm_conn, op->rsc_id); CRM_CHECK(rsc->type != NULL, crm_err("Resource %s has no value for type", op->rsc_id)); CRM_CHECK(rsc->class != NULL, crm_err("Resource %s has no value for class", op->rsc_id)); crm_xml_add(iter, XML_ATTR_TYPE, rsc->type); crm_xml_add(iter, XML_AGENT_ATTR_CLASS, rsc->class); crm_xml_add(iter, XML_AGENT_ATTR_PROVIDER,rsc->provider); lrm_free_rsc(rsc); build_operation_update(iter, op, __FUNCTION__, 0); /* make it an asyncronous call and be done with it * * Best case: * the resource state will be discovered during * the next signup or election. * * Bad case: * we are shutting down and there is no DC at the time, * but then why were we shutting down then anyway? * (probably because of an internal error) * * Worst case: * we get shot for having resources "running" when the really weren't * * the alternative however means blocking here for too long, which * isnt acceptable */ fsa_cib_update(XML_CIB_TAG_STATUS, update, cib_quorum_override, rc); if(rc > 0) { /* the return code is a call number, not an error code */ crm_debug("Sent resource state update message: %d", rc); add_cib_op_callback(rc, FALSE, NULL, cib_rsc_callback); } else { crm_err("Resource state update failed: %s", cib_error2string(rc)); } free_xml(update); } void do_lrm_event(long long action, enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t *msg_data) { CRM_CHECK(FALSE, return); } gboolean process_lrm_event(lrm_op_t *op) { char *op_id = NULL; char *op_key = NULL; int log_level = LOG_ERR; + struct recurring_op_s *pending = NULL; CRM_CHECK(op != NULL, return FALSE); CRM_CHECK(op->rsc_id != NULL, return FALSE); op_key = generate_op_key(op->rsc_id, op->op_type, op->interval); switch(op->op_status) { case LRM_OP_ERROR: case LRM_OP_PENDING: case LRM_OP_NOTSUPPORTED: break; case LRM_OP_CANCELLED: log_level = LOG_INFO; break; case LRM_OP_DONE: log_level = LOG_INFO; break; case LRM_OP_TIMEOUT: log_level = LOG_DEBUG_3; crm_err("LRM operation %s (%d) %s (timeout=%dms)", op_key, op->call_id, op_status2text(op->op_status), op->timeout); break; default: crm_err("Mapping unknown status (%d) to ERROR", op->op_status); op->op_status = LRM_OP_ERROR; } if(op->op_status == LRM_OP_ERROR && (op->rc == EXECRA_RUNNING_MASTER || op->rc == EXECRA_NOT_RUNNING)) { /* Leave it up to the TE/PE to decide if this is an error */ op->op_status = LRM_OP_DONE; log_level = LOG_INFO; } do_crm_log(log_level, "LRM operation %s (call=%d, rc=%d) %s %s", op_key, op->call_id, op->rc, op_status2text(op->op_status), op->op_status==LRM_OP_ERROR?execra_code2string(op->rc):""); if(op->op_status == LRM_OP_ERROR && op->output != NULL) { crm_info("Result: %s", op->output); } + op_id = make_stop_id(op->rsc_id, op->call_id); + pending = g_hash_table_lookup(pending_ops, op_id); + if(op->op_status != LRM_OP_CANCELLED) { do_update_resource(op); if(op->interval != 0) { goto out; } } else if(op->interval == 0) { /* no known valid reason for this to happen */ crm_err("Op %s (call=%d): Cancelled", op_key, op->call_id); - } else if(op->user_data != NULL) { - delete_op_entry(op, NULL, NULL, op->call_id); - - } else { + } else if(pending == NULL) { + crm_err("Op %s (call=%d): No 'pending' entry", + op_key, op->call_id); + + } else if(op->user_data == NULL) { crm_err("Op %s (call=%d): No user data", op_key, op->call_id); + + } else if(pending->remove) { + delete_op_entry(op, op->rsc_id, op_key, op->call_id); } - op_id = make_stop_id(op->rsc_id, op->call_id); if(g_hash_table_remove(pending_ops, op_id)) { crm_debug("Op %s (call=%d): Confirmed", op_key, op->call_id); - goto out; } - crm_err("Op %s (call=%d): Not matched", op_key, op->call_id); - out: crm_free(op_key); crm_free(op_id); return TRUE; } char * make_stop_id(const char *rsc, int call_id) { char *op_id = NULL; crm_malloc0(op_id, strlen(rsc) + 34); if(op_id != NULL) { snprintf(op_id, strlen(rsc) + 34, "%s:%d", rsc, call_id); } return op_id; } diff --git a/cts/CTSaudits.py.in b/cts/CTSaudits.py.in index 2284ebe78d..2ee9056c33 100755 --- a/cts/CTSaudits.py.in +++ b/cts/CTSaudits.py.in @@ -1,861 +1,863 @@ #!@PYTHON@ '''CTS: Cluster Testing System: Audit module ''' __copyright__=''' Copyright (C) 2000, 2001,2005 Alan Robertson Licensed under the GNU GPL. ''' # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. import time, os, popen2, string, re import CTS import os import popen2 class ClusterAudit: def __init__(self, cm): self.CM = cm def __call__(self): raise ValueError("Abstract Class member (__call__)") def is_applicable(self): '''Return TRUE if we are applicable in the current test configuration''' raise ValueError("Abstract Class member (is_applicable)") return 1 def name(self): raise ValueError("Abstract Class member (name)") AllAuditClasses = [ ] class LogAudit(ClusterAudit): def name(self): return "LogAudit" def __init__(self, cm): self.CM = cm def RestartClusterLogging(self): self.CM.log("WARN: Restarting logging on cluster nodes") for node in self.CM.Env["nodes"]: cmd=self.CM.Env["logrestartcmd"] if self.CM.rsh.noBlock(node, cmd) != 0: self.CM.log ("ERROR: Cannot restart logging on %s [%s failed]" % (node, cmd)) def TestLogging(self): patterns= [] prefix="Test message from " for node in self.CM.Env["nodes"]: patterns.append(prefix + node) watch = CTS.LogWatcher(self.CM.Env["LogFileName"], patterns, 30 + len(self.CM.Env["nodes"])) watch.setwatch() for node in self.CM.Env["nodes"]: cmd="logger -p %s.info %s%s" % (self.CM.Env["logfacility"], prefix, node) if self.CM.rsh.noBlock(node, cmd) != 0: self.CM.log ("ERROR: Cannot execute remote command [%s] on %s" % (cmd, node)) watch_result = watch.lookforall() if watch.unmatched: self.CM.log("ERROR: Remote logging is not working.") for regex in watch.unmatched: self.CM.log ("ERROR: Test message [%s] not found in logs." % (regex)) return 0 return 1 def __call__(self): max=3 attempt=0 while attempt <= max and self.TestLogging() == 0: attempt = attempt + 1 self.RestartClusterLogging() time.sleep(60*attempt) if attempt > max: self.CM.log("Cluster logging unrecoverable.") return 0 if attempt > 0: self.CM.log("NOTE: Cluster logging now working.") return 1 def is_applicable(self): if self.CM.Env["DoBSC"]: return 0 return 1 class DiskAudit(ClusterAudit): def name(self): return "DiskspaceAudit" def __init__(self, cm): self.CM = cm def __call__(self): result=1 dfcmd="df -k /var/log | tail -1 | tr -s ' ' | cut -d' ' -f2" for node in self.CM.Env["nodes"]: dfout=self.CM.rsh.readaline(node, dfcmd) if not dfout: self.CM.log ("ERROR: Cannot execute remote df command [%s] on %s" % (dfcmd, node)) else: try: idfout = int(dfout) except (ValueError, TypeError): self.CM.log("Warning: df output from %s was invalid [%s]" % (node, dfout)) else: if idfout == 0: self.CM.log("CRIT: Completely out of log disk space on %s" % node) result=None elif idfout <= 1000: self.CM.log("WARN: Low on log disk space (%d Mbytes) on %s" % (idfout, node)) return result def is_applicable(self): if self.CM.Env["DoBSC"]: return 0 return 1 class AuditResource: def __init__(self, cm, line): fields = line.split() self.CM = cm + self.line = line self.type = fields[1] self.id = fields[2] self.parent = fields[3] self.managed = fields[4] self.needs_quorum = fields[5] if self.parent == "NA": self.parent = None class AuditConstraint: def __init__(self, cm, line): fields = line.split() self.CM = cm + self.line = line self.type = fields[1] self.id = fields[2] self.rsc = fields[3] self.target = fields[4] self.score = fields[5] self.rsc_role = fields[6] self.target_role = fields[7] if self.rsc_role == "NA": self.rsc_role = None if self.target_role == "NA": self.target_role = None class PrimitiveAudit(ClusterAudit): def name(self): return "PrimitiveAudit" def __init__(self, cm): self.CM = cm def doResourceAudit(self, resource): rc=1 active = self.CM.ResourceLocation(resource.id) if len(active) > 1: rc=0 self.CM.log("Resource %s is active multiple times: %s" % (resource.id, repr(active))) elif len(active) == 1: if self.CM.HasQuorum(None): self.CM.debug("Resource %s active on %s" % (resource.id, repr(active))) elif resource.needs_quorum == 1: rc=0 - self.CM.log("Resource %s active without quorum: %s" - % (resource.id, repr(active))) + self.CM.log("Resource %s active without quorum: %s (%s)" + % (resource.id, repr(active), resource.line)) elif self.CM.HasQuorum(None) or not resource.needs_quorum: self.CM.log("WARN: Resource %s not served anywhere (Inactive nodes: %s)" % (resource.id, repr(self.inactive_nodes))) else: self.CM.debug("Resource %s not served anywhere (Inactive nodes: %s)" % (resource.id, repr(self.inactive_nodes))) if resource.managed == "0": self.CM.log("Resource %s not managed: faking success" % resource.id) rc = 1 return rc def setup(self): self.target = None self.resources = [] self.constraints = [] self.active_nodes = [] self.inactive_nodes = [] self.CM.debug("Do Audit %s"%self.name()) for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.active_nodes.append(node) else: self.inactive_nodes.append(node) for node in self.CM.Env["nodes"]: if self.target == None and self.CM.ShouldBeStatus[node] == self.CM["up"]: self.target = node if not self.target: # TODO: In Pacemaker 1.0 clusters we'll be able to run crm_resource # with CIB_file=/path/to/cib.xml even when the cluster isn't running self.CM.debug("No nodes active - skipping %s" % self.name()) return 0 (rc, lines) = self.CM.rsh.remote_py(self.target, "os", "system", "@sbindir@/crm_resource -c") for line in lines: if re.search("^Resource", line): self.resources.append(AuditResource(self.CM, line)) elif re.search("^Constraint", line): self.constraints.append(AuditConstraint(self.CM, line)) else: self.CM.log("Unknown entry: %s" % line); return 1 def __call__(self): rc = 1 if not self.setup(): return 1 for resource in self.resources: if resource.type == "primitive": if self.doResourceAudit(resource) == 0: rc = 0 return rc def is_applicable(self): if self.CM["Name"] == "linux-ha-v2" and self.CM.Env["ResCanStop"] == 0: return 1 if self.CM["Name"] == "crm-ais" and self.CM.Env["ResCanStop"] == 0: return 1 return 0 class GroupAudit(PrimitiveAudit): def name(self): return "GroupAudit" def __call__(self): rc = 1 if not self.setup(): return 1 for group in self.resources: if group.type == "group": first_match = 1 group_location = None for child in self.resources: if child.parent == group.id: nodes = self.CM.ResourceLocation(child.id) if first_match and len(nodes) > 0: group_location = nodes[0] first_match = 0 if len(nodes) > 1: rc = 0 self.CM.log("Child %s of %s is active more than once: %s" % (child.id, group.id, repr(nodes))) elif len(nodes) == 0: # Groups are allowed to be partially active # However we do need to make sure later children aren't running group_location = None self.CM.debug("Child %s of %s is stopped" % (child.id, group.id)) elif nodes[0] != group_location: rc = 0 self.CM.log("Child %s of %s is active on the wrong node (%s) expected %s" % (child.id, group.id, nodes[0], group_location)) else: self.CM.debug("Child %s of %s is active on %s" % (child.id, group.id, nodes[0])) return rc class CloneAudit(PrimitiveAudit): def name(self): return "CloneAudit" def __call__(self): rc = 1 if not self.setup(): return 1 for clone in self.resources: if clone.type == "clone": for child in self.resources: if child.parent == clone.id and child.type == "primitive": self.CM.debug("Checking child %s of %s..." % (child.id, clone.id)) # Check max and node_max # Obtain with: # crm_resource -g clone_max --meta -r child.id # crm_resource -g clone_node_max --meta -r child.id return rc class ColocationAudit(PrimitiveAudit): def name(self): return "ColocationAudit" def crm_location(self, resource): (rc, lines) = self.CM.rsh.remote_py( self.target, "os", "system", "@sbindir@/crm_resource -W -r %s -Q"%resource) hosts = [] if rc == 0: for line in lines: fields = line.split() hosts.append(fields[0]) return hosts def __call__(self): rc = 1 if not self.setup(): return 1 for coloc in self.constraints: if coloc.type == "rsc_colocation": source = self.crm_location(coloc.rsc) target = self.crm_location(coloc.target) if len(source) == 0: self.CM.debug("Colocation audit (%s): %s not running" % (coloc.id, coloc.rsc)) else: for node in source: if not node in target: rc = 0 self.CM.log("Colocation audit (%s): %s running on %s (not in %s)" % (coloc.id, coloc.rsc, node, repr(target))) else: self.CM.debug("Colocation audit (%s): %s running on %s (in %s)" % (coloc.id, coloc.rsc, node, repr(target))) return rc class ResourceAudit(ClusterAudit): def name(self): return "ResourceAudit" def _doauditRsc(self, resource): ResourceNodes = [] for node in self.CM.Env["nodes"]: if self.CM.ShouldBeStatus[node] == self.CM["up"]: if resource.IsRunningOn(node): ResourceNodes.append(node) return ResourceNodes def _doaudit(self): '''Check to see if all resources are running in exactly one place in the cluster. We also verify that the members of a resource group are all running on the same node in the cluster, and we monitor that they are all running "properly". ''' Fatal = 0 result = [] # Thought: use self.CM.find_partitions() and make this audit # aware of partitions. Since in a split cluster one # partition may have quorum (and permission to run resources) # and the other not. Groups = self.CM.ResourceGroups() for group in Groups: GrpServedBy = None lastResource = None for resource in group: # # _doauditRsc returns the set of nodes serving # the given resource. This is normally a single node. # ResourceNodes = self._doauditRsc(resource) # Is the resource served without quorum present? if not self.CM.HasQuorum(None) and len(ResourceNodes) != 0 and resource.needs_quorum: result.append("Resource " + repr(resource) + " active without Quorum: " + repr(ResourceNodes)) # Is the resource served at all? elif len(ResourceNodes) == 0 and self.CM.HasQuorum(None): result.append("Resource " + repr(resource) + " not served anywhere.") # Is the resource served too many times? elif len(ResourceNodes) > 1: result.append("Resource " + repr(resource) + " served too many times: " + repr(ResourceNodes)) self.CM.log("Resource " + repr(resource) + " served too many times: " + repr(ResourceNodes)) Fatal = 1 elif GrpServedBy == None: GrpServedBy = ResourceNodes # Are all the members of the Rsc Grp served by the same node? elif GrpServedBy != ResourceNodes: result.append("Resource group resources" + repr(resource) + " running on different nodes: " + repr(ResourceNodes)+" vs "+repr(GrpServedBy) + "(otherRsc = " + repr(lastResource) + ")") self.CM.log("Resource group resources" + repr(resource) + " running on different nodes: " + repr(ResourceNodes)+" vs "+repr(GrpServedBy) + "(otherRsc = " + repr(lastResource) + ")") Fatal = 1 if self.CM.Env.has_key("SuppressMonitoring") and \ self.CM.Env["SuppressMonitoring"]: continue # Is the resource working correctly ? if not Fatal and len(ResourceNodes) == 1: beforearpchild = popen2.Popen3("date;/sbin/arp -n|cut -c1-15,26-50,75-" , None) beforearpchild.tochild.close() # /dev/null if not resource.IsWorkingCorrectly(ResourceNodes[0]): afterarpchild = popen2.Popen3("/sbin/arp -n|cut -c1-15,26-50,75-" , None) afterarpchild.tochild.close() # /dev/null result.append("Resource " + repr(resource) + " not operating properly." + " Resource is running on " + ResourceNodes[0]); Fatal = 1 self.CM.log("ARP table before failure ========"); for line in beforearpchild.fromchild.readlines(): self.CM.log(line) self.CM.log("ARP table after failure ========"); for line in afterarpchild.fromchild.readlines(): self.CM.log(line) self.CM.log("End of ARP tables ========"); try: beforearpchild.wait() afterarpchild.wait() except OSError: pass afterarpchild.fromchild.close() beforearpchild.fromchild.close() lastResource = resource if (Fatal): result.insert(0, "FATAL") # Kludgy. return result def __call__(self): # # Audit the resources. Since heartbeat doesn't really # know when resource acquisition is complete, we will # poll until things get stable. # # Having a resource duplicately implemented is a Fatal Error # with no tolerance granted. # audresult = self._doaudit() # # Probably the constant below should be a CM parameter. # Then it could be 0 for FailSafe. # Of course, it really depends on what resources # you have in the test suite, and how long it takes # for them to settle. # Recently, we've changed heartbeat so we know better when # resource acquisition is done. # audcount=5; while(audcount > 0): audresult = self._doaudit() if (len(audresult) <= 0 or audresult[0] == "FATAL"): audcount=0 else: audcount = audcount - 1 if (audcount > 0): time.sleep(1) if (len(audresult) > 0): self.CM.log("Fatal Audit error: " + repr(audresult)) return (len(audresult) == 0) def is_applicable(self): if self.CM["Name"] == "heartbeat": return 1 return 0 class CrmdStateAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): passed = 1 up_are_down = 0 down_are_up = 0 unstable_list = [] self.CM.debug("Do Audit %s"%self.name()) for node in self.CM.Env["nodes"]: should_be = self.CM.ShouldBeStatus[node] rc = self.CM.test_node_CM(node) if rc > 0: if should_be == self.CM["down"]: down_are_up = down_are_up + 1 if rc == 1: unstable_list.append(node) elif should_be == self.CM["up"]: up_are_down = up_are_down + 1 if len(unstable_list) > 0: passed = 0 self.CM.log("Cluster is not stable: %d (of %d): %s" %(len(unstable_list), self.CM.upcount(), repr(unstable_list))) if up_are_down > 0: passed = 0 self.CM.log("%d (of %d) nodes expected to be up were down." %(up_are_down, len(self.CM.Env["nodes"]))) if down_are_up > 0: passed = 0 self.CM.log("%d (of %d) nodes expected to be down were up." %(down_are_up, len(self.CM.Env["nodes"]))) return passed def name(self): return "CrmdStateAudit" def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 class CIBAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): self.CM.debug("Do Audit %s"%self.name()) passed = 1 ccm_partitions = self.CM.find_partitions() if len(ccm_partitions) == 0: self.CM.debug("\tNo partitions to audit") return 1 for partition in ccm_partitions: self.CM.debug("\tAuditing CIB consistency for: %s" %partition) partition_passed = 0 if self.audit_cib_contents(partition) == 0: passed = 0 return passed def audit_cib_contents(self, hostlist): passed = 1 first_host = None first_host_xml = "" partition_hosts = hostlist.split() for a_host in partition_hosts: if first_host == None: first_host = a_host first_host_xml = self.store_remote_cib(a_host) #self.CM.debug("Retrieved CIB: %s" % first_host_xml) else: a_host_xml = self.store_remote_cib(a_host) diff_cmd="@sbindir@/crm_diff -c -VV -f -N \'%s\' -O '%s'" % (a_host_xml, first_host_xml) infile, outfile, errfile = os.popen3(diff_cmd) diff_lines = outfile.readlines() for line in diff_lines: if not re.search("", line): passed = 0 self.CM.log("CibDiff[%s-%s]: %s" % (first_host, a_host, line)) else: self.CM.debug("CibDiff[%s-%s] Ignoring: %s" % (first_host, a_host, line)) diff_lines = errfile.readlines() for line in diff_lines: passed = 0 self.CM.log("CibDiff[%s-%s] ERROR: %s" % (first_host, a_host, line)) return passed def store_remote_cib(self, node): combined = "" first_line = 1 extra_debug = 0 #self.CM.debug("\tRetrieving CIB from: %s" % node) lines = self.CM.rsh.readlines(node, self.CM["CibQuery"]) if extra_debug: self.CM.debug("Start Cib[%s]" % node) for line in lines: combined = combined + line[:-1] if first_line: self.CM.debug("[Cib]" + line) first_line = 0 elif extra_debug: self.CM.debug("[Cib]" + line) if extra_debug: self.CM.debug("End Cib[%s]" % node) #self.CM.debug("Complete CIB: %s" % combined) return combined def name(self): return "CibAudit" def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 class PartitionAudit(ClusterAudit): def __init__(self, cm): self.CM = cm self.Stats = {"calls":0 , "success":0 , "failure":0 , "skipped":0 , "auditfail":0} self.NodeEpoche={} self.NodeState={} self.NodeQuorum={} def has_key(self, key): return self.Stats.has_key(key) def __setitem__(self, key, value): self.Stats[key] = value def __getitem__(self, key): return self.Stats[key] def incr(self, name): '''Increment (or initialize) the value associated with the given name''' if not self.Stats.has_key(name): self.Stats[name]=0 self.Stats[name] = self.Stats[name]+1 def __call__(self): self.CM.debug("Do Audit %s"%self.name()) passed = 1 ccm_partitions = self.CM.find_partitions() if ccm_partitions == None or len(ccm_partitions) == 0: return 1 if len(ccm_partitions) > 1: self.CM.log("ERROR: %d cluster partitions detected:" %len(ccm_partitions)) passed = 0 for partition in ccm_partitions: self.CM.log("\t %s" %partition) for partition in ccm_partitions: partition_passed = 0 if self.audit_partition(partition) == 0: passed = 0 return passed def trim_string(self, avalue): if not avalue: return None if len(avalue) > 1: return avalue[:-1] def trim2int(self, avalue): if not avalue: return None if len(avalue) > 1: return int(avalue[:-1]) def audit_partition(self, partition): passed = 1 dc_found = [] dc_allowed_list = [] lowest_epoche = None node_list = partition.split() self.CM.debug("Auditing partition: %s" %(partition)) for node in node_list: if self.CM.ShouldBeStatus[node] != self.CM["up"]: self.CM.log("Warn: Node %s appeared out of nowhere" %(node)) self.CM.ShouldBeStatus[node] = self.CM["up"] # not in itself a reason to fail the audit (not what we're # checking for in this audit) self.NodeState[node] = self.CM.rsh.readaline( node, self.CM["StatusCmd"]%node) self.NodeEpoche[node] = self.CM.rsh.readaline( node, self.CM["EpocheCmd"]) self.NodeQuorum[node] = self.CM.rsh.readaline( node, self.CM["QuorumCmd"]) self.CM.debug("Node %s: %s - %s - %s." %(node, self.NodeState[node], self.NodeEpoche[node], self.NodeQuorum[node])) self.NodeState[node] = self.trim_string(self.NodeState[node]) self.NodeEpoche[node] = self.trim2int(self.NodeEpoche[node]) self.NodeQuorum[node] = self.trim_string(self.NodeQuorum[node]) if not self.NodeEpoche[node]: self.CM.log("Warn: Node %s dissappeared: cant determin epoche" %(node)) self.CM.ShouldBeStatus[node] = self.CM["down"] # not in itself a reason to fail the audit (not what we're # checking for in this audit) elif lowest_epoche == None or self.NodeEpoche[node] < lowest_epoche: lowest_epoche = self.NodeEpoche[node] if not lowest_epoche: self.CM.log("Lowest epoche not determined in %s" % (partition)) passed = 0 for node in node_list: if self.CM.ShouldBeStatus[node] == self.CM["up"]: if self.CM.is_node_dc(node, self.NodeState[node]): dc_found.append(node) if self.NodeEpoche[node] == lowest_epoche: self.CM.debug("%s: OK" % node) elif not self.NodeEpoche[node]: self.CM.debug("Check on %s ignored: no node epoche" % node) elif not lowest_epoche: self.CM.debug("Check on %s ignored: no lowest epoche" % node) else: self.CM.log("DC %s is not the oldest node (%d vs. %d)" %(node, self.NodeEpoche[node], lowest_epoche)) passed = 0 if len(dc_found) == 0: self.CM.log("DC not found on any of the %d allowed nodes: %s (of %s)" %(len(dc_allowed_list), str(dc_allowed_list), str(node_list))) elif len(dc_found) > 1: self.CM.log("%d DCs (%s) found in cluster partition: %s" %(len(dc_found), str(dc_found), str(node_list))) passed = 0 if passed == 0: for node in node_list: if self.CM.ShouldBeStatus[node] == self.CM["up"]: self.CM.log("epoche %s : %s" %(self.NodeEpoche[node], self.NodeState[node])) return passed def name(self): return "PartitionAudit" def is_applicable(self): if self.CM["Name"] == "linux-ha-v2": return 1 if self.CM["Name"] == "crm-ais": return 1 return 0 AllAuditClasses.append(DiskAudit) AllAuditClasses.append(LogAudit) AllAuditClasses.append(CrmdStateAudit) AllAuditClasses.append(PartitionAudit) AllAuditClasses.append(ResourceAudit) AllAuditClasses.append(PrimitiveAudit) AllAuditClasses.append(GroupAudit) AllAuditClasses.append(CloneAudit) AllAuditClasses.append(ColocationAudit) AllAuditClasses.append(CIBAudit) def AuditList(cm): result = [] for auditclass in AllAuditClasses: result.append(auditclass(cm)) return result diff --git a/pengine/allocate.c b/pengine/allocate.c index 0b24e771e8..2f1b09306d 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1,850 +1,853 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include void set_alloc_actions(pe_working_set_t *data_set); void migrate_reload_madness(pe_working_set_t *data_set); resource_alloc_functions_t resource_class_alloc_functions[] = { { native_merge_weights, native_color, native_create_actions, native_create_probe, native_internal_constraints, native_rsc_colocation_lh, native_rsc_colocation_rh, native_rsc_order_lh, native_rsc_order_rh, native_rsc_location, native_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, }, { group_merge_weights, group_color, group_create_actions, native_create_probe, group_internal_constraints, group_rsc_colocation_lh, group_rsc_colocation_rh, group_rsc_order_lh, group_rsc_order_rh, group_rsc_location, group_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, }, { native_merge_weights, clone_color, clone_create_actions, clone_create_probe, clone_internal_constraints, clone_rsc_colocation_lh, clone_rsc_colocation_rh, clone_rsc_order_lh, clone_rsc_order_rh, clone_rsc_location, clone_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, }, { native_merge_weights, master_color, master_create_actions, clone_create_probe, master_internal_constraints, clone_rsc_colocation_lh, master_rsc_colocation_rh, clone_rsc_order_lh, clone_rsc_order_rh, clone_rsc_location, clone_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, } }; static gboolean check_rsc_parameters(resource_t *rsc, node_t *node, xmlNode *rsc_entry, pe_working_set_t *data_set) { int attr_lpc = 0; gboolean force_restart = FALSE; gboolean delete_resource = FALSE; const char *value = NULL; const char *old_value = NULL; const char *attr_list[] = { XML_ATTR_TYPE, XML_AGENT_ATTR_CLASS, XML_AGENT_ATTR_PROVIDER }; for(; attr_lpc < DIMOF(attr_list); attr_lpc++) { value = crm_element_value(rsc->xml, attr_list[attr_lpc]); old_value = crm_element_value(rsc_entry, attr_list[attr_lpc]); if(value == old_value /* ie. NULL */ || crm_str_eq(value, old_value, TRUE)) { continue; } force_restart = TRUE; crm_notice("Forcing restart of %s on %s, %s changed: %s -> %s", rsc->id, node->details->uname, attr_list[attr_lpc], crm_str(old_value), crm_str(value)); } if(force_restart) { /* make sure the restart happens */ stop_action(rsc, node, FALSE); set_bit(rsc->flags, pe_rsc_start_pending); delete_resource = TRUE; } return delete_resource; } static gboolean check_action_definition(resource_t *rsc, node_t *active_node, xmlNode *xml_op, pe_working_set_t *data_set) { char *key = NULL; int interval = 0; const char *interval_s = NULL; gboolean did_change = FALSE; gboolean start_op = FALSE; xmlNode *params_all = NULL; xmlNode *params_restart = NULL; GHashTable *local_rsc_params = NULL; char *digest_all_calc = NULL; const char *digest_all = NULL; const char *restart_list = NULL; const char *digest_restart = NULL; char *digest_restart_calc = NULL; action_t *action = NULL; const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *op_version = crm_element_value(xml_op, XML_ATTR_CRM_VERSION); CRM_CHECK(active_node != NULL, return FALSE); interval_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); /* we need to reconstruct the key because of the way we used to construct resource IDs */ key = generate_op_key(rsc->id, task, interval); if(interval > 0) { xmlNode *op_match = NULL; crm_debug_2("Checking parameters for %s", key); op_match = find_rsc_op_entry(rsc, key); if(op_match == NULL && data_set->stop_action_orphans) { /* create a cancel action */ action_t *cancel = NULL; - char *cancel_key = NULL; + char *cancel_key = crm_strdup(key); const char *call_id = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); crm_info("Orphan action will be stopped: %s on %s", key, active_node->details->uname); - cancel_key = generate_op_key( - rsc->id, CRMD_ACTION_CANCEL, interval); + /* cancel_key = generate_op_key( */ + /* rsc->id, CRMD_ACTION_CANCEL, interval); */ cancel = custom_action( rsc, cancel_key, CRMD_ACTION_CANCEL, active_node, FALSE, TRUE, data_set); + crm_free(cancel->task); + cancel->task = crm_strdup(CRMD_ACTION_CANCEL); + add_hash_param(cancel->meta, XML_LRM_ATTR_TASK, task); add_hash_param(cancel->meta, XML_LRM_ATTR_CALLID, call_id); add_hash_param(cancel->meta, XML_LRM_ATTR_INTERVAL, interval_s); custom_action_order( rsc, stop_key(rsc), NULL, rsc, NULL, cancel, pe_order_optional, data_set); crm_free(key); key = NULL; return TRUE; } else if(op_match == NULL) { crm_debug("Orphan action detected: %s on %s", key, active_node->details->uname); crm_free(key); key = NULL; return TRUE; } } action = custom_action(rsc, key, task, active_node, TRUE, FALSE, data_set); local_rsc_params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes( rsc->xml, XML_TAG_ATTR_SETS, active_node->details->attrs, local_rsc_params, NULL, data_set->now); params_all = create_xml_node(NULL, XML_TAG_PARAMS); g_hash_table_foreach(action->extra, hash2field, params_all); g_hash_table_foreach(rsc->parameters, hash2field, params_all); g_hash_table_foreach(action->meta, hash2metafield, params_all); g_hash_table_foreach(local_rsc_params, hash2field, params_all); filter_action_parameters(params_all, op_version); digest_all_calc = calculate_xml_digest(params_all, TRUE, FALSE); digest_all = crm_element_value(xml_op, XML_LRM_ATTR_OP_DIGEST); digest_restart = crm_element_value(xml_op, XML_LRM_ATTR_RESTART_DIGEST); restart_list = crm_element_value(xml_op, XML_LRM_ATTR_OP_RESTART); if(crm_str_eq(task, CRMD_ACTION_START, TRUE)) { start_op = TRUE; } if(start_op && digest_restart) { params_restart = copy_xml(params_all); if(restart_list) { filter_reload_parameters(params_restart, restart_list); } digest_restart_calc = calculate_xml_digest(params_restart, TRUE, FALSE); if(safe_str_neq(digest_restart_calc, digest_restart)) { did_change = TRUE; crm_log_xml_info(params_restart, "params:restart"); crm_warn("Parameters to %s on %s changed: recorded %s vs. %s (restart:%s) %s", key, active_node->details->uname, crm_str(digest_restart), digest_restart_calc, op_version, crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); key = generate_op_key(rsc->id, task, interval); custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); goto cleanup; } } if(safe_str_neq(digest_all_calc, digest_all)) { action_t *op = NULL; did_change = TRUE; crm_log_xml_info(params_all, "params:all"); crm_warn("Parameters to %s on %s changed: recorded %s vs. %s (all:%s) %s", key, active_node->details->uname, crm_str(digest_all), digest_all_calc, op_version, crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); key = generate_op_key(rsc->id, task, interval); op = custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); if(start_op && digest_restart) { op->allow_reload_conversion = TRUE; } else if(interval > 0) { custom_action_order(rsc, start_key(rsc), NULL, NULL, crm_strdup(op->task), op, pe_order_runnable_left, data_set); } } cleanup: free_xml(params_all); free_xml(params_restart); crm_free(digest_all_calc); crm_free(digest_restart_calc); g_hash_table_destroy(local_rsc_params); pe_free_action(action); return did_change; } extern gboolean DeleteRsc(resource_t *rsc, node_t *node, gboolean optional, pe_working_set_t *data_set); static void check_actions_for(xmlNode *rsc_entry, node_t *node, pe_working_set_t *data_set) { const char *id = NULL; const char *task = NULL; int interval = 0; const char *interval_s = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; const char *rsc_id = ID(rsc_entry); gboolean is_probe = FALSE; int start_index = 0, stop_index = 0; resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); CRM_CHECK(rsc != NULL, return); CRM_CHECK(node != NULL, return); CRM_CHECK(rsc_id != NULL, return); if(is_set(rsc->flags, pe_rsc_orphan)) { crm_debug_2("Skipping param check for %s: orphan", rsc->id); return; } else if(pe_find_node_id(rsc->running_on, node->details->id) == NULL) { crm_debug_2("Skipping param check for %s: no longer active on %s", rsc->id, node->details->uname); return; } crm_debug_3("Processing %s on %s", rsc->id, node->details->uname); if(check_rsc_parameters(rsc, node, rsc_entry, data_set)) { DeleteRsc(rsc, node, FALSE, data_set); } xml_child_iter_filter( rsc_entry, rsc_op, XML_LRM_TAG_RSC_OP, op_list = g_list_append(op_list, rsc_op); ); sorted_op_list = g_list_sort(op_list, sort_op_by_callid); calculate_active_ops(sorted_op_list, &start_index, &stop_index); slist_iter( rsc_op, xmlNode, sorted_op_list, lpc, if(start_index < stop_index) { /* stopped */ continue; } else if(lpc < start_index) { /* action occurred prior to a start */ continue; } id = ID(rsc_op); is_probe = FALSE; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); interval_s = crm_element_value(rsc_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); if(interval == 0 && safe_str_eq(task, CRMD_ACTION_STATUS)) { is_probe = TRUE; } if(is_probe || safe_str_eq(task, CRMD_ACTION_START) || interval > 0) { check_action_definition(rsc, node, rsc_op, data_set); } ); g_list_free(sorted_op_list); } static void check_actions(pe_working_set_t *data_set) { const char *id = NULL; node_t *node = NULL; xmlNode *lrm_rscs = NULL; xmlNode *status = get_object_root(XML_CIB_TAG_STATUS, data_set->input); xml_child_iter_filter( status, node_state, XML_CIB_TAG_STATE, id = crm_element_value(node_state, XML_ATTR_ID); lrm_rscs = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); lrm_rscs = find_xml_node(lrm_rscs, XML_LRM_TAG_RESOURCES, FALSE); node = pe_find_node_id(data_set->nodes, id); if(node == NULL) { continue; } else if(can_run_resources(node) == FALSE) { crm_debug_2("Skipping param check for %s: cant run resources", node->details->uname); continue; } crm_debug_2("Processing node %s", node->details->uname); if(node->details->online || data_set->stonith_enabled) { xml_child_iter_filter( lrm_rscs, rsc_entry, XML_LRM_TAG_RESOURCE, if(xml_has_children(rsc_entry)) { check_actions_for(rsc_entry, node, data_set); } ); } ); } static gboolean apply_placement_constraints(pe_working_set_t *data_set) { crm_debug_3("Applying constraints..."); slist_iter( cons, rsc_to_node_t, data_set->placement_constraints, lpc, cons->rsc_lh->cmds->rsc_location(cons->rsc_lh, cons); ); return TRUE; } static void complex_set_cmds(resource_t *rsc) { rsc->cmds = &resource_class_alloc_functions[rsc->variant]; slist_iter( child_rsc, resource_t, rsc->children, lpc, complex_set_cmds(child_rsc); ); } void set_alloc_actions(pe_working_set_t *data_set) { slist_iter( rsc, resource_t, data_set->resources, lpc, complex_set_cmds(rsc); ); } gboolean stage0(pe_working_set_t *data_set) { xmlNode * cib_constraints = get_object_root( XML_CIB_TAG_CONSTRAINTS, data_set->input); if(data_set->input == NULL) { return FALSE; } cluster_status(data_set); set_alloc_actions(data_set); unpack_constraints(cib_constraints, data_set); return TRUE; } /* * Check nodes for resources started outside of the LRM */ gboolean probe_resources(pe_working_set_t *data_set) { action_t *probe_complete = NULL; action_t *probe_node_complete = NULL; slist_iter( node, node_t, data_set->nodes, lpc, gboolean force_probe = FALSE; const char *probed = g_hash_table_lookup( node->details->attrs, CRM_OP_PROBED); crm_debug_2("%s probed: %s", node->details->uname, probed); if(node->details->online == FALSE) { continue; } else if(node->details->unclean) { continue; } else if(probe_complete == NULL) { probe_complete = get_pseudo_op(CRM_OP_PROBED, data_set); } if(probed != NULL && crm_is_true(probed) == FALSE) { force_probe = TRUE; } probe_node_complete = custom_action( NULL, crm_strdup(CRM_OP_PROBED), CRM_OP_PROBED, node, FALSE, TRUE, data_set); probe_node_complete->optional = crm_is_true(probed); probe_node_complete->priority = INFINITY; add_hash_param(probe_node_complete->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); custom_action_order(NULL, NULL, probe_node_complete, NULL, NULL, probe_complete, pe_order_optional, data_set); slist_iter( rsc, resource_t, data_set->resources, lpc2, if(rsc->cmds->create_probe( rsc, node, probe_node_complete, force_probe, data_set)) { probe_complete->optional = FALSE; probe_node_complete->optional = FALSE; custom_action_order( NULL, NULL, probe_complete, rsc, start_key(rsc), NULL, pe_order_optional, data_set); } ); ); return TRUE; } /* * Count how many valid nodes we have (so we know the maximum number of * colors we can resolve). * * Apply node constraints (ie. filter the "allowed_nodes" part of resources */ gboolean stage2(pe_working_set_t *data_set) { crm_debug_3("Applying placement constraints"); slist_iter( node, node_t, data_set->nodes, lpc, if(node == NULL) { /* error */ } else if(node->weight >= 0.0 /* global weight */ && node->details->online && node->details->type == node_member) { data_set->max_valid_nodes++; } ); apply_placement_constraints(data_set); return TRUE; } /* * Create internal resource constraints before allocation */ gboolean stage3(pe_working_set_t *data_set) { slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->internal_constraints(rsc, data_set); ); return TRUE; } /* * Check for orphaned or redefined actions */ gboolean stage4(pe_working_set_t *data_set) { check_actions(data_set); return TRUE; } gboolean stage5(pe_working_set_t *data_set) { /* Take (next) highest resource, assign it and create its actions */ slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->color(rsc, data_set); ); probe_resources(data_set); slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->create_actions(rsc, data_set); ); return TRUE; } /* * Create dependancies for stonith and shutdown operations */ gboolean stage6(pe_working_set_t *data_set) { action_t *dc_down = NULL; action_t *stonith_op = NULL; action_t *last_stonith = NULL; gboolean integrity_lost = FALSE; action_t *ready = get_pseudo_op(STONITH_UP, data_set); action_t *all_stopped = get_pseudo_op(ALL_STOPPED, data_set); crm_debug_3("Processing fencing and shutdown cases"); slist_iter( node, node_t, data_set->nodes, lpc, stonith_op = NULL; if(node->details->unclean && data_set->stonith_enabled && (data_set->have_quorum || data_set->no_quorum_policy == no_quorum_ignore)) { pe_warn("Scheduling Node %s for STONITH", node->details->uname); stonith_op = custom_action( NULL, crm_strdup(CRM_OP_FENCE), CRM_OP_FENCE, node, FALSE, TRUE, data_set); add_hash_param( stonith_op->meta, XML_LRM_ATTR_TARGET, node->details->uname); add_hash_param( stonith_op->meta, XML_LRM_ATTR_TARGET_UUID, node->details->id); add_hash_param( stonith_op->meta, "stonith_action", data_set->stonith_action); stonith_constraints(node, stonith_op, data_set); order_actions(ready, stonith_op, pe_order_implies_left); order_actions(stonith_op, all_stopped, pe_order_implies_right); if(node->details->is_dc) { dc_down = stonith_op; } else { if(last_stonith) { order_actions(last_stonith, stonith_op, pe_order_implies_left); } last_stonith = stonith_op; } } else if(node->details->online && node->details->shutdown) { action_t *down_op = NULL; crm_info("Scheduling Node %s for shutdown", node->details->uname); down_op = custom_action( NULL, crm_strdup(CRM_OP_SHUTDOWN), CRM_OP_SHUTDOWN, node, FALSE, TRUE, data_set); shutdown_constraints(node, down_op, data_set); if(node->details->is_dc) { dc_down = down_op; } } if(node->details->unclean && stonith_op == NULL) { integrity_lost = TRUE; pe_warn("Node %s is unclean!", node->details->uname); } ); if(integrity_lost) { if(data_set->have_quorum == FALSE) { crm_notice("Cannot fence unclean nodes until quorum is" " attained (or no_quorum_policy is set to ignore)"); } else if(data_set->stonith_enabled == FALSE) { pe_warn("YOUR RESOURCES ARE NOW LIKELY COMPROMISED"); pe_err("ENABLE STONITH TO KEEP YOUR RESOURCES SAFE"); } } if(dc_down != NULL) { GListPtr shutdown_matches = find_actions( data_set->actions, CRM_OP_SHUTDOWN, NULL); crm_debug_2("Ordering shutdowns before %s on %s (DC)", dc_down->task, dc_down->node->details->uname); add_hash_param(dc_down->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); slist_iter( node_stop, action_t, shutdown_matches, lpc, if(node_stop->node->details->is_dc) { continue; } crm_debug("Ordering shutdown on %s before %s on %s", node_stop->node->details->uname, dc_down->task, dc_down->node->details->uname); order_actions(node_stop, dc_down, pe_order_implies_left); ); if(last_stonith && dc_down != last_stonith) { order_actions(last_stonith, dc_down, pe_order_implies_left); } g_list_free(shutdown_matches); } return TRUE; } /* * Determin the sets of independant actions and the correct order for the * actions in each set. * * Mark dependencies of un-runnable actions un-runnable * */ gboolean stage7(pe_working_set_t *data_set) { crm_debug_4("Applying ordering constraints"); slist_iter( order, order_constraint_t, data_set->ordering_constraints, lpc, resource_t *rsc = order->lh_rsc; crm_debug_2("Applying ordering constraint: %d", order->id); if(rsc != NULL) { crm_debug_4("rsc_action-to-*"); rsc->cmds->rsc_order_lh(rsc, order, data_set); continue; } rsc = order->rh_rsc; if(rsc != NULL) { crm_debug_4("action-to-rsc_action"); rsc->cmds->rsc_order_rh(order->lh_action, rsc, order); } else { crm_debug_4("action-to-action"); order_actions( order->lh_action, order->rh_action, order->type); } ); update_action_states(data_set->actions); slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->migrate_reload(rsc, data_set); ); return TRUE; } int transition_id = -1; /* * Create a dependency graph to send to the transitioner (via the CRMd) */ gboolean stage8(pe_working_set_t *data_set) { const char *value = NULL; char *transition_id_s = NULL; transition_id++; transition_id_s = crm_itoa(transition_id); crm_debug_2("Creating transition graph %d.", transition_id); data_set->graph = create_xml_node(NULL, XML_TAG_GRAPH); value = pe_pref(data_set->config_hash, "cluster-delay"); crm_xml_add(data_set->graph, "cluster-delay", value); crm_xml_add(data_set->graph, "failed-stop-offset", "INFINITY"); value = pe_pref(data_set->config_hash, "start-failure-is-fatal"); if(crm_is_true(value)) { crm_xml_add(data_set->graph, "failed-start-offset", "INFINITY"); } else { crm_xml_add(data_set->graph, "failed-start-offset", "1"); } value = pe_pref(data_set->config_hash, "batch-limit"); crm_xml_add(data_set->graph, "batch-limit", value); crm_xml_add(data_set->graph, "transition_id", transition_id_s); crm_free(transition_id_s); /* errors... slist_iter(action, action_t, action_list, lpc, if(action->optional == FALSE && action->runnable == FALSE) { print_action("Ignoring", action, TRUE); } ); */ slist_iter( rsc, resource_t, data_set->resources, lpc, crm_debug_4("processing actions for rsc=%s", rsc->id); rsc->cmds->expand(rsc, data_set); ); crm_log_xml_debug_3( data_set->graph, "created resource-driven action list"); /* catch any non-resource specific actions */ crm_debug_4("processing non-resource actions"); slist_iter( action, action_t, data_set->actions, lpc, graph_element_from_action(action, data_set); ); crm_log_xml_debug_3(data_set->graph, "created generic action list"); crm_debug_2("Created transition graph %d.", transition_id); return TRUE; } void cleanup_alloc_calculations(pe_working_set_t *data_set) { if(data_set == NULL) { return; } crm_debug_3("deleting order cons: %p", data_set->ordering_constraints); pe_free_ordering(data_set->ordering_constraints); data_set->ordering_constraints = NULL; crm_debug_3("deleting node cons: %p", data_set->placement_constraints); pe_free_rsc_to_node(data_set->placement_constraints); data_set->placement_constraints = NULL; crm_debug_3("deleting inter-resource cons: %p", data_set->colocation_constraints); pe_free_shallow(data_set->colocation_constraints); data_set->colocation_constraints = NULL; cleanup_calculations(data_set); } diff --git a/pengine/master.c b/pengine/master.c index a6832b2e96..c84fd42a50 100644 --- a/pengine/master.c +++ b/pengine/master.c @@ -1,850 +1,850 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #define VARIANT_CLONE 1 #include extern gint sort_clone_instance(gconstpointer a, gconstpointer b); extern void clone_create_notifications( resource_t *rsc, action_t *action, action_t *action_complete, pe_working_set_t *data_set); extern int master_score(resource_t *rsc, node_t *node, int not_set_value); static void child_promoting_constraints( clone_variant_data_t *clone_data, enum pe_ordering type, resource_t *rsc, resource_t *child, resource_t *last, pe_working_set_t *data_set) { if(child == NULL) { if(clone_data->ordered && last != NULL) { crm_debug_4("Ordered version (last node)"); /* last child promote before promoted started */ custom_action_order( last, promote_key(last), NULL, rsc, promoted_key(rsc), NULL, type, data_set); } return; } /* child promote before global promoted */ custom_action_order( child, promote_key(child), NULL, rsc, promoted_key(rsc), NULL, type, data_set); /* global promote before child promote */ custom_action_order( rsc, promote_key(rsc), NULL, child, promote_key(child), NULL, type, data_set); if(clone_data->ordered) { crm_debug_4("Ordered version"); if(last == NULL) { /* global promote before first child promote */ last = rsc; } /* else: child/child relative promote */ order_start_start(last, child, type); custom_action_order( last, promote_key(last), NULL, child, promote_key(child), NULL, type, data_set); } else { crm_debug_4("Un-ordered version"); } } static void child_demoting_constraints( clone_variant_data_t *clone_data, enum pe_ordering type, resource_t *rsc, resource_t *child, resource_t *last, pe_working_set_t *data_set) { if(child == NULL) { if(clone_data->ordered && last != NULL) { crm_debug_4("Ordered version (last node)"); /* global demote before first child demote */ custom_action_order( rsc, demote_key(rsc), NULL, last, demote_key(last), NULL, pe_order_implies_left, data_set); } return; } /* child demote before global demoted */ custom_action_order( child, demote_key(child), NULL, rsc, demoted_key(rsc), NULL, pe_order_implies_right_printed, data_set); /* global demote before child demote */ custom_action_order( rsc, demote_key(rsc), NULL, child, demote_key(child), NULL, pe_order_implies_left_printed, data_set); if(clone_data->ordered && last != NULL) { crm_debug_4("Ordered version"); /* child/child relative demote */ custom_action_order(child, demote_key(child), NULL, last, demote_key(last), NULL, type, data_set); } else if(clone_data->ordered) { crm_debug_4("Ordered version (1st node)"); /* first child stop before global stopped */ custom_action_order( child, demote_key(child), NULL, rsc, demoted_key(rsc), NULL, type, data_set); } else { crm_debug_4("Un-ordered version"); } } static void master_update_pseudo_status( resource_t *rsc, gboolean *demoting, gboolean *promoting) { if(rsc->children) { slist_iter(child, resource_t, rsc->children, lpc, master_update_pseudo_status(child, demoting, promoting) ); return; } CRM_ASSERT(demoting != NULL); CRM_ASSERT(promoting != NULL); slist_iter( action, action_t, rsc->actions, lpc, if(*promoting && *demoting) { return; } else if(action->optional) { continue; } else if(safe_str_eq(CRMD_ACTION_DEMOTE, action->task)) { *demoting = TRUE; } else if(safe_str_eq(CRMD_ACTION_PROMOTE, action->task)) { *promoting = TRUE; } ); } #define apply_master_location(list) \ slist_iter( \ cons, rsc_to_node_t, list, lpc2, \ cons_node = NULL; \ if(cons->role_filter == RSC_ROLE_MASTER) { \ crm_debug_2("Applying %s to %s", \ cons->id, child_rsc->id); \ cons_node = pe_find_node_id( \ cons->node_list_rh, chosen->details->id); \ } \ if(cons_node != NULL) { \ int new_priority = merge_weights( \ child_rsc->priority, cons_node->weight); \ crm_debug_2("\t%s: %d->%d (%d)", child_rsc->id, \ child_rsc->priority, new_priority, cons_node->weight); \ child_rsc->priority = new_priority; \ } \ ); static node_t * can_be_master(resource_t *rsc) { node_t *node = NULL; node_t *local_node = NULL; resource_t *parent = uber_parent(rsc); clone_variant_data_t *clone_data = NULL; int level = LOG_DEBUG_2; #if 0 enum rsc_role_e role = RSC_ROLE_UNKNOWN; role = rsc->fns->state(rsc, FALSE); crm_info("%s role: %s", rsc->id, role2text(role)); #endif if(rsc->children) { slist_iter( child, resource_t, rsc->children, lpc, if(can_be_master(child) == NULL) { do_crm_log(level, "Child %s of %s can't be promoted", child->id, rsc->id); return NULL; } ); } node = rsc->fns->location(rsc, NULL, FALSE); if(rsc->priority < 0) { do_crm_log(level, "%s cannot be master: preference: %d", rsc->id, rsc->priority); return NULL; } else if(node == NULL) { do_crm_log(level, "%s cannot be master: not allocated", rsc->id); return NULL; } else if(can_run_resources(node) == FALSE) { do_crm_log(level, "Node cant run any resources: %s", node->details->uname); return NULL; } get_clone_variant_data(clone_data, parent); local_node = pe_find_node_id( parent->allowed_nodes, node->details->id); if(local_node == NULL) { crm_err("%s cannot run on %s: node not allowed", rsc->id, node->details->uname); return NULL; } else if(local_node->count < clone_data->master_node_max) { return local_node; } else { do_crm_log(level, "%s cannot be master on %s: node full", rsc->id, node->details->uname); } return NULL; } static gint sort_master_instance(gconstpointer a, gconstpointer b) { int rc; enum rsc_role_e role1 = RSC_ROLE_UNKNOWN; enum rsc_role_e role2 = RSC_ROLE_UNKNOWN; const resource_t *resource1 = (const resource_t*)a; const resource_t *resource2 = (const resource_t*)b; CRM_ASSERT(resource1 != NULL); CRM_ASSERT(resource2 != NULL); role1 = resource1->fns->state(resource1, TRUE); role2 = resource2->fns->state(resource2, TRUE); rc = sort_rsc_index(a, b); if( rc != 0 ) { return rc; } if(role1 > role2) { return -1; } else if(role1 < role2) { return 1; } return sort_clone_instance(a, b); } static void master_promotion_order(resource_t *rsc) { node_t *node = NULL; node_t *chosen = NULL; clone_variant_data_t *clone_data = NULL; get_clone_variant_data(clone_data, rsc); if(clone_data->merged_master_weights) { return; } clone_data->merged_master_weights = TRUE; crm_debug_2("Merging weights for %s", rsc->id); slist_iter( child, resource_t, rsc->children, lpc, crm_debug_2("%s: %d", child->id, child->sort_index); ); dump_node_scores(LOG_DEBUG_3, rsc, "Before", rsc->allowed_nodes); slist_iter( child, resource_t, rsc->children, lpc, chosen = child->fns->location(child, NULL, FALSE); if(chosen == NULL || child->sort_index < 0) { crm_debug_3("Skipping %s", child->id); continue; } node = (node_t*)pe_find_node_id( rsc->allowed_nodes, chosen->details->id); CRM_ASSERT(node != NULL); /* adds in master preferences and rsc_location.role=Master */ node->weight = merge_weights(child->sort_index, node->weight); ); dump_node_scores(LOG_DEBUG_3, rsc, "Middle", rsc->allowed_nodes); slist_iter( constraint, rsc_colocation_t, rsc->rsc_cons_lhs, lpc, /* (re-)adds location preferences of resource that wish to be * colocated with the master instance */ if(constraint->role_rh == RSC_ROLE_MASTER) { rsc->allowed_nodes = constraint->rsc_lh->cmds->merge_weights( constraint->rsc_lh, rsc->id, rsc->allowed_nodes, constraint->score/INFINITY, TRUE); } ); dump_node_scores(LOG_DEBUG_3, rsc, "After", rsc->allowed_nodes); /* write them back and sort */ slist_iter( child, resource_t, rsc->children, lpc, chosen = child->fns->location(child, NULL, FALSE); if(chosen == NULL || child->sort_index < 0) { crm_debug_2("%s: %d", child->id, child->sort_index); continue; } node = (node_t*)pe_find_node_id( rsc->allowed_nodes, chosen->details->id); CRM_ASSERT(node != NULL); child->sort_index = node->weight; crm_debug_2("%s: %d", child->id, child->sort_index); ); rsc->children = g_list_sort(rsc->children, sort_master_instance); } int master_score(resource_t *rsc, node_t *node, int not_set_value) { char *attr_name; char *name = rsc->id; const char *attr_value; int score = not_set_value, len = 0; if(rsc->fns->state(rsc, TRUE) < RSC_ROLE_STARTED) { return score; } if(rsc->running_on) { node_t *match = pe_find_node_id(rsc->allowed_nodes, node->details->id); if(match->weight < 0) { crm_debug_2("%s on %s has score: %d - ignoring master pref", rsc->id, match->details->uname, match->weight); return score; } } #if 0 if(rsc->clone_name) { name = rsc->clone_name; crm_err("%s ::= %s", rsc->id, rsc->clone_name); } #endif len = 8 + strlen(name); crm_malloc0(attr_name, len); sprintf(attr_name, "master-%s", name); crm_debug_3("looking for %s on %s", attr_name, node->details->uname); attr_value = g_hash_table_lookup( node->details->attrs, attr_name); if(attr_value == NULL) { crm_free(attr_name); len = 8 + strlen(rsc->long_name); crm_malloc0(attr_name, len); sprintf(attr_name, "master-%s", rsc->long_name); crm_debug_3("looking for %s on %s", attr_name, node->details->uname); attr_value = g_hash_table_lookup( node->details->attrs, attr_name); } if(attr_value != NULL) { crm_debug_2("%s[%s] = %s", attr_name, node->details->uname, crm_str(attr_value)); score = char2score(attr_value); } crm_free(attr_name); return score; } #define max(a, b) aapplied_master_prefs) { /* Make sure we only do this once */ return; } clone_data->applied_master_prefs = TRUE; slist_iter( child_rsc, resource_t, rsc->children, lpc, slist_iter( node, node_t, child_rsc->allowed_nodes, lpc, if(can_run_resources(node) == FALSE) { /* This node will never be promoted to master, * so don't apply the master score as that may * lead to clone shuffling */ continue; } score = master_score(child_rsc, node, 0); new_score = merge_weights(node->weight, score); if(new_score != node->weight) { crm_debug_2("\t%s: Updating preference for %s (%d->%d)", child_rsc->id, node->details->uname, node->weight, new_score); node->weight = new_score; } new_score = max(child_rsc->priority, score); if(new_score != child_rsc->priority) { crm_debug_2("\t%s: Updating priority (%d->%d)", child_rsc->id, child_rsc->priority, new_score); child_rsc->priority = new_score; } ); ); } static void set_role(resource_t *rsc, enum rsc_role_e role, gboolean current) { if(current) { if(rsc->variant == pe_native && rsc->running_on != NULL && role > RSC_ROLE_STOPPED) { crm_debug_6("Filtering change %s.role = %s (was %s)", rsc->id, role2text(role), role2text(rsc->role)); } else if(rsc->role != role) { crm_debug_5("Set %s.role = %s (was %s)", rsc->id, role2text(role), role2text(rsc->role)); rsc->role = role; } } else { if(rsc->next_role != role) { crm_debug_5("Set %s.next_role = %s (was %s)", rsc->id, role2text(role), role2text(rsc->next_role)); rsc->next_role = role; if(role == RSC_ROLE_MASTER) { add_hash_param(rsc->parameters, crm_meta_name("role"), role2text(role)); } } } slist_iter( child_rsc, resource_t, rsc->children, lpc, set_role(child_rsc, role, current); ); } node_t * master_color(resource_t *rsc, pe_working_set_t *data_set) { int promoted = 0; node_t *chosen = NULL; node_t *cons_node = NULL; enum rsc_role_e next_role = RSC_ROLE_UNKNOWN; clone_variant_data_t *clone_data = NULL; get_clone_variant_data(clone_data, rsc); apply_master_prefs(rsc); clone_color(rsc, data_set); /* count now tracks the number of masters allocated */ slist_iter(node, node_t, rsc->allowed_nodes, lpc, node->count = 0; ); /* * assign priority */ slist_iter( child_rsc, resource_t, rsc->children, lpc, GListPtr list = NULL; crm_debug_2("Assigning priority for %s", child_rsc->id); if(child_rsc->fns->state(child_rsc, TRUE) == RSC_ROLE_STARTED) { set_role(child_rsc, RSC_ROLE_SLAVE, TRUE); } chosen = child_rsc->fns->location(child_rsc, &list, FALSE); if(g_list_length(list) > 1) { crm_config_err("Cannot promote non-colocated child %s", child_rsc->id); } g_list_free(list); if(chosen == NULL) { continue; } next_role = child_rsc->fns->state(child_rsc, FALSE); switch(next_role) { case RSC_ROLE_STARTED: CRM_CHECK(chosen != NULL, break); /* * Default to -1 if no value is set * * This allows master locations to be specified * based solely on rsc_location constraints, * but prevents anyone from being promoted if * neither a constraint nor a master-score is present */ child_rsc->priority = master_score(child_rsc, chosen, -1); break; case RSC_ROLE_SLAVE: case RSC_ROLE_STOPPED: child_rsc->priority = -INFINITY; break; case RSC_ROLE_MASTER: /* the only reason we should be here is if * we're re-creating actions after a stonith */ promoted++; break; default: CRM_CHECK(FALSE/* unhandled */, crm_err("Unknown resource role: %d for %s", next_role, child_rsc->id)); } apply_master_location(child_rsc->rsc_location); apply_master_location(rsc->rsc_location); slist_iter( cons, rsc_colocation_t, child_rsc->rsc_cons, lpc2, child_rsc->cmds->rsc_colocation_lh(child_rsc, cons->rsc_rh, cons); ); child_rsc->sort_index = child_rsc->priority; crm_debug_2("Assigning priority for %s: %d", child_rsc->id, child_rsc->priority); if(next_role == RSC_ROLE_MASTER) { child_rsc->sort_index = INFINITY; } ); master_promotion_order(rsc); /* mark the first N as masters */ slist_iter( child_rsc, resource_t, rsc->children, lpc, chosen = NULL; crm_debug_2("Processing %s", child_rsc->id); chosen = child_rsc->fns->location(child_rsc, NULL, FALSE); do_crm_log(scores_log_level, "%s promotion score on %s: %d", child_rsc->id, chosen?chosen->details->uname:"none", child_rsc->sort_index); chosen = NULL; /* nuke 'chosen' so that we don't promote more than the * required number of instances */ if(promoted < clone_data->master_max) { chosen = can_be_master(child_rsc); } crm_debug("%s master score: %d", child_rsc->id, child_rsc->priority); if(chosen == NULL) { next_role = child_rsc->fns->state(child_rsc, FALSE); if(next_role == RSC_ROLE_STARTED) { set_role(child_rsc, RSC_ROLE_SLAVE, FALSE); } continue; } chosen->count++; crm_info("Promoting %s", child_rsc->id); set_role(child_rsc, RSC_ROLE_MASTER, FALSE); clone_data->masters_allocated++; promoted++; ); crm_info("%s: Promoted %d instances of a possible %d to master", rsc->id, promoted, clone_data->master_max); return NULL; } void master_create_actions(resource_t *rsc, pe_working_set_t *data_set) { action_t *action = NULL; action_t *action_complete = NULL; gboolean any_promoting = FALSE; gboolean any_demoting = FALSE; resource_t *last_promote_rsc = NULL; resource_t *last_demote_rsc = NULL; clone_variant_data_t *clone_data = NULL; get_clone_variant_data(clone_data, rsc); crm_debug("Creating actions for %s", rsc->id); /* create actions as normal */ clone_create_actions(rsc, data_set); slist_iter( child_rsc, resource_t, rsc->children, lpc, gboolean child_promoting = FALSE; gboolean child_demoting = FALSE; crm_debug_2("Creating actions for %s", child_rsc->id); child_rsc->cmds->create_actions(child_rsc, data_set); master_update_pseudo_status( child_rsc, &child_demoting, &child_promoting); any_demoting = any_demoting || child_demoting; any_promoting = any_promoting || child_promoting; crm_debug_2("Created actions for %s: %d %d", child_rsc->id, child_promoting, child_demoting); ); /* promote */ action = promote_action(rsc, NULL, !any_promoting); action_complete = custom_action( rsc, promoted_key(rsc), CRMD_ACTION_PROMOTED, NULL, !any_promoting, TRUE, data_set); action->pseudo = TRUE; action->runnable = FALSE; action_complete->pseudo = TRUE; action_complete->runnable = FALSE; action_complete->priority = INFINITY; if(clone_data->masters_allocated > 0) { action->runnable = TRUE; action_complete->runnable = TRUE; } child_promoting_constraints(clone_data, pe_order_optional, rsc, NULL, last_promote_rsc, data_set); clone_create_notifications(rsc, action, action_complete, data_set); /* demote */ action = demote_action(rsc, NULL, !any_demoting); action_complete = custom_action( rsc, demoted_key(rsc), CRMD_ACTION_DEMOTED, NULL, !any_demoting, TRUE, data_set); action_complete->priority = INFINITY; action->pseudo = TRUE; action->runnable = TRUE; action_complete->pseudo = TRUE; action_complete->runnable = TRUE; child_demoting_constraints(clone_data, pe_order_optional, rsc, NULL, last_demote_rsc, data_set); clone_create_notifications(rsc, action, action_complete, data_set); /* restore the correct priority */ slist_iter( child_rsc, resource_t, rsc->children, lpc, child_rsc->priority = rsc->priority; ); } void master_internal_constraints(resource_t *rsc, pe_working_set_t *data_set) { resource_t *last_rsc = NULL; clone_variant_data_t *clone_data = NULL; get_clone_variant_data(clone_data, rsc); clone_internal_constraints(rsc, data_set); /* global stopped before start */ custom_action_order( rsc, stopped_key(rsc), NULL, rsc, start_key(rsc), NULL, pe_order_optional, data_set); /* global stopped before promote */ custom_action_order( rsc, stopped_key(rsc), NULL, rsc, promote_key(rsc), NULL, - pe_order_optional|pe_order_test, data_set); + pe_order_optional, data_set); /* global demoted before start */ custom_action_order( rsc, demoted_key(rsc), NULL, rsc, start_key(rsc), NULL, pe_order_optional, data_set); /* global started before promote */ custom_action_order( rsc, started_key(rsc), NULL, rsc, promote_key(rsc), NULL, pe_order_optional, data_set); /* global demoted before stop */ custom_action_order( rsc, demoted_key(rsc), NULL, rsc, stop_key(rsc), NULL, pe_order_optional, data_set); /* global demote before demoted */ custom_action_order( rsc, demote_key(rsc), NULL, rsc, demoted_key(rsc), NULL, pe_order_optional, data_set); /* global demoted before promote */ custom_action_order( rsc, demoted_key(rsc), NULL, rsc, promote_key(rsc), NULL, pe_order_optional, data_set); slist_iter( child_rsc, resource_t, rsc->children, lpc, /* child demote before promote */ custom_action_order( child_rsc, demote_key(child_rsc), NULL, child_rsc, promote_key(child_rsc), NULL, pe_order_optional, data_set); child_promoting_constraints(clone_data, pe_order_optional, rsc, child_rsc, last_rsc, data_set); child_demoting_constraints(clone_data, pe_order_optional, rsc, child_rsc, last_rsc, data_set); last_rsc = child_rsc; ); } static void node_list_update_one(GListPtr list1, node_t *other, int score) { node_t *node = NULL; if(other == NULL) { return; } node = (node_t*)pe_find_node_id(list1, other->details->id); if(node != NULL) { crm_debug_2("%s: %d + %d", node->details->uname, node->weight, other->weight); node->weight = merge_weights(node->weight, score); } } void master_rsc_colocation_rh( resource_t *rsc_lh, resource_t *rsc_rh, rsc_colocation_t *constraint) { clone_variant_data_t *clone_data = NULL; get_clone_variant_data(clone_data, rsc_rh); CRM_CHECK(rsc_rh != NULL, return); if(is_set(rsc_rh->flags, pe_rsc_provisional)) { return; } else if(constraint->role_rh == RSC_ROLE_UNKNOWN) { crm_debug_3("Handling %s as a clone colocation", constraint->id); clone_rsc_colocation_rh(rsc_lh, rsc_rh, constraint); return; } CRM_CHECK(rsc_lh != NULL, return); CRM_CHECK(rsc_lh->variant == pe_native, return); crm_debug_2("Processing constraint %s: %d", constraint->id, constraint->score); if(constraint->role_rh == RSC_ROLE_UNKNOWN) { slist_iter( child_rsc, resource_t, rsc_rh->children, lpc, child_rsc->cmds->rsc_colocation_rh(rsc_lh, child_rsc, constraint); ); } else if(is_set(rsc_lh->flags, pe_rsc_provisional)) { GListPtr lhs = NULL, rhs = NULL; lhs = rsc_lh->allowed_nodes; slist_iter( child_rsc, resource_t, rsc_rh->children, lpc, node_t *chosen = child_rsc->fns->location(child_rsc, NULL, FALSE); enum rsc_role_e next_role = child_rsc->fns->state(child_rsc, FALSE); crm_debug_3("Processing: %s", child_rsc->id); if(chosen != NULL && next_role == constraint->role_rh) { crm_debug_3("Applying: %s %s", child_rsc->id, role2text(next_role)); node_list_update_one(rsc_lh->allowed_nodes, chosen, constraint->score); rhs = g_list_append(rhs, chosen); } ); /* Only do this if its not a master-master colocation * Doing this unconditionally would prevent the slaves from being started */ if(constraint->score > 0 && (constraint->role_lh != RSC_ROLE_MASTER || constraint->role_rh != RSC_ROLE_MASTER)) { rsc_lh->allowed_nodes = node_list_and(lhs, rhs, FALSE); pe_free_shallow(lhs); } pe_free_shallow_adv(rhs, FALSE); } else if(constraint->role_lh == RSC_ROLE_MASTER) { resource_t *rh_child = find_compatible_child(rsc_lh, rsc_rh, constraint->role_rh, FALSE); if(rh_child == NULL && constraint->score >= INFINITY) { crm_debug_2("%s can't be promoted %s", rsc_lh->id, constraint->id); rsc_lh->priority = -INFINITY; } else if(rh_child != NULL) { int new_priority = merge_weights(rsc_lh->priority, constraint->score); crm_debug("Applying %s to %s", constraint->id, rsc_lh->id); crm_debug("\t%s: %d->%d", rsc_lh->id, rsc_lh->priority, new_priority); rsc_lh->priority = new_priority; } } return; } diff --git a/pengine/testcases/orphan-1.dot b/pengine/testcases/orphan-1.dot index 3f202ddd9a..0dc90d36ee 100644 --- a/pengine/testcases/orphan-1.dot +++ b/pengine/testcases/orphan-1.dot @@ -1,41 +1,41 @@ digraph "g" { -"Cancel rsc_c001n02_cancel_5000 c001n02" [ style=bold color="green" fontcolor="black" ] -"Cancel rsc_c001n03_cancel_5000 c001n03" [ style=bold color="green" fontcolor="black" ] +"Cancel rsc_c001n02_monitor_5000 c001n02" [ style=bold color="green" fontcolor="black" ] +"Cancel rsc_c001n03_monitor_5000 c001n03" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "DcIPaddr_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "DcIPaddr_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "DcIPaddr_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "all_stopped" [ style=bold color="green" fontcolor="orange" ] "probe_complete c001n01" -> "probe_complete" [ style = bold] "probe_complete c001n01" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n02" -> "probe_complete" [ style = bold] "probe_complete c001n02" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n03" -> "probe_complete" [ style = bold] "probe_complete c001n03" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n08" -> "probe_complete" [ style = bold] "probe_complete c001n08" [ style=bold color="green" fontcolor="black" ] "probe_complete" [ style=bold color="green" fontcolor="orange" ] "rsc_c001n01_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n01_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n01_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n01_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n01_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n01_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n02_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n02_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n02_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n03_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n03_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n03_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_6000 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_stop_0 c001n08" -> "all_stopped" [ style = bold] "rsc_c001n08_stop_0 c001n08" [ style=bold color="green" fontcolor="black" ] } diff --git a/pengine/testcases/orphan-1.exp b/pengine/testcases/orphan-1.exp index f40afe39c1..5cb98f3382 100644 --- a/pengine/testcases/orphan-1.exp +++ b/pengine/testcases/orphan-1.exp @@ -1,252 +1,252 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/pengine/testcases/params-1.dot b/pengine/testcases/params-1.dot index a9d1e28049..ea35b0b6a1 100644 --- a/pengine/testcases/params-1.dot +++ b/pengine/testcases/params-1.dot @@ -1,52 +1,52 @@ digraph "g" { -"Cancel rsc_c001n02_cancel_5000 c001n02" [ style=bold color="green" fontcolor="black" ] +"Cancel rsc_c001n02_monitor_5000 c001n02" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "DcIPaddr_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "DcIPaddr_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "DcIPaddr_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_5000 c001n02" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_start_0 c001n02" -> "DcIPaddr_monitor_5000 c001n02" [ style = bold] "DcIPaddr_start_0 c001n02" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_stop_0 c001n02" -> "DcIPaddr_start_0 c001n02" [ style = bold] "DcIPaddr_stop_0 c001n02" -> "all_stopped" [ style = bold] "DcIPaddr_stop_0 c001n02" [ style=bold color="green" fontcolor="black" ] "all_stopped" [ style=bold color="green" fontcolor="orange" ] "probe_complete c001n01" -> "probe_complete" [ style = bold] "probe_complete c001n01" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n02" -> "probe_complete" [ style = bold] "probe_complete c001n02" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n03" -> "probe_complete" [ style = bold] "probe_complete c001n03" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n08" -> "probe_complete" [ style = bold] "probe_complete c001n08" [ style=bold color="green" fontcolor="black" ] "probe_complete" -> "DcIPaddr_start_0 c001n02" [ style = bold] "probe_complete" [ style=bold color="green" fontcolor="orange" ] "rsc_c001n01_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n01_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n01_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n01_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n01_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n01_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n02_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n02_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n02_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_6000 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n03_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n03_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n03_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n08_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n08_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n08_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_5000 c001n08" [ style=bold color="green" fontcolor="black" ] } diff --git a/pengine/testcases/params-1.exp b/pengine/testcases/params-1.exp index 0c4126395f..3c8a1af1e9 100644 --- a/pengine/testcases/params-1.exp +++ b/pengine/testcases/params-1.exp @@ -1,317 +1,317 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/pengine/testcases/params-4.dot b/pengine/testcases/params-4.dot index df2a67cd45..fb361e593a 100644 --- a/pengine/testcases/params-4.dot +++ b/pengine/testcases/params-4.dot @@ -1,52 +1,52 @@ digraph "g" { -"Cancel rsc_c001n02_cancel_5000 c001n02" [ style=bold color="green" fontcolor="black" ] +"Cancel rsc_c001n02_monitor_5000 c001n02" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "DcIPaddr_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "DcIPaddr_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "DcIPaddr_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_monitor_5000 c001n02" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_reload_0 c001n02" -> "DcIPaddr_monitor_5000 c001n02" [ style = bold] "DcIPaddr_reload_0 c001n02" [ style=bold color="green" fontcolor="black" ] "DcIPaddr_stop_0 c001n02" -> "DcIPaddr_reload_0 c001n02" [ style = bold] "DcIPaddr_stop_0 c001n02" -> "all_stopped" [ style = bold] "DcIPaddr_stop_0 c001n02" [ style=bold color="green" fontcolor="orange" ] "all_stopped" [ style=bold color="green" fontcolor="orange" ] "probe_complete c001n01" -> "probe_complete" [ style = bold] "probe_complete c001n01" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n02" -> "probe_complete" [ style = bold] "probe_complete c001n02" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n03" -> "probe_complete" [ style = bold] "probe_complete c001n03" [ style=bold color="green" fontcolor="black" ] "probe_complete c001n08" -> "probe_complete" [ style = bold] "probe_complete c001n08" [ style=bold color="green" fontcolor="black" ] "probe_complete" -> "DcIPaddr_reload_0 c001n02" [ style = bold] "probe_complete" [ style=bold color="green" fontcolor="orange" ] "rsc_c001n01_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n01_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n01_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n01_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n01_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n01_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n02_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n02_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n02_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n02_monitor_6000 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n03_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n03_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n03_monitor_0 c001n08" -> "probe_complete c001n08" [ style = bold] "rsc_c001n03_monitor_0 c001n08" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_0 c001n01" -> "probe_complete c001n01" [ style = bold] "rsc_c001n08_monitor_0 c001n01" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_0 c001n02" -> "probe_complete c001n02" [ style = bold] "rsc_c001n08_monitor_0 c001n02" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_0 c001n03" -> "probe_complete c001n03" [ style = bold] "rsc_c001n08_monitor_0 c001n03" [ style=bold color="green" fontcolor="black" ] "rsc_c001n08_monitor_5000 c001n08" [ style=bold color="green" fontcolor="black" ] } diff --git a/pengine/testcases/params-4.exp b/pengine/testcases/params-4.exp index 47ec33c214..014600aa33 100644 --- a/pengine/testcases/params-4.exp +++ b/pengine/testcases/params-4.exp @@ -1,316 +1,316 @@ - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + diff --git a/pengine/testcases/rec-rsc-2.dot b/pengine/testcases/rec-rsc-2.dot index fee86780b8..a3ee82d470 100644 --- a/pengine/testcases/rec-rsc-2.dot +++ b/pengine/testcases/rec-rsc-2.dot @@ -1,17 +1,17 @@ digraph "g" { -"Cancel rsc1_cancel_1 node1" [ style=bold color="green" fontcolor="black" ] +"Cancel rsc1_monitor_1 node1" [ style=bold color="green" fontcolor="black" ] "all_stopped" [ style=bold color="green" fontcolor="orange" ] "probe_complete node1" -> "probe_complete" [ style = bold] "probe_complete node1" [ style=bold color="green" fontcolor="black" ] "probe_complete node2" -> "probe_complete" [ style = bold] "probe_complete node2" [ style=bold color="green" fontcolor="black" ] "probe_complete" -> "rsc1_start_0 node1" [ style = bold] "probe_complete" [ style=bold color="green" fontcolor="orange" ] "rsc1_monitor_0 node2" -> "probe_complete node2" [ style = bold] "rsc1_monitor_0 node2" [ style=bold color="green" fontcolor="black" ] "rsc1_start_0 node1" [ style=bold color="green" fontcolor="black" ] -"rsc1_stop_0 node1" -> "Cancel rsc1_cancel_1 node1" [ style = bold] +"rsc1_stop_0 node1" -> "Cancel rsc1_monitor_1 node1" [ style = bold] "rsc1_stop_0 node1" -> "all_stopped" [ style = bold] "rsc1_stop_0 node1" -> "rsc1_start_0 node1" [ style = bold] "rsc1_stop_0 node1" [ style=bold color="green" fontcolor="black" ] } diff --git a/pengine/testcases/rec-rsc-2.exp b/pengine/testcases/rec-rsc-2.exp index 425f030e2a..a6347b618b 100644 --- a/pengine/testcases/rec-rsc-2.exp +++ b/pengine/testcases/rec-rsc-2.exp @@ -1,97 +1,97 @@ - - - - - - - - - - + + + + + + + + + + - + - + - + - + - + - + - + - + - +