diff --git a/pengine/allocate.c b/pengine/allocate.c index 3c707ba2f6..6e238d769d 100644 --- a/pengine/allocate.c +++ b/pengine/allocate.c @@ -1,959 +1,974 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include void set_alloc_actions(pe_working_set_t *data_set); void migrate_reload_madness(pe_working_set_t *data_set); resource_alloc_functions_t resource_class_alloc_functions[] = { { native_merge_weights, native_color, native_create_actions, native_create_probe, native_internal_constraints, native_rsc_colocation_lh, native_rsc_colocation_rh, native_rsc_order_lh, native_rsc_order_rh, native_rsc_location, native_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, }, { group_merge_weights, group_color, group_create_actions, native_create_probe, group_internal_constraints, group_rsc_colocation_lh, group_rsc_colocation_rh, group_rsc_order_lh, group_rsc_order_rh, group_rsc_location, group_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, }, { native_merge_weights, clone_color, clone_create_actions, clone_create_probe, clone_internal_constraints, clone_rsc_colocation_lh, clone_rsc_colocation_rh, clone_rsc_order_lh, clone_rsc_order_rh, clone_rsc_location, clone_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, }, { native_merge_weights, master_color, master_create_actions, clone_create_probe, master_internal_constraints, clone_rsc_colocation_lh, master_rsc_colocation_rh, clone_rsc_order_lh, clone_rsc_order_rh, clone_rsc_location, clone_expand, complex_migrate_reload, complex_stonith_ordering, complex_create_notify_element, } }; static gboolean check_rsc_parameters(resource_t *rsc, node_t *node, xmlNode *rsc_entry, pe_working_set_t *data_set) { int attr_lpc = 0; gboolean force_restart = FALSE; gboolean delete_resource = FALSE; const char *value = NULL; const char *old_value = NULL; const char *attr_list[] = { XML_ATTR_TYPE, XML_AGENT_ATTR_CLASS, XML_AGENT_ATTR_PROVIDER }; for(; attr_lpc < DIMOF(attr_list); attr_lpc++) { value = crm_element_value(rsc->xml, attr_list[attr_lpc]); old_value = crm_element_value(rsc_entry, attr_list[attr_lpc]); if(value == old_value /* ie. NULL */ || crm_str_eq(value, old_value, TRUE)) { continue; } force_restart = TRUE; crm_notice("Forcing restart of %s on %s, %s changed: %s -> %s", rsc->id, node->details->uname, attr_list[attr_lpc], crm_str(old_value), crm_str(value)); } if(force_restart) { /* make sure the restart happens */ stop_action(rsc, node, FALSE); set_bit(rsc->flags, pe_rsc_start_pending); delete_resource = TRUE; } return delete_resource; } static gboolean check_action_definition(resource_t *rsc, node_t *active_node, xmlNode *xml_op, pe_working_set_t *data_set) { char *key = NULL; int interval = 0; const char *interval_s = NULL; gboolean did_change = FALSE; gboolean start_op = FALSE; xmlNode *params_all = NULL; xmlNode *params_restart = NULL; GHashTable *local_rsc_params = NULL; char *digest_all_calc = NULL; const char *digest_all = NULL; const char *restart_list = NULL; const char *digest_restart = NULL; char *digest_restart_calc = NULL; action_t *action = NULL; const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *op_version = crm_element_value(xml_op, XML_ATTR_CRM_VERSION); CRM_CHECK(active_node != NULL, return FALSE); interval_s = crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); /* we need to reconstruct the key because of the way we used to construct resource IDs */ key = generate_op_key(rsc->id, task, interval); if(interval > 0) { xmlNode *op_match = NULL; crm_debug_2("Checking parameters for %s", key); op_match = find_rsc_op_entry(rsc, key); if(op_match == NULL && data_set->stop_action_orphans) { /* create a cancel action */ action_t *cancel = NULL; char *cancel_key = crm_strdup(key); const char *call_id = crm_element_value(xml_op, XML_LRM_ATTR_CALLID); crm_info("Orphan action will be stopped: %s on %s", key, active_node->details->uname); /* cancel_key = generate_op_key( */ /* rsc->id, CRMD_ACTION_CANCEL, interval); */ cancel = custom_action( rsc, cancel_key, CRMD_ACTION_CANCEL, active_node, FALSE, TRUE, data_set); crm_free(cancel->task); cancel->task = crm_strdup(CRMD_ACTION_CANCEL); add_hash_param(cancel->meta, XML_LRM_ATTR_TASK, task); add_hash_param(cancel->meta, XML_LRM_ATTR_CALLID, call_id); add_hash_param(cancel->meta, XML_LRM_ATTR_INTERVAL, interval_s); custom_action_order( rsc, stop_key(rsc), NULL, rsc, NULL, cancel, pe_order_optional, data_set); crm_free(key); key = NULL; return TRUE; } else if(op_match == NULL) { crm_debug("Orphan action detected: %s on %s", key, active_node->details->uname); crm_free(key); key = NULL; return TRUE; } } action = custom_action(rsc, key, task, active_node, TRUE, FALSE, data_set); local_rsc_params = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes( rsc->xml, XML_TAG_ATTR_SETS, active_node->details->attrs, local_rsc_params, NULL, FALSE, data_set->now); params_all = create_xml_node(NULL, XML_TAG_PARAMS); g_hash_table_foreach(action->extra, hash2field, params_all); g_hash_table_foreach(rsc->parameters, hash2field, params_all); g_hash_table_foreach(action->meta, hash2metafield, params_all); g_hash_table_foreach(local_rsc_params, hash2field, params_all); filter_action_parameters(params_all, op_version); digest_all_calc = calculate_xml_digest(params_all, TRUE, FALSE); digest_all = crm_element_value(xml_op, XML_LRM_ATTR_OP_DIGEST); digest_restart = crm_element_value(xml_op, XML_LRM_ATTR_RESTART_DIGEST); restart_list = crm_element_value(xml_op, XML_LRM_ATTR_OP_RESTART); if(crm_str_eq(task, CRMD_ACTION_START, TRUE)) { start_op = TRUE; } if(start_op && digest_restart) { params_restart = copy_xml(params_all); if(restart_list) { filter_reload_parameters(params_restart, restart_list); } digest_restart_calc = calculate_xml_digest(params_restart, TRUE, FALSE); if(safe_str_neq(digest_restart_calc, digest_restart)) { did_change = TRUE; crm_log_xml_info(params_restart, "params:restart"); crm_warn("Parameters to %s on %s changed: recorded %s vs. %s (restart:%s) %s", key, active_node->details->uname, crm_str(digest_restart), digest_restart_calc, op_version, crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); key = generate_op_key(rsc->id, task, interval); custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); goto cleanup; } } if(safe_str_neq(digest_all_calc, digest_all)) { action_t *op = NULL; did_change = TRUE; crm_log_xml_info(params_all, "params:all"); crm_warn("Parameters to %s on %s changed: recorded %s vs. %s (all:%s) %s", key, active_node->details->uname, crm_str(digest_all), digest_all_calc, op_version, crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC)); key = generate_op_key(rsc->id, task, interval); op = custom_action(rsc, key, task, NULL, FALSE, TRUE, data_set); if(start_op && digest_restart) { op->allow_reload_conversion = TRUE; } else if(interval > 0) { custom_action_order(rsc, start_key(rsc), NULL, NULL, crm_strdup(op->task), op, pe_order_runnable_left, data_set); } } cleanup: free_xml(params_all); free_xml(params_restart); crm_free(digest_all_calc); crm_free(digest_restart_calc); g_hash_table_destroy(local_rsc_params); pe_free_action(action); return did_change; } extern gboolean DeleteRsc(resource_t *rsc, node_t *node, gboolean optional, pe_working_set_t *data_set); static void check_actions_for(xmlNode *rsc_entry, node_t *node, pe_working_set_t *data_set) { const char *id = NULL; const char *task = NULL; int interval = 0; const char *interval_s = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; const char *rsc_id = ID(rsc_entry); gboolean is_probe = FALSE; int start_index = 0, stop_index = 0; resource_t *rsc = pe_find_resource(data_set->resources, rsc_id); CRM_CHECK(rsc != NULL, return); CRM_CHECK(node != NULL, return); CRM_CHECK(rsc_id != NULL, return); if(is_set(rsc->flags, pe_rsc_orphan)) { crm_debug_2("Skipping param check for %s: orphan", rsc->id); return; } else if(pe_find_node_id(rsc->running_on, node->details->id) == NULL) { crm_debug_2("Skipping param check for %s: no longer active on %s", rsc->id, node->details->uname); return; } crm_debug_3("Processing %s on %s", rsc->id, node->details->uname); if(check_rsc_parameters(rsc, node, rsc_entry, data_set)) { DeleteRsc(rsc, node, FALSE, data_set); } xml_child_iter_filter( rsc_entry, rsc_op, XML_LRM_TAG_RSC_OP, op_list = g_list_append(op_list, rsc_op); ); sorted_op_list = g_list_sort(op_list, sort_op_by_callid); calculate_active_ops(sorted_op_list, &start_index, &stop_index); slist_iter( rsc_op, xmlNode, sorted_op_list, lpc, if(start_index < stop_index) { /* stopped */ continue; } else if(lpc < start_index) { /* action occurred prior to a start */ continue; } id = ID(rsc_op); is_probe = FALSE; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); interval_s = crm_element_value(rsc_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); if(interval == 0 && safe_str_eq(task, CRMD_ACTION_STATUS)) { is_probe = TRUE; } if(is_probe || safe_str_eq(task, CRMD_ACTION_START) || interval > 0) { check_action_definition(rsc, node, rsc_op, data_set); } ); g_list_free(sorted_op_list); } static void check_actions(pe_working_set_t *data_set) { const char *id = NULL; node_t *node = NULL; xmlNode *lrm_rscs = NULL; xmlNode *status = get_object_root(XML_CIB_TAG_STATUS, data_set->input); xml_child_iter_filter( status, node_state, XML_CIB_TAG_STATE, id = crm_element_value(node_state, XML_ATTR_ID); lrm_rscs = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); lrm_rscs = find_xml_node(lrm_rscs, XML_LRM_TAG_RESOURCES, FALSE); node = pe_find_node_id(data_set->nodes, id); if(node == NULL) { continue; } else if(can_run_resources(node) == FALSE) { crm_debug_2("Skipping param check for %s: cant run resources", node->details->uname); continue; } crm_debug_2("Processing node %s", node->details->uname); if(node->details->online || data_set->stonith_enabled) { xml_child_iter_filter( lrm_rscs, rsc_entry, XML_LRM_TAG_RESOURCE, if(xml_has_children(rsc_entry)) { check_actions_for(rsc_entry, node, data_set); } ); } ); } static gboolean apply_placement_constraints(pe_working_set_t *data_set) { crm_debug_3("Applying constraints..."); slist_iter( cons, rsc_to_node_t, data_set->placement_constraints, lpc, cons->rsc_lh->cmds->rsc_location(cons->rsc_lh, cons); ); return TRUE; } static void common_apply_stickiness(resource_t *rsc, node_t *node, pe_working_set_t *data_set) { int fail_count = 0; char *fail_attr = NULL; const char *value = NULL; + resource_t *failed = rsc; GHashTable *meta_hash = NULL; if(rsc->children) { slist_iter( child_rsc, resource_t, rsc->children, lpc, common_apply_stickiness(child_rsc, node, data_set); ); return; } - + meta_hash = g_hash_table_new_full( g_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); get_meta_attributes(meta_hash, rsc, node, data_set); /* update resource preferences that relate to the current node */ value = g_hash_table_lookup(meta_hash, "resource_stickiness"); if(value != NULL && safe_str_neq("default", value)) { rsc->stickiness = char2score(value); } else { rsc->stickiness = data_set->default_resource_stickiness; } value = g_hash_table_lookup(meta_hash, XML_RSC_ATTR_FAIL_STICKINESS); if(value != NULL && safe_str_neq("default", value)) { rsc->migration_threshold = char2score(value); } else { rsc->migration_threshold = data_set->default_migration_threshold; } /* process failure stickiness */ fail_attr = crm_concat("fail-count", rsc->id, '-'); value = g_hash_table_lookup(node->details->attrs, fail_attr); if(value != NULL) { fail_count = char2score(value); crm_info("%s has failed %d on %s", rsc->id, fail_count, node->details->uname); } crm_free(fail_attr); - - if(fail_count > 0 && rsc->migration_threshold != 0) { - resource_t *failed = rsc; - if(is_not_set(rsc->flags, pe_rsc_unique)) { - failed = uber_parent(rsc); - } + + if(is_not_set(rsc->flags, pe_rsc_unique)) { + failed = uber_parent(rsc); + } + fail_attr = crm_concat("last-failure", rsc->id, '-'); + value = g_hash_table_lookup(node->details->attrs, fail_attr); + if(value != NULL && rsc->failure_timeout) { + int last_failure = crm_parse_int(value, NULL); + if(last_failure > 0) { + time_t now = get_timet_now(data_set); + if(now > (last_failure + rsc->failure_timeout)) { + crm_notice("Failcount for %s on %s has expired (limit was %ds)", + failed->id, node->details->uname, rsc->failure_timeout); + fail_count = 0; + } + } + } + crm_free(fail_attr); + + if(fail_count > 0 && rsc->migration_threshold != 0) { if(rsc->migration_threshold <= fail_count) { resource_location(failed, node, -INFINITY, "__fail_limit__", data_set); crm_warn("Forcing %s away from %s after %d failures (max=%d)", failed->id, node->details->uname, fail_count, rsc->migration_threshold); } else { crm_notice("%s can fail %d more times on %s before being forced off", failed->id, rsc->migration_threshold - fail_count, node->details->uname); } } g_hash_table_destroy(meta_hash); } static void complex_set_cmds(resource_t *rsc) { rsc->cmds = &resource_class_alloc_functions[rsc->variant]; slist_iter( child_rsc, resource_t, rsc->children, lpc, complex_set_cmds(child_rsc); ); } void set_alloc_actions(pe_working_set_t *data_set) { slist_iter( rsc, resource_t, data_set->resources, lpc, complex_set_cmds(rsc); ); } gboolean stage0(pe_working_set_t *data_set) { xmlNode * cib_constraints = get_object_root( XML_CIB_TAG_CONSTRAINTS, data_set->input); if(data_set->input == NULL) { return FALSE; } cluster_status(data_set); set_alloc_actions(data_set); slist_iter(node, node_t, data_set->nodes, lpc, slist_iter( rsc, resource_t, data_set->resources, lpc2, common_apply_stickiness(rsc, node, data_set); ); ); unpack_constraints(cib_constraints, data_set); return TRUE; } /* * Check nodes for resources started outside of the LRM */ gboolean probe_resources(pe_working_set_t *data_set) { action_t *probe_complete = NULL; action_t *probe_node_complete = NULL; slist_iter( node, node_t, data_set->nodes, lpc, gboolean force_probe = FALSE; const char *probed = g_hash_table_lookup( node->details->attrs, CRM_OP_PROBED); crm_debug_2("%s probed: %s", node->details->uname, probed); if(node->details->online == FALSE) { continue; } else if(node->details->unclean) { continue; } else if(probe_complete == NULL) { probe_complete = get_pseudo_op(CRM_OP_PROBED, data_set); } if(probed != NULL && crm_is_true(probed) == FALSE) { force_probe = TRUE; } probe_node_complete = custom_action( NULL, crm_strdup(CRM_OP_PROBED), CRM_OP_PROBED, node, FALSE, TRUE, data_set); probe_node_complete->optional = crm_is_true(probed); probe_node_complete->priority = INFINITY; add_hash_param(probe_node_complete->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); custom_action_order(NULL, NULL, probe_node_complete, NULL, NULL, probe_complete, pe_order_optional, data_set); slist_iter( rsc, resource_t, data_set->resources, lpc2, if(rsc->cmds->create_probe( rsc, node, probe_node_complete, force_probe, data_set)) { probe_complete->optional = FALSE; probe_node_complete->optional = FALSE; custom_action_order( NULL, NULL, probe_complete, rsc, start_key(rsc), NULL, pe_order_optional, data_set); } ); ); return TRUE; } /* * Count how many valid nodes we have (so we know the maximum number of * colors we can resolve). * * Apply node constraints (ie. filter the "allowed_nodes" part of resources */ gboolean stage2(pe_working_set_t *data_set) { crm_debug_3("Applying placement constraints"); slist_iter( node, node_t, data_set->nodes, lpc, if(node == NULL) { /* error */ } else if(node->weight >= 0.0 /* global weight */ && node->details->online && node->details->type == node_member) { data_set->max_valid_nodes++; } ); apply_placement_constraints(data_set); return TRUE; } /* * Create internal resource constraints before allocation */ gboolean stage3(pe_working_set_t *data_set) { slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->internal_constraints(rsc, data_set); ); return TRUE; } /* * Check for orphaned or redefined actions */ gboolean stage4(pe_working_set_t *data_set) { check_actions(data_set); return TRUE; } gboolean stage5(pe_working_set_t *data_set) { /* Take (next) highest resource, assign it and create its actions */ slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->color(rsc, data_set); ); probe_resources(data_set); slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->create_actions(rsc, data_set); ); return TRUE; } static gboolean is_managed(const resource_t *rsc) { if(is_set(rsc->flags, pe_rsc_managed)) { return TRUE; } slist_iter( child_rsc, resource_t, rsc->children, lpc, if(is_managed(child_rsc)) { return TRUE; } ); return FALSE; } static gboolean any_managed_resouces(pe_working_set_t *data_set) { slist_iter( rsc, resource_t, data_set->resources, lpc, if(is_managed(rsc)) { return TRUE; } ); return FALSE; } /* * Create dependancies for stonith and shutdown operations */ gboolean stage6(pe_working_set_t *data_set) { action_t *dc_down = NULL; action_t *stonith_op = NULL; action_t *last_stonith = NULL; gboolean integrity_lost = FALSE; action_t *ready = get_pseudo_op(STONITH_UP, data_set); action_t *all_stopped = get_pseudo_op(ALL_STOPPED, data_set); gboolean need_stonith = FALSE; crm_debug_3("Processing fencing and shutdown cases"); if(data_set->stonith_enabled && (data_set->have_quorum || data_set->no_quorum_policy == no_quorum_ignore)) { need_stonith = TRUE; } if(need_stonith && any_managed_resouces(data_set) == FALSE) { crm_crit("Delaying fencing operations until there are resources to manage"); need_stonith = FALSE; } slist_iter( node, node_t, data_set->nodes, lpc, stonith_op = NULL; if(node->details->unclean && need_stonith) { pe_warn("Scheduling Node %s for STONITH", node->details->uname); stonith_op = custom_action( NULL, crm_strdup(CRM_OP_FENCE), CRM_OP_FENCE, node, FALSE, TRUE, data_set); add_hash_param( stonith_op->meta, XML_LRM_ATTR_TARGET, node->details->uname); add_hash_param( stonith_op->meta, XML_LRM_ATTR_TARGET_UUID, node->details->id); add_hash_param( stonith_op->meta, "stonith_action", data_set->stonith_action); stonith_constraints(node, stonith_op, data_set); order_actions(ready, stonith_op, pe_order_implies_left); order_actions(stonith_op, all_stopped, pe_order_implies_right); if(node->details->is_dc) { dc_down = stonith_op; } else { if(last_stonith) { order_actions(last_stonith, stonith_op, pe_order_implies_left); } last_stonith = stonith_op; } } else if(node->details->online && node->details->shutdown) { action_t *down_op = NULL; crm_info("Scheduling Node %s for shutdown", node->details->uname); down_op = custom_action( NULL, crm_strdup(CRM_OP_SHUTDOWN), CRM_OP_SHUTDOWN, node, FALSE, TRUE, data_set); shutdown_constraints(node, down_op, data_set); if(node->details->is_dc) { dc_down = down_op; } } if(node->details->unclean && stonith_op == NULL) { integrity_lost = TRUE; pe_warn("Node %s is unclean!", node->details->uname); } ); if(integrity_lost) { if(data_set->have_quorum == FALSE) { crm_notice("Cannot fence unclean nodes until quorum is" " attained (or no_quorum_policy is set to ignore)"); } else if(data_set->stonith_enabled == FALSE) { pe_warn("YOUR RESOURCES ARE NOW LIKELY COMPROMISED"); pe_err("ENABLE STONITH TO KEEP YOUR RESOURCES SAFE"); } } if(dc_down != NULL) { GListPtr shutdown_matches = find_actions( data_set->actions, CRM_OP_SHUTDOWN, NULL); crm_debug_2("Ordering shutdowns before %s on %s (DC)", dc_down->task, dc_down->node->details->uname); add_hash_param(dc_down->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); slist_iter( node_stop, action_t, shutdown_matches, lpc, if(node_stop->node->details->is_dc) { continue; } crm_debug("Ordering shutdown on %s before %s on %s", node_stop->node->details->uname, dc_down->task, dc_down->node->details->uname); order_actions(node_stop, dc_down, pe_order_implies_left); ); if(last_stonith && dc_down != last_stonith) { order_actions(last_stonith, dc_down, pe_order_implies_left); } g_list_free(shutdown_matches); } return TRUE; } /* * Determin the sets of independant actions and the correct order for the * actions in each set. * * Mark dependencies of un-runnable actions un-runnable * */ gboolean stage7(pe_working_set_t *data_set) { crm_debug_4("Applying ordering constraints"); slist_iter( order, order_constraint_t, data_set->ordering_constraints, lpc, resource_t *rsc = order->lh_rsc; crm_debug_2("Applying ordering constraint: %d", order->id); if(rsc != NULL) { crm_debug_4("rsc_action-to-*"); rsc->cmds->rsc_order_lh(rsc, order, data_set); continue; } rsc = order->rh_rsc; if(rsc != NULL) { crm_debug_4("action-to-rsc_action"); rsc->cmds->rsc_order_rh(order->lh_action, rsc, order); } else { crm_debug_4("action-to-action"); order_actions( order->lh_action, order->rh_action, order->type); } ); update_action_states(data_set->actions); slist_iter( rsc, resource_t, data_set->resources, lpc, rsc->cmds->migrate_reload(rsc, data_set); ); return TRUE; } int transition_id = -1; /* * Create a dependency graph to send to the transitioner (via the CRMd) */ gboolean stage8(pe_working_set_t *data_set) { const char *value = NULL; char *transition_id_s = NULL; transition_id++; transition_id_s = crm_itoa(transition_id); crm_debug_2("Creating transition graph %d.", transition_id); data_set->graph = create_xml_node(NULL, XML_TAG_GRAPH); value = pe_pref(data_set->config_hash, "cluster-delay"); crm_xml_add(data_set->graph, "cluster-delay", value); crm_xml_add(data_set->graph, "failed-stop-offset", "INFINITY"); value = pe_pref(data_set->config_hash, "start-failure-is-fatal"); if(crm_is_true(value)) { crm_xml_add(data_set->graph, "failed-start-offset", "INFINITY"); } else { crm_xml_add(data_set->graph, "failed-start-offset", "1"); } value = pe_pref(data_set->config_hash, "batch-limit"); crm_xml_add(data_set->graph, "batch-limit", value); crm_xml_add(data_set->graph, "transition_id", transition_id_s); crm_free(transition_id_s); /* errors... slist_iter(action, action_t, action_list, lpc, if(action->optional == FALSE && action->runnable == FALSE) { print_action("Ignoring", action, TRUE); } ); */ slist_iter( rsc, resource_t, data_set->resources, lpc, crm_debug_4("processing actions for rsc=%s", rsc->id); rsc->cmds->expand(rsc, data_set); ); crm_log_xml_debug_3( data_set->graph, "created resource-driven action list"); /* catch any non-resource specific actions */ crm_debug_4("processing non-resource actions"); slist_iter( action, action_t, data_set->actions, lpc, graph_element_from_action(action, data_set); ); crm_log_xml_debug_3(data_set->graph, "created generic action list"); crm_debug_2("Created transition graph %d.", transition_id); return TRUE; } void cleanup_alloc_calculations(pe_working_set_t *data_set) { if(data_set == NULL) { return; } crm_debug_3("deleting order cons: %p", data_set->ordering_constraints); pe_free_ordering(data_set->ordering_constraints); data_set->ordering_constraints = NULL; crm_debug_3("deleting node cons: %p", data_set->placement_constraints); pe_free_rsc_to_node(data_set->placement_constraints); data_set->placement_constraints = NULL; crm_debug_3("deleting inter-resource cons: %p", data_set->colocation_constraints); pe_free_shallow(data_set->colocation_constraints); data_set->colocation_constraints = NULL; cleanup_calculations(data_set); } diff --git a/transitioner/events.c b/transitioner/events.c index 3fda03d509..0242bda5cf 100644 --- a/transitioner/events.c +++ b/transitioner/events.c @@ -1,569 +1,582 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #include #include char *failed_stop_offset = NULL; char *failed_start_offset = NULL; xmlNode *need_abort(xmlNode *update); void process_graph_event(xmlNode *event, const char *event_node); int match_graph_event(int action_id, xmlNode *event, const char *event_node, int op_status, int op_rc); xmlNode * need_abort(xmlNode *update) { xmlNode *section_xml = NULL; const char *section = NULL; if(update == NULL) { return NULL; } xml_prop_iter(update, name, value, if(safe_str_eq(name, XML_ATTR_HAVE_QUORUM)) { goto do_abort; /* possibly not required */ } else if(safe_str_eq(name, XML_ATTR_NUMPEERS)) { goto do_abort; } else if(safe_str_eq(name, XML_ATTR_GENERATION)) { goto do_abort; } else if(safe_str_eq(name, XML_ATTR_GENERATION_ADMIN)) { goto do_abort; } continue; do_abort: crm_debug("Aborting on change to %s", name); crm_log_xml_debug(update, "Abort: CIB Attrs"); return update; ); section = XML_CIB_TAG_NODES; section_xml = get_object_root(section, update); xml_child_iter(section_xml, child, return section_xml; ); section = XML_CIB_TAG_RESOURCES; section_xml = get_object_root(section, update); xml_child_iter(section_xml, child, return section_xml; ); section = XML_CIB_TAG_CONSTRAINTS; section_xml = get_object_root(section, update); xml_child_iter(section_xml, child, return section_xml; ); section = XML_CIB_TAG_CRMCONFIG; section_xml = get_object_root(section, update); xml_child_iter(section_xml, child, return section_xml; ); return NULL; } static gboolean fail_incompletable_actions(crm_graph_t *graph, const char *down_node) { const char *target = NULL; xmlNode *last_action = NULL; slist_iter( synapse, synapse_t, graph->synapses, lpc, if (synapse->confirmed) { continue; } slist_iter( action, crm_action_t, synapse->actions, lpc, if(action->type == action_type_pseudo || action->confirmed) { continue; } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); if(safe_str_eq(target, down_node)) { action->failed = TRUE; last_action = action->xml; update_graph(graph, action); crm_notice("Action %d (%s) is scheduled for %s (offline)", action->id, ID(action->xml), down_node); } ); ); if(last_action != NULL) { crm_warn("Node %s shutdown resulted in un-runnable actions", down_node); abort_transition(INFINITY, tg_restart, "Node failure", last_action); return TRUE; } return FALSE; } gboolean extract_event(xmlNode *msg) { int shutdown = 0; const char *shutdown_s = NULL; const char *event_node = NULL; /* [cib fragment] ... */ crm_debug_4("Extracting event from %s", crm_element_name(msg)); xml_child_iter_filter( msg, node_state, XML_CIB_TAG_STATE, xmlNode *attrs = NULL; xmlNode *resources = NULL; const char *ccm_state = crm_element_value( node_state, XML_CIB_ATTR_INCCM); const char *crmd_state = crm_element_value( node_state, XML_CIB_ATTR_CRMDSTATE); /* Transient node attribute changes... */ event_node = crm_element_value(node_state, XML_ATTR_ID); crm_debug_2("Processing state update from %s", event_node); crm_log_xml_debug_3(node_state, "Processing"); attrs = find_xml_node( node_state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); if(attrs != NULL) { crm_info("Aborting on "XML_TAG_TRANSIENT_NODEATTRS" changes for %s", event_node); abort_transition(INFINITY, tg_restart, XML_TAG_TRANSIENT_NODEATTRS, attrs); } resources = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); resources = find_xml_node( resources, XML_LRM_TAG_RESOURCES, FALSE); /* LRM resource update... */ xml_child_iter( resources, rsc, xml_child_iter( rsc, rsc_op, crm_log_xml_debug_3(rsc_op, "Processing resource update"); process_graph_event(rsc_op, event_node); ); ); /* * node state update... possibly from a shutdown we requested */ if(safe_str_eq(ccm_state, XML_BOOLEAN_FALSE) || safe_str_eq(crmd_state, CRMD_JOINSTATE_DOWN)) { crm_action_t *shutdown = NULL; shutdown = match_down_event(0, event_node, NULL); if(shutdown != NULL) { update_graph(transition_graph, shutdown); trigger_graph(); } else { crm_info("Stonith/shutdown of %s not matched", event_node); abort_transition(INFINITY, tg_restart, "Node failure", node_state); } fail_incompletable_actions(transition_graph, event_node); } shutdown_s = crm_element_value(node_state, XML_CIB_ATTR_SHUTDOWN); if(shutdown_s) { shutdown = crm_parse_int(shutdown_s, NULL); } if(shutdown_s && shutdown > 0) { crm_info("Aborting on "XML_CIB_ATTR_SHUTDOWN" attribute for %s", event_node); abort_transition(INFINITY, tg_restart, "Shutdown request", node_state); } ); return TRUE; } static void update_failcount(xmlNode *event, const char *event_node, int rc) { int interval = 0; char *task = NULL; char *rsc_id = NULL; char *attr_name = NULL; const char *id = ID(event); const char *on_uuid = event_node; const char *value = NULL; if(rc == 99) { /* this is an internal code for "we're busy, try again" */ return; } if(failed_stop_offset == NULL) { failed_stop_offset = crm_strdup(INFINITY_S); } if(failed_start_offset == NULL) { failed_start_offset = crm_strdup(INFINITY_S); } CRM_CHECK(on_uuid != NULL, return); CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval), crm_err("Couldn't parse: %s", ID(event)); goto bail); CRM_CHECK(task != NULL, goto bail); CRM_CHECK(rsc_id != NULL, goto bail); if(safe_str_eq(task, CRMD_ACTION_START)) { interval = 1; value = failed_start_offset; } else if(safe_str_eq(task, CRMD_ACTION_STOP)) { interval = 1; value = failed_stop_offset; } if(value == NULL || safe_str_neq(value, INFINITY_S)) { value = XML_NVPAIR_ATTR_VALUE"++"; } if(interval > 0) { int call_id = 0; + char *now = crm_itoa(time(NULL)); + attr_name = crm_concat("fail-count", rsc_id, '-'); - crm_warn("Updating failcount for %s on %s after failed %s: rc=%d (update=%s)", - rsc_id, on_uuid, task, rc, value); + crm_warn("Updating failcount for %s on %s after failed %s:" + " rc=%d (update=%s, time=%s)", rsc_id, on_uuid, task, rc, value, now); + /* don't let notificatios of these updates cause new transitions */ call_id = update_attr(te_cib_conn, cib_inhibit_notify, XML_CIB_TAG_STATUS, - on_uuid, NULL,NULL, attr_name, value, FALSE); + on_uuid, NULL,NULL, attr_name, value, FALSE); + + add_cib_op_callback(call_id, FALSE, NULL, cib_failcount_updated); + crm_free(attr_name); + + attr_name = crm_concat("last-failure", rsc_id, '-'); /* don't let notificatios of these updates cause new transitions */ + call_id = update_attr(te_cib_conn, cib_inhibit_notify, XML_CIB_TAG_STATUS, + on_uuid, NULL,NULL, attr_name, value, FALSE); + add_cib_op_callback(call_id, FALSE, NULL, cib_failcount_updated); crm_free(attr_name); + + crm_free(now); } bail: crm_free(rsc_id); crm_free(task); } static int status_from_rc(crm_action_t *action, int orig_status, int rc) { int target_rc = 0; int status = orig_status; const char *target_rc_s = g_hash_table_lookup( action->params, crm_meta_name(XML_ATTR_TE_TARGET_RC)); if(target_rc_s != NULL) { crm_debug_2("Target rc: %s vs. %d", target_rc_s, rc); target_rc = crm_parse_int(target_rc_s, NULL); } if(target_rc == rc) { crm_debug_2("Target rc: == %d", rc); if(status != LRM_OP_DONE) { crm_debug_2("Re-mapping op status to" " LRM_OP_DONE for rc=%d", rc); status = LRM_OP_DONE; } } else { crm_debug_2("Target rc: != %d", rc); if(status != LRM_OP_ERROR) { crm_info("Re-mapping op status to" " LRM_OP_ERROR for rc=%d", rc); status = LRM_OP_ERROR; } } /* 99 is the code we use for direct nack's */ if(rc != 99 && status != LRM_OP_DONE) { const char *task, *uname; task = crm_element_value(action->xml, XML_LRM_ATTR_TASK); uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); crm_warn("Action %d (%s) on %s failed (target: %s vs. rc: %d): %s", action->id, task, uname, crm_str(target_rc_s), rc, op_status2text(status)); } return status; } /* * returns the ID of the action if a match is found * returns -1 if a match was not found * returns -2 if a match was found but the action failed (and was * not allowed to) */ int match_graph_event(int action_id, xmlNode *event, const char *event_node, int op_status, int op_rc) { const char *target = NULL; const char *allow_fail = NULL; const char *this_event = ID(event); crm_action_t *action = NULL; action = get_action(action_id, FALSE); if(action == NULL) { return -1; } op_status = status_from_rc(action, op_status, op_rc); if(op_status != LRM_OP_DONE) { update_failcount(event, event_node, op_rc); } /* Process OP status */ switch(op_status) { case LRM_OP_PENDING: crm_debug("Ignoring pending operation"); return action->id; break; case LRM_OP_DONE: break; case LRM_OP_ERROR: case LRM_OP_TIMEOUT: case LRM_OP_NOTSUPPORTED: action->failed = TRUE; break; case LRM_OP_CANCELLED: /* do nothing?? */ crm_err("Dont know what to do for cancelled ops yet"); break; default: action->failed = TRUE; crm_err("Unsupported action result: %d", op_status); } /* stop this event's timer if it had one */ stop_te_timer(action->timer); action->confirmed = TRUE; update_graph(transition_graph, action); trigger_graph(); if(action->failed) { allow_fail = g_hash_table_lookup( action->params, crm_meta_name(XML_ATTR_TE_ALLOWFAIL)); if(crm_is_true(allow_fail)) { action->failed = FALSE; } } if(action->failed) { abort_transition(action->synapse->priority+1, tg_restart, "Event failed", event); } target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET); te_log_action(LOG_INFO, "Action %s (%d) confirmed on %s (rc=%d)", crm_str(this_event), action->id, crm_str(target), op_status); return action->id; } crm_action_t * get_action(int id, gboolean confirmed) { slist_iter( synapse, synapse_t, transition_graph->synapses, lpc, slist_iter( action, crm_action_t, synapse->actions, lpc2, if(action->id == id) { if(confirmed) { stop_te_timer(action->timer); action->confirmed = TRUE; } return action; } ) ); return NULL; } crm_action_t * match_down_event(int id, const char *target, const char *filter) { const char *this_action = NULL; const char *this_node = NULL; crm_action_t *match = NULL; slist_iter( synapse, synapse_t, transition_graph->synapses, lpc, /* lookup event */ slist_iter( action, crm_action_t, synapse->actions, lpc2, if(id > 0 && action->id == id) { match = action; break; } this_action = crm_element_value( action->xml, XML_LRM_ATTR_TASK); if(action->type != action_type_crm) { continue; } else if(safe_str_eq(this_action, CRM_OP_LRM_REFRESH)){ continue; } else if(filter != NULL && safe_str_neq(this_action, filter)) { continue; } this_node = crm_element_value( action->xml, XML_LRM_ATTR_TARGET_UUID); if(this_node == NULL) { crm_log_xml_err(action->xml, "No node uuid"); } if(safe_str_neq(this_node, target)) { crm_debug("Action %d : Node mismatch: %s", action->id, this_node); continue; } match = action; break; ); if(match != NULL) { /* stop this event's timer if it had one */ break; } ); if(match != NULL) { /* stop this event's timer if it had one */ crm_debug("Match found for action %d: %s on %s", id, crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target); stop_te_timer(match->timer); match->confirmed = TRUE; } else if(id > 0) { crm_err("No match for action %d", id); } else { crm_warn("No match for shutdown action on %s", target); } return match; } void process_graph_event(xmlNode *event, const char *event_node) { int rc = -1; int status = -1; int action = -1; int transition_num = -1; char *update_te_uuid = NULL; gboolean passed = FALSE; const char *id = NULL; const char *magic = NULL; CRM_ASSERT(event != NULL); id = ID(event); magic = crm_element_value(event, XML_ATTR_TRANSITION_MAGIC); if(magic == NULL) { /* non-change */ return; } CRM_CHECK(decode_transition_magic( magic, &update_te_uuid, &transition_num, &action, &status, &rc), crm_err("Invalid event %s detected", id); abort_transition(INFINITY, tg_restart,"Bad event", event); ); if(status == LRM_OP_PENDING) { goto bail; } if(transition_num == -1) { crm_err("Action %s (%s) initiated outside of a transition", id, magic); abort_transition(INFINITY, tg_restart,"Unexpected event",event); } else if(action < 0 || safe_str_neq(update_te_uuid, te_uuid)) { crm_info("Action %s (%s) initiated by a different transitioner", id, magic); abort_transition(INFINITY, tg_restart,"Foreign event", event); } else if(transition_graph->id != transition_num) { crm_info("Detected action %s from a different transition:" " %d vs. %d", id, transition_num, transition_graph->id); abort_transition(INFINITY, tg_restart,"Old event", event); } else if(transition_graph->complete) { crm_info("Action %s arrived after a completed transition", id); abort_transition(INFINITY, tg_restart, "Inactive graph", event); } else if(match_graph_event( action, event, event_node, status, rc) < 0) { crm_err("Unknown graph action %s", id); abort_transition(INFINITY, tg_restart, "Unknown event", event); } else { passed = TRUE; crm_debug_2("Processed update to %s: %s", id, magic); } if(passed == FALSE && rc != EXECRA_OK) { update_failcount(event, event_node, rc); } bail: crm_free(update_te_uuid); return; }