diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index de757fdce0..0e0ac96e10 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -1,1499 +1,1503 @@ /* * Copyright 2009-2018 Andrew Beekhof * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include /* U32T ~ PRIu32, X32T ~ PRIx32 */ #include #include #include #include #include #include #include #include #include #include #include #include #include char *stonith_our_uname = NULL; char *stonith_our_uuid = NULL; long stonith_watchdog_timeout_ms = 0; static GMainLoop *mainloop = NULL; gboolean stand_alone = FALSE; static gboolean no_cib_connect = FALSE; static gboolean stonith_shutdown_flag = FALSE; static qb_ipcs_service_t *ipcs = NULL; static xmlNode *local_cib = NULL; GHashTable *known_peer_names = NULL; static cib_t *cib_api = NULL; static void *cib_library = NULL; static void stonith_shutdown(int nsig); static void stonith_cleanup(void); static int32_t st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { if (stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", crm_ipcs_client_pid(c)); return -EPERM; } if (crm_client_new(c, uid, gid) == NULL) { return -EIO; } return 0; } static void st_ipc_created(qb_ipcs_connection_t * c) { crm_trace("Connection created for %p", c); } /* Exit code means? */ static int32_t st_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; int call_options = 0; xmlNode *request = NULL; crm_client_t *c = crm_client_get(qbc); const char *op = NULL; if (c == NULL) { crm_info("Invalid client: %p", qbc); return 0; } request = crm_ipcs_recv(c, data, size, &id, &flags); if (request == NULL) { crm_ipcs_send_ack(c, id, flags, "nack", __FUNCTION__, __LINE__); return 0; } op = crm_element_value(request, F_CRM_TASK); if(safe_str_eq(op, CRM_OP_RM_NODE_CACHE)) { crm_xml_add(request, F_TYPE, T_STONITH_NG); crm_xml_add(request, F_STONITH_OPERATION, op); crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, crm_client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); send_cluster_message(NULL, crm_msg_stonith_ng, request, FALSE); free_xml(request); return 0; } if (c->name == NULL) { const char *value = crm_element_value(request, F_STONITH_CLIENTNAME); if (value == NULL) { value = "unknown"; } c->name = crm_strdup_printf("%s.%u", value, c->pid); } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_trace("Flags %" X32T "/%u for command %" U32T " from %s", flags, call_options, id, crm_client_name(c)); if (is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(flags & crm_ipc_client_response); CRM_LOG_ASSERT(c->request_id == 0); /* This means the client has two synchronous events in-flight */ c->request_id = id; /* Reply only to the last one */ } crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, crm_client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); crm_log_xml_trace(request, "Client[inbound]"); stonith_command(c, id, flags, request, NULL); free_xml(request); return 0; } /* Error code means? */ static int32_t st_ipc_closed(qb_ipcs_connection_t * c) { crm_client_t *client = crm_client_get(c); if (client == NULL) { return 0; } crm_trace("Connection %p closed", c); crm_client_destroy(client); /* 0 means: yes, go ahead and destroy the connection */ return 0; } static void st_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p destroyed", c); st_ipc_closed(c); } static void stonith_peer_callback(xmlNode * msg, void *private_data) { const char *remote_peer = crm_element_value(msg, F_ORIG); const char *op = crm_element_value(msg, F_STONITH_OPERATION); if (crm_str_eq(op, "poke", TRUE)) { return; } crm_log_xml_trace(msg, "Peer[inbound]"); stonith_command(NULL, 0, 0, msg, remote_peer); } #if SUPPORT_COROSYNC static void stonith_peer_ais_callback(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { uint32_t kind = 0; xmlNode *xml = NULL; const char *from = NULL; char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); if(data == NULL) { return; } if (kind == crm_class_cluster) { xml = string2xml(data); if (xml == NULL) { crm_err("Invalid XML: '%.120s'", data); free(data); return; } crm_xml_add(xml, F_ORIG, from); /* crm_xml_add_int(xml, F_SEQ, wrapper->id); */ stonith_peer_callback(xml, NULL); } free_xml(xml); free(data); return; } static void stonith_peer_cs_destroy(gpointer user_data) { crm_crit("Lost connection to cluster layer, shutting down"); stonith_shutdown(0); } #endif void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer) { /* send callback to originating child */ crm_client_t *client_obj = NULL; int local_rc = pcmk_ok; crm_trace("Sending response"); client_obj = crm_client_get_by_id(client_id); crm_trace("Sending callback to request originator"); if (client_obj == NULL) { local_rc = -1; crm_trace("No client to sent the response to. F_STONITH_CLIENTID not set."); } else { int rid = 0; if (sync_reply) { CRM_LOG_ASSERT(client_obj->request_id); rid = client_obj->request_id; client_obj->request_id = 0; crm_trace("Sending response %d to %s %s", rid, client_obj->name, from_peer ? "(originator of delegated request)" : ""); } else { crm_trace("Sending an event to %s %s", client_obj->name, from_peer ? "(originator of delegated request)" : ""); } local_rc = crm_ipcs_send(client_obj, rid, notify_src, sync_reply?crm_ipc_flags_none:crm_ipc_server_event); } if (local_rc < pcmk_ok && client_obj != NULL) { crm_warn("%sSync reply to %s failed: %s", sync_reply ? "" : "A-", client_obj ? client_obj->name : "", pcmk_strerror(local_rc)); } } long long get_stonith_flag(const char *name) { if (safe_str_eq(name, T_STONITH_NOTIFY_FENCE)) { return st_callback_notify_fence; } else if (safe_str_eq(name, STONITH_OP_DEVICE_ADD)) { return st_callback_device_add; } else if (safe_str_eq(name, STONITH_OP_DEVICE_DEL)) { return st_callback_device_del; } else if (safe_str_eq(name, T_STONITH_NOTIFY_HISTORY)) { return st_callback_notify_history; } return st_callback_unknown; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { xmlNode *update_msg = user_data; crm_client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, F_SUBTYPE); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if (client->ipcs == NULL) { crm_trace("Skipping client with NULL channel"); return; } if (client->options & get_stonith_flag(type)) { int rc = crm_ipcs_send(client, 0, update_msg, crm_ipc_server_event | crm_ipc_server_error); if (rc <= 0) { crm_warn("%s notification of client %s.%.6s failed: %s (%d)", type, crm_client_name(client), client->id, pcmk_strerror(rc), rc); } else { crm_trace("Sent %s notification to client %s.%.6s", type, crm_client_name(client), client->id); } } } void do_stonith_async_timeout_update(const char *client_id, const char *call_id, int timeout) { crm_client_t *client = NULL; xmlNode *notify_data = NULL; if (!timeout || !call_id || !client_id) { return; } client = crm_client_get_by_id(client_id); if (!client) { return; } notify_data = create_xml_node(NULL, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_TYPE, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_STONITH_CALLID, call_id); crm_xml_add_int(notify_data, F_STONITH_TIMEOUT, timeout); crm_trace("timeout update is %d for client %s and call id %s", timeout, client_id, call_id); if (client) { crm_ipcs_send(client, 0, notify_data, crm_ipc_server_event); } free_xml(notify_data); } void do_stonith_notify(int options, const char *type, int result, xmlNode * data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); CRM_CHECK(type != NULL,;); crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(update_msg, F_SUBTYPE, type); crm_xml_add(update_msg, F_STONITH_OPERATION, type); crm_xml_add_int(update_msg, F_STONITH_RC, result); if (data != NULL) { add_message_xml(update_msg, F_STONITH_CALLDATA, data); } crm_trace("Notifying clients"); g_hash_table_foreach(client_connections, stonith_notify_client, update_msg); free_xml(update_msg); crm_trace("Notify complete"); } static void do_stonith_notify_config(int options, const char *op, int rc, const char *desc, int active) { xmlNode *notify_data = create_xml_node(NULL, op); CRM_CHECK(notify_data != NULL, return); crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); do_stonith_notify(options, op, rc, notify_data); free_xml(notify_data); } void do_stonith_notify_device(int options, const char *op, int rc, const char *desc) { do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(device_list)); } void do_stonith_notify_level(int options, const char *op, int rc, const char *desc) { do_stonith_notify_config(options, op, rc, desc, g_hash_table_size(topology)); } static void topology_remove_helper(const char *node, int level) { int rc; char *desc = NULL; xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); crm_xml_add(data, F_STONITH_ORIGIN, __FUNCTION__); crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); rc = stonith_level_remove(data, &desc); do_stonith_notify_level(0, STONITH_OP_LEVEL_DEL, rc, desc); free_xml(data); free(desc); } static void remove_cib_device(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if(match != NULL) { standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); } if (safe_str_neq(standard, PCMK_RESOURCE_CLASS_STONITH)) { continue; } rsc_id = crm_element_value(match, XML_ATTR_ID); stonith_device_remove(rsc_id, TRUE); } } static void handle_topology_change(xmlNode *match, bool remove) { int rc; char *desc = NULL; CRM_CHECK(match != NULL, return); crm_trace("Updating %s", ID(match)); if(remove) { int index = 0; char *key = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); topology_remove_helper(key, index); free(key); } rc = stonith_level_register(match, &desc); do_stonith_notify_level(0, STONITH_OP_LEVEL_ADD, rc, desc); free(desc); } static void remove_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if (match && crm_element_value(match, XML_DIFF_MARKER)) { /* Deletion */ int index = 0; char *target = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); if (target == NULL) { crm_err("Invalid fencing target in element %s", ID(match)); } else if (index <= 0) { crm_err("Invalid level for %s in element %s", target, ID(match)); } else { topology_remove_helper(target, index); } /* } else { Deal with modifications during the 'addition' stage */ } } } static void register_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); handle_topology_change(match, TRUE); } } /* Fencing */ static void fencing_topology_init() { xmlXPathObjectPtr xpathObj = NULL; const char *xpath = "//" XML_TAG_FENCING_LEVEL; crm_trace("Full topology refresh"); free_topology_list(); init_topology_list(); /* Grab everything */ xpathObj = xpath_search(local_cib, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } #define rsc_name(x) x->clone_name?x->clone_name:x->id /*! * \internal * \brief Check whether our uname is in a resource's allowed node list * * \param[in] rsc Resource to check * * \return Pointer to node object if found, NULL otherwise */ static node_t * our_node_allowed_for(resource_t *rsc) { GHashTableIter iter; node_t *node = NULL; if (rsc && stonith_our_uname) { g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { if (node && strcmp(node->details->uname, stonith_our_uname) == 0) { break; } node = NULL; } } return node; } /*! * \internal * \brief If a resource or any of its children are STONITH devices, update their * definitions given a cluster working set. * * \param[in] rsc Resource to check * \param[in] data_set Cluster working set with device information */ static void cib_device_update(resource_t *rsc, pe_working_set_t *data_set) { node_t *node = NULL; const char *value = NULL; const char *rclass = NULL; node_t *parent = NULL; gboolean remove = TRUE; /* If this is a complex resource, check children rather than this resource itself. * TODO: Mark each installed device and remove if untouched when this process finishes. */ if(rsc->children) { GListPtr gIter = NULL; for (gIter = rsc->children; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, data_set); if(pe_rsc_is_clone(rsc)) { crm_trace("Only processing one copy of the clone %s", rsc->id); break; } } return; } /* We only care about STONITH resources. */ rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (safe_str_neq(rclass, PCMK_RESOURCE_CLASS_STONITH)) { return; } /* If this STONITH resource is disabled, just remove it. */ value = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_TARGET_ROLE); if (safe_str_eq(value, RSC_STOPPED)) { crm_info("Device %s has been disabled", rsc->id); goto update_done; } /* Check whether our node is allowed for this resource (and its parent if in a group) */ node = our_node_allowed_for(rsc); if (rsc->parent && (rsc->parent->variant == pe_group)) { parent = our_node_allowed_for(rsc->parent); } if(node == NULL) { /* Our node is disallowed, so remove the device */ GHashTableIter iter; crm_info("Device %s has been disabled on %s: unknown", rsc->id, stonith_our_uname); g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { crm_trace("Available: %s = %d", node->details->uname, node->weight); } goto update_done; } else if(node->weight < 0 || (parent && parent->weight < 0)) { /* Our node (or its group) is disallowed by score, so remove the device */ char *score = score2char((node->weight < 0) ? node->weight : parent->weight); crm_info("Device %s has been disabled on %s: score=%s", rsc->id, stonith_our_uname, score); free(score); goto update_done; } else { /* Our node is allowed, so update the device information */ xmlNode *data; GHashTableIter gIter; stonith_key_value_t *params = NULL; const char *name = NULL; const char *agent = crm_element_value(rsc->xml, XML_EXPR_ATTR_TYPE); const char *rsc_provides = NULL; crm_debug("Device %s is allowed on %s: score=%d", rsc->id, stonith_our_uname, node->weight); get_rsc_attributes(rsc->parameters, rsc, node, data_set); get_meta_attributes(rsc->meta, rsc, node, data_set); rsc_provides = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_PROVIDES); g_hash_table_iter_init(&gIter, rsc->parameters); while (g_hash_table_iter_next(&gIter, (gpointer *) & name, (gpointer *) & value)) { if (!name || !value) { continue; } params = stonith_key_value_add(params, name, value); crm_trace(" %s=%s", name, value); } remove = FALSE; data = create_device_registration_xml(rsc_name(rsc), st_namespace_any, agent, params, rsc_provides); stonith_device_register(data, NULL, TRUE); stonith_key_value_freeall(params, 1, 1); free_xml(data); } update_done: if(remove && g_hash_table_lookup(device_list, rsc_name(rsc))) { stonith_device_remove(rsc_name(rsc), TRUE); } } extern xmlNode *do_calculations(pe_working_set_t * data_set, xmlNode * xml_input, crm_time_t * now); /*! * \internal * \brief Update all STONITH device definitions based on current CIB */ static void cib_devices_update(void) { GListPtr gIter = NULL; pe_working_set_t data_set; crm_info("Updating devices to version %s.%s.%s", crm_element_value(local_cib, XML_ATTR_GENERATION_ADMIN), crm_element_value(local_cib, XML_ATTR_GENERATION), crm_element_value(local_cib, XML_ATTR_NUMUPDATES)); set_working_set_defaults(&data_set); data_set.input = local_cib; data_set.now = crm_time_new(NULL); data_set.flags |= pe_flag_quick_location; data_set.localhost = stonith_our_uname; cluster_status(&data_set); do_calculations(&data_set, NULL, NULL); for (gIter = data_set.resources; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, &data_set); } data_set.input = NULL; /* Wasn't a copy */ cleanup_alloc_calculations(&data_set); } static void update_cib_stonith_devices_v2(const char *event, xmlNode * msg) { xmlNode *change = NULL; char *reason = NULL; bool needs_update = FALSE; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); for (change = __xml_first_child(patchset); change != NULL; change = __xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); const char *shortpath = NULL; if(op == NULL || strcmp(op, "move") == 0) { continue; } else if(safe_str_eq(op, "delete") && strstr(xpath, XML_CIB_TAG_RESOURCE)) { const char *rsc_id = NULL; char *search = NULL; char *mutable = NULL; if (strstr(xpath, XML_TAG_ATTR_SETS)) { needs_update = TRUE; break; } mutable = strdup(xpath); rsc_id = strstr(mutable, "primitive[@id=\'"); if (rsc_id != NULL) { rsc_id += strlen("primitive[@id=\'"); search = strchr(rsc_id, '\''); } if (search != NULL) { *search = 0; stonith_device_remove(rsc_id, TRUE); } else { crm_warn("Ignoring malformed CIB update (resource deletion)"); } free(mutable); } else if(strstr(xpath, XML_CIB_TAG_RESOURCES)) { shortpath = strrchr(xpath, '/'); CRM_ASSERT(shortpath); reason = crm_strdup_printf("%s %s", op, shortpath+1); needs_update = TRUE; break; } else if(strstr(xpath, XML_CIB_TAG_CONSTRAINTS)) { shortpath = strrchr(xpath, '/'); CRM_ASSERT(shortpath); reason = crm_strdup_printf("%s %s", op, shortpath+1); needs_update = TRUE; break; } } if(needs_update) { crm_info("Updating device list from the cib: %s", reason); cib_devices_update(); } else { crm_trace("No updates for device list found in cib"); } free(reason); } static void update_cib_stonith_devices_v1(const char *event, xmlNode * msg) { const char *reason = "none"; gboolean needs_update = FALSE; xmlXPathObjectPtr xpath_obj = NULL; /* process new constraints */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_CONS_TAG_RSC_LOCATION); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; /* Safest and simplest to always recompute */ needs_update = TRUE; reason = "new location constraint"; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpath_obj, lpc); crm_log_xml_trace(match, "new constraint"); } } freeXpathObject(xpath_obj); /* process deletions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { remove_cib_device(xpath_obj); } freeXpathObject(xpath_obj); /* process additions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpath_obj, lpc); rsc_id = crm_element_value(match, XML_ATTR_ID); standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); if (safe_str_neq(standard, PCMK_RESOURCE_CLASS_STONITH)) { continue; } crm_trace("Fencing resource %s was added or modified", rsc_id); reason = "new resource"; needs_update = TRUE; } } freeXpathObject(xpath_obj); if(needs_update) { crm_info("Updating device list from the cib: %s", reason); cib_devices_update(); } } static void update_cib_stonith_devices(const char *event, xmlNode * msg) { int format = 1; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); switch(format) { case 1: update_cib_stonith_devices_v1(event, msg); break; case 2: update_cib_stonith_devices_v2(event, msg); break; default: crm_warn("Unknown patch format: %d", format); } } /* Needs to hold node name + attribute name + attribute value + 75 */ #define XPATH_MAX 512 /*! * \internal * \brief Check whether a node has a specific attribute name/value * * \param[in] node Name of node to check * \param[in] name Name of an attribute to look for * \param[in] value The value the named attribute needs to be set to in order to be considered a match * * \return TRUE if the locally cached CIB has the specified node attribute */ gboolean node_has_attr(const char *node, const char *name, const char *value) { char xpath[XPATH_MAX]; xmlNode *match; int n; CRM_CHECK(local_cib != NULL, return FALSE); /* Search for the node's attributes in the CIB. While the schema allows * multiple sets of instance attributes, and allows instance attributes to * use id-ref to reference values elsewhere, that is intended for resources, * so we ignore that here. */ n = snprintf(xpath, XPATH_MAX, "//" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE "[@uname='%s']/" XML_TAG_ATTR_SETS "/" XML_CIB_TAG_NVPAIR "[@name='%s' and @value='%s']", node, name, value); match = get_xpath_object(xpath, local_cib, LOG_TRACE); CRM_CHECK(n < XPATH_MAX, return FALSE); return (match != NULL); } static void update_fencing_topology(const char *event, xmlNode * msg) { int format = 1; const char *xpath; xmlXPathObjectPtr xpathObj = NULL; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); if(format == 1) { /* Process deletions (only) */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); remove_fencing_topology(xpathObj); freeXpathObject(xpathObj); /* Process additions and changes */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } else if(format == 2) { xmlNode *change = NULL; int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; xml_patch_versions(patchset, add, del); for (change = __xml_first_child(patchset); change != NULL; change = __xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); if(op == NULL) { continue; } else if(strstr(xpath, "/" XML_TAG_FENCING_LEVEL) != NULL) { /* Change to a specific entry */ crm_trace("Handling %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); if(strcmp(op, "move") == 0) { continue; } else if(strcmp(op, "create") == 0) { handle_topology_change(change->children, FALSE); } else if(strcmp(op, "modify") == 0) { xmlNode *match = first_named_child(change, XML_DIFF_RESULT); if(match) { handle_topology_change(match->children, TRUE); } } else if(strcmp(op, "delete") == 0) { /* Nuclear option, all we have is the path and an id... not enough to remove a specific entry */ crm_info("Re-initializing fencing topology after %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } } else if (strstr(xpath, "/" XML_TAG_FENCING_TOPOLOGY) != NULL) { /* Change to the topology in general */ crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } else if (strstr(xpath, "/" XML_CIB_TAG_CONFIGURATION)) { /* Changes to the whole config section, possibly including the topology as a whild */ if(first_named_child(change, XML_TAG_FENCING_TOPOLOGY) == NULL) { crm_trace("Nothing for us in %s operation %d.%d.%d for %s.", op, add[0], add[1], add[2], xpath); } else if(strcmp(op, "delete") == 0 || strcmp(op, "create") == 0) { crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s.", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } } else { crm_trace("Nothing for us in %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); } } } else { crm_warn("Unknown patch format: %d", format); } } static bool have_cib_devices = FALSE; static void update_cib_cache_cb(const char *event, xmlNode * msg) { int rc = pcmk_ok; xmlNode *stonith_enabled_xml = NULL; xmlNode *stonith_watchdog_xml = NULL; const char *stonith_enabled_s = NULL; static gboolean stonith_enabled_saved = TRUE; if(!have_cib_devices) { crm_trace("Skipping updates until we get a full dump"); return; } else if(msg == NULL) { crm_trace("Missing %s update", event); return; } /* Maintain a local copy of the CIB so that we have full access * to device definitions, location constraints, and node attributes */ if (local_cib != NULL) { int rc = pcmk_ok; xmlNode *patchset = NULL; crm_element_value_int(msg, F_CIB_RC, &rc); if (rc != pcmk_ok) { return; } patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); xml_log_patchset(LOG_TRACE, "Config update", patchset); rc = xml_apply_patchset(local_cib, patchset, TRUE); switch (rc) { case pcmk_ok: case -pcmk_err_old_data: break; case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_notice("[%s] Patch aborted: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(local_cib); local_cib = NULL; break; default: crm_warn("[%s] ABORTED: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(local_cib); local_cib = NULL; } } if (local_cib == NULL) { crm_trace("Re-requesting the full cib"); rc = cib_api->cmds->query(cib_api, NULL, &local_cib, cib_scope_local | cib_sync_call); if(rc != pcmk_ok) { crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc); return; } CRM_ASSERT(local_cib != NULL); stonith_enabled_saved = FALSE; /* Trigger a full refresh below */ } stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", local_cib, LOG_TRACE); if (stonith_enabled_xml) { stonith_enabled_s = crm_element_value(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE); } if (stonith_enabled_s == NULL || crm_is_true(stonith_enabled_s)) { long timeout_ms = 0; const char *value = NULL; stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", local_cib, LOG_TRACE); if (stonith_watchdog_xml) { value = crm_element_value(stonith_watchdog_xml, XML_NVPAIR_ATTR_VALUE); } if(value) { timeout_ms = crm_get_msec(value); } if (timeout_ms < 0) { timeout_ms = crm_auto_watchdog_timeout(); } if(timeout_ms != stonith_watchdog_timeout_ms) { crm_notice("New watchdog timeout %lds (was %lds)", timeout_ms/1000, stonith_watchdog_timeout_ms/1000); stonith_watchdog_timeout_ms = timeout_ms; } } else { stonith_watchdog_timeout_ms = 0; } if (stonith_enabled_s && crm_is_true(stonith_enabled_s) == FALSE) { crm_trace("Ignoring cib updates while stonith is disabled"); stonith_enabled_saved = FALSE; return; } else if (stonith_enabled_saved == FALSE) { crm_info("Updating stonith device and topology lists now that stonith is enabled"); stonith_enabled_saved = TRUE; fencing_topology_init(); cib_devices_update(); } else { update_fencing_topology(event, msg); update_cib_stonith_devices(event, msg); } } static void init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { crm_info("Updating device list from the cib: init"); have_cib_devices = TRUE; local_cib = copy_xml(output); fencing_topology_init(); cib_devices_update(); } static void stonith_shutdown(int nsig) { stonith_shutdown_flag = TRUE; crm_info("Terminating with %d clients", crm_hash_table_size(client_connections)); if (mainloop != NULL && g_main_is_running(mainloop)) { g_main_loop_quit(mainloop); } else { stonith_cleanup(); crm_exit(CRM_EX_OK); } } static void cib_connection_destroy(gpointer user_data) { if (stonith_shutdown_flag) { crm_info("Connection to the CIB manager closed"); return; } else { crm_crit("Lost connection to the CIB manager, shutting down"); } if (cib_api) { cib_api->cmds->signoff(cib_api); } stonith_shutdown(0); } static void stonith_cleanup(void) { if (cib_api) { cib_api->cmds->signoff(cib_api); } if (ipcs) { qb_ipcs_destroy(ipcs); } if (known_peer_names != NULL) { g_hash_table_destroy(known_peer_names); known_peer_names = NULL; } crm_peer_destroy(); crm_client_cleanup(); free_stonith_remote_op_list(); free_topology_list(); free_device_list(); free_metadata_cache(); free(stonith_our_uname); stonith_our_uname = NULL; free_xml(local_cib); local_cib = NULL; } /* *INDENT-OFF* */ static struct crm_option long_options[] = { {"stand-alone", 0, 0, 's'}, {"stand-alone-w-cpg", 0, 0, 'c'}, {"logfile", 1, 0, 'l'}, {"verbose", 0, 0, 'V'}, {"version", 0, 0, '$'}, {"help", 0, 0, '?'}, {0, 0, 0, 0} }; /* *INDENT-ON* */ static void setup_cib(void) { int rc, retries = 0; static cib_t *(*cib_new_fn) (void) = NULL; if (cib_new_fn == NULL) { cib_new_fn = find_library_function(&cib_library, CIB_LIBRARY, "cib_new", TRUE); } if (cib_new_fn != NULL) { cib_api = (*cib_new_fn) (); } if (cib_api == NULL) { crm_err("No connection to the CIB manager"); return; } do { sleep(retries); rc = cib_api->cmds->signon(cib_api, CRM_SYSTEM_STONITHD, cib_command); } while (rc == -ENOTCONN && ++retries < 5); if (rc != pcmk_ok) { crm_err("Could not connect to the CIB manager: %s (%d)", pcmk_strerror(rc), rc); } else if (pcmk_ok != cib_api->cmds->add_notify_callback(cib_api, T_CIB_DIFF_NOTIFY, update_cib_cache_cb)) { crm_err("Could not set CIB notification callback"); } else { rc = cib_api->cmds->query(cib_api, NULL, NULL, cib_scope_local); cib_api->cmds->register_callback(cib_api, rc, 120, FALSE, NULL, "init_cib_cache_cb", init_cib_cache_cb); cib_api->cmds->set_connection_dnotify(cib_api, cib_connection_destroy); crm_info("Watching for stonith topology changes"); } } struct qb_ipcs_service_handlers ipc_callbacks = { .connection_accept = st_ipc_accept, .connection_created = st_ipc_created, .msg_process = st_ipc_dispatch, .connection_closed = st_ipc_closed, .connection_destroyed = st_ipc_destroy }; /*! * \internal * \brief Callback for peer status changes * * \param[in] type What changed * \param[in] node What peer had the change * \param[in] data Previous value of what changed */ static void st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { if ((type != crm_status_processes) && !is_set(node->flags, crm_remote_node)) { xmlNode *query = NULL; if (node->id && node->uname) { g_hash_table_insert(known_peer_names, GUINT_TO_POINTER(node->id), strdup(node->uname)); } /* * This is a hack until we can send to a nodeid and/or we fix node name lookups * These messages are ignored in stonith_peer_callback() */ query = create_xml_node(NULL, "stonith_command"); crm_xml_add(query, F_XML_TAGNAME, "stonith_command"); crm_xml_add(query, F_TYPE, T_STONITH_NG); crm_xml_add(query, F_STONITH_OPERATION, "poke"); crm_debug("Broadcasting our uname because of node %u", node->id); send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); free_xml(query); } } int main(int argc, char **argv) { int flag; int lpc = 0; int argerr = 0; int option_index = 0; crm_cluster_t cluster; const char *actions[] = { "reboot", "off", "on", "list", "monitor", "status" }; crm_log_preinit(NULL, argc, argv); crm_set_options(NULL, "mode [options]", long_options, "Provides a summary of cluster's current state." "\n\nOutputs varying levels of detail in a number of different formats.\n"); while (1) { flag = crm_get_option(argc, argv, &option_index); if (flag == -1) { break; } switch (flag) { case 'V': crm_bump_log_level(argc, argv); break; case 'l': crm_add_logfile(optarg); break; case 's': stand_alone = TRUE; break; case 'c': stand_alone = FALSE; no_cib_connect = TRUE; break; case '$': case '?': crm_help(flag, CRM_EX_OK); break; default: ++argerr; break; } } if (argc - optind == 1 && safe_str_eq("metadata", argv[optind])) { printf("\n"); printf("\n"); printf(" 1.0\n"); printf(" Instance attributes available for all \"stonith\"-class resources" " and used by Pacemaker's fence daemon, formerly known as stonithd\n"); printf(" Instance attributes available for all \"stonith\"-class resources\n"); printf(" \n"); +#if 0 + // priority is not implemented yet printf(" \n"); - printf - (" The priority of the stonith resource. Devices are tried in order of highest priority to lowest.\n"); + printf(" Devices that are not in a topology " + "are tried in order of highest to lowest integer priority\n"); printf(" \n"); printf(" \n"); +#endif printf(" \n", STONITH_ATTR_HOSTARG); printf (" Advanced use only: An alternate parameter to supply instead of 'port'\n"); printf (" Some devices do not support the standard 'port' parameter or may provide additional ones.\n" "Use this to specify an alternate, device-specific, parameter that should indicate the machine to be fenced.\n" "A value of 'none' can be used to tell the cluster not to supply any additional parameters.\n" " \n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTMAP); printf (" A mapping of host names to ports numbers for devices that do not support host names.\n"); printf (" Eg. node1:1;node2:2,3 would tell the cluster to use port 1 for node1 and ports 2 and 3 for node2\n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTLIST); printf (" A list of machines controlled by this device (Optional unless %s=static-list).\n", STONITH_ATTR_HOSTCHECK); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_HOSTCHECK); printf (" How to determine which machines are controlled by the device.\n"); - printf - (" Allowed values: dynamic-list (query the device), static-list (check the %s attribute), none (assume every device can fence every machine)\n", - STONITH_ATTR_HOSTLIST); + printf(" Allowed values: dynamic-list " + "(query the device via the 'list' command), static-list " + "(check the " STONITH_ATTR_HOSTLIST " attribute), status " + "(query the device via the 'status' command), none (assume " + "every device can fence every machine)\n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_DELAY_MAX); printf (" Enable a random delay for stonith actions and specify the maximum of random delay.\n"); printf (" This prevents double fencing when using slow devices such as sbd.\n" "Use this to enable a random delay for stonith actions.\n" "The overall delay is derived from this random delay value adding a static delay so that the sum is kept below the maximum delay.\n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_DELAY_BASE); printf (" Enable a base delay for stonith actions and specify base delay value.\n"); printf (" This prevents double fencing when different delays are configured on the nodes.\n" "Use this to enable a static delay for stonith actions.\n" "The overall delay is derived from a random delay value adding this static delay so that the sum is kept below the maximum delay.\n"); printf(" \n"); printf(" \n"); printf(" \n", STONITH_ATTR_ACTION_LIMIT); printf (" The maximum number of actions can be performed in parallel on this device\n"); printf (" Cluster property concurrent-fencing=true needs to be configured first.\n" "Then use this to specify the maximum number of actions can be performed in parallel on this device. -1 is unlimited.\n"); printf(" \n"); printf(" \n"); for (lpc = 0; lpc < DIMOF(actions); lpc++) { printf(" \n", actions[lpc]); printf (" Advanced use only: An alternate command to run instead of '%s'\n", actions[lpc]); printf (" Some devices do not support the standard commands or may provide additional ones.\n" "Use this to specify an alternate, device-specific, command that implements the '%s' action.\n", actions[lpc]); printf(" \n", actions[lpc]); printf(" \n"); printf(" \n", actions[lpc]); printf (" Advanced use only: Specify an alternate timeout to use for %s actions instead of stonith-timeout\n", actions[lpc]); printf (" Some devices need much more/less time to complete than normal.\n" "Use this to specify an alternate, device-specific, timeout for '%s' actions.\n", actions[lpc]); printf(" \n"); printf(" \n"); printf(" \n", actions[lpc]); printf (" Advanced use only: The maximum number of times to retry the '%s' command within the timeout period\n", actions[lpc]); printf(" Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries '%s' actions before giving up." "\n", actions[lpc]); printf(" \n"); printf(" \n"); } printf(" \n"); printf("\n"); return CRM_EX_OK; } if (optind != argc) { ++argerr; } if (argerr) { crm_help('?', CRM_EX_USAGE); } crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); mainloop_add_signal(SIGTERM, stonith_shutdown); crm_peer_init(); known_peer_names = g_hash_table_new_full(g_direct_hash, g_direct_equal, NULL, free); if (stand_alone == FALSE) { if (is_corosync_cluster()) { #if SUPPORT_COROSYNC cluster.destroy = stonith_peer_cs_destroy; cluster.cpg.cpg_deliver_fn = stonith_peer_ais_callback; cluster.cpg.cpg_confchg_fn = pcmk_cpg_membership; #endif } crm_set_status_callback(&st_peer_update_callback); if (crm_cluster_connect(&cluster) == FALSE) { crm_crit("Cannot sign in to the cluster... terminating"); crm_exit(CRM_EX_FATAL); } stonith_our_uname = cluster.uname; stonith_our_uuid = cluster.uuid; if (no_cib_connect == FALSE) { setup_cib(); } } else { stonith_our_uname = strdup("localhost"); } init_device_list(); init_topology_list(); if(stonith_watchdog_timeout_ms > 0) { xmlNode *xml; stonith_key_value_t *params = NULL; params = stonith_key_value_add(params, STONITH_ATTR_HOSTLIST, stonith_our_uname); xml = create_device_registration_xml("watchdog", st_namespace_internal, STONITH_WATCHDOG_AGENT, params, NULL); stonith_device_register(xml, NULL, FALSE); stonith_key_value_freeall(params, 1, 1); free_xml(xml); } stonith_ipc_server_init(&ipcs, &ipc_callbacks); /* Create the mainloop and run it... */ mainloop = g_main_loop_new(NULL, FALSE); crm_info("Starting %s mainloop", crm_system_name); g_main_loop_run(mainloop); stonith_cleanup(); - crm_info("Done"); return crm_exit(CRM_EX_OK); } diff --git a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt index 69a6b4d20c..e7ba332fd0 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Stonith.txt @@ -1,940 +1,942 @@ :compat-mode: legacy = STONITH = //// We prefer [[ch-stonith]], but older versions of asciidoc don't deal well with that construct for chapter headings //// anchor:ch-stonith[Chapter 13, STONITH] indexterm:[STONITH, Configuration] == What Is STONITH? == STONITH (an acronym for "Shoot The Other Node In The Head"), also called 'fencing', protects your data from being corrupted by rogue nodes or concurrent access. Just because a node is unresponsive, this doesn't mean it isn't accessing your data. The only way to be 100% sure that your data is safe, is to use STONITH so we can be certain that the node is truly offline, before allowing the data to be accessed from another node. STONITH also has a role to play in the event that a clustered service cannot be stopped. In this case, the cluster uses STONITH to force the whole node offline, thereby making it safe to start the service elsewhere. == What STONITH Device Should You Use? == It is crucial that the STONITH device can allow the cluster to differentiate between a node failure and a network one. The biggest mistake people make in choosing a STONITH device is to use a remote power switch (such as many on-board IPMI controllers) that shares power with the node it controls. In such cases, the cluster cannot be sure if the node is really offline, or active and suffering from a network fault. Likewise, any device that relies on the machine being active (such as SSH-based "devices" used during testing) are inappropriate. == Special Treatment of STONITH Resources == STONITH resources are somewhat special in Pacemaker. STONITH may be initiated by pacemaker or by other parts of the cluster (such as resources like DRBD or DLM). To accommodate this, pacemaker does not require the STONITH resource to be in the 'started' state in order to be used, thus allowing reliable use of STONITH devices in such a case. All nodes have access to STONITH devices' definitions and instantiate them on-the-fly when needed, but preference is given to 'verified' instances, which are the ones that are 'started' according to the cluster's knowledge. In the case of a cluster split, the partition with a verified instance will have a slight advantage, because the STONITH daemon in the other partition will have to hear from all its current peers before choosing a node to perform the fencing. Fencing resources do work the same as regular resources in some respects: * +target-role+ can be used to enable or disable the resource * Location constraints can be used to prevent a specific node from using the resource [IMPORTANT] =========== Currently there is a limitation that fencing resources may only have one set of meta-attributes and one set of instance attributes. This can be revisited if it becomes a significant limitation for people. =========== See the table below or run `man pacemaker-fenced` to see special instance attributes that may be set for any fencing resource, regardless of fence agent. .Additional Properties of Fencing Resources [width="95%",cols="5m,2,3,<10",options="header",align="center"] |========================================================= |Field |Type |Default |Description |stonith-timeout |NA |NA a|Older versions used this to override the default period to wait for a STONITH (reboot, on, off) action to complete for this device. It has been replaced by the +pcmk_reboot_timeout+ and +pcmk_off_timeout+ properties. indexterm:[stonith-timeout,Fencing] indexterm:[Fencing,Property,stonith-timeout] //// + (not yet implemented) priority integer 0 The priority of the STONITH resource. Devices are tried in order of highest priority to lowest. - indexterm:[priority,Fencing] - indexterm:[Fencing,Property,priority] + indexterm priority,Fencing + indexterm Fencing,Property,priority //// |provides |string | |Any special capability provided by the fence device. Currently, only one such capability is meaningful: +unfencing+ (see <>). - indexterm:[priority,Fencing] - indexterm:[Fencing,Property,priority] + indexterm:[provides,Fencing] + indexterm:[Fencing,Property,provides] |pcmk_host_map |string | |A mapping of host names to ports numbers for devices that do not support host names. Example: +node1:1;node2:2,3+ tells the cluster to use port 1 for *node1* and ports 2 and 3 for *node2*. indexterm:[pcmk_host_map,Fencing] indexterm:[Fencing,Property,pcmk_host_map] |pcmk_host_list |string | |A list of machines controlled by this device (optional unless +pcmk_host_check+ is +static-list+). indexterm:[pcmk_host_list,Fencing] indexterm:[Fencing,Property,pcmk_host_list] |pcmk_host_check |string |dynamic-list a|How to determine which machines are controlled by the device. Allowed values: -* +dynamic-list:+ query the device +* +dynamic-list:+ query the device via the "list" command * +static-list:+ check the +pcmk_host_list+ attribute +* +status:+ query the device via the "status" command * +none:+ assume every device can fence every machine indexterm:[pcmk_host_check,Fencing] indexterm:[Fencing,Property,pcmk_host_check] |pcmk_delay_max |time |0s |Enable a random delay of up to the time specified before executing stonith actions. This is sometimes used in two-node clusters to ensure that the nodes don't fence each other at the same time. The overall delay introduced by pacemaker is derived from this random delay value adding a static delay so that the sum is kept below the maximum delay. indexterm:[pcmk_delay_max,Fencing] indexterm:[Fencing,Property,pcmk_delay_max] |pcmk_delay_base |time |0s |Enable a static delay before executing stonith actions. This can be used e.g. in two-node clusters to ensure that the nodes don't fence each other, by having separate fencing resources with different values. The node that is fenced with the shorter delay will lose a fencing race. The overall delay introduced by pacemaker is derived from this value plus a random delay such that the sum is kept below the maximum delay. indexterm:[pcmk_delay_base,Fencing] indexterm:[Fencing,Property,pcmk_delay_base] |pcmk_action_limit |integer |1 |The maximum number of actions that can be performed in parallel on this device, if the cluster option +concurrent-fencing+ is +true+. -1 is unlimited. indexterm:[pcmk_action_limit,Fencing] indexterm:[Fencing,Property,pcmk_action_limit] |pcmk_host_argument |string |port |'Advanced use only.' Which parameter should be supplied to the resource agent to identify the node to be fenced. Some devices do not support the standard +port+ parameter or may provide additional ones. Use this to specify an alternate, device-specific parameter. A value of +none+ tells the cluster not to supply any additional parameters. indexterm:[pcmk_host_argument,Fencing] indexterm:[Fencing,Property,pcmk_host_argument] |pcmk_reboot_action |string |reboot |'Advanced use only.' The command to send to the resource agent in order to reboot a node. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_reboot_action,Fencing] indexterm:[Fencing,Property,pcmk_reboot_action] |pcmk_reboot_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `reboot` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_reboot_timeout,Fencing] indexterm:[Fencing,Property,pcmk_reboot_timeout] indexterm:[stonith-timeout,Fencing] indexterm:[Fencing,Property,stonith-timeout] |pcmk_reboot_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `reboot` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_reboot_retries,Fencing] indexterm:[Fencing,Property,pcmk_reboot_retries] |pcmk_off_action |string |off |'Advanced use only.' The command to send to the resource agent in order to shut down a node. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_off_action,Fencing] indexterm:[Fencing,Property,pcmk_off_action] |pcmk_off_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `off` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_off_timeout,Fencing] indexterm:[Fencing,Property,pcmk_off_timeout] indexterm:[stonith-timeout,Fencing] indexterm:[Fencing,Property,stonith-timeout] |pcmk_off_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `off` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_off_retries,Fencing] indexterm:[Fencing,Property,pcmk_off_retries] |pcmk_list_action |string |list |'Advanced use only.' The command to send to the resource agent in order to list nodes. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_list_action,Fencing] indexterm:[Fencing,Property,pcmk_list_action] |pcmk_list_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `list` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_list_timeout,Fencing] indexterm:[Fencing,Property,pcmk_list_timeout] |pcmk_list_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `list` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_list_retries,Fencing] indexterm:[Fencing,Property,pcmk_list_retries] |pcmk_monitor_action |string |monitor |'Advanced use only.' The command to send to the resource agent in order to report extended status. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_monitor_action,Fencing] indexterm:[Fencing,Property,pcmk_monitor_action] |pcmk_monitor_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `monitor` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_monitor_timeout,Fencing] indexterm:[Fencing,Property,pcmk_monitor_timeout] |pcmk_monitor_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `monitor` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_monitor_retries,Fencing] indexterm:[Fencing,Property,pcmk_monitor_retries] |pcmk_status_action |string |status |'Advanced use only.' The command to send to the resource agent in order to report status. Some devices do not support the standard commands or may provide additional ones. Use this to specify an alternate, device-specific command. indexterm:[pcmk_status_action,Fencing] indexterm:[Fencing,Property,pcmk_status_action] |pcmk_status_timeout |time |60s |'Advanced use only.' Specify an alternate timeout to use for `status` actions instead of the value of +stonith-timeout+. Some devices need much more or less time to complete than normal. Use this to specify an alternate, device-specific timeout. indexterm:[pcmk_status_timeout,Fencing] indexterm:[Fencing,Property,pcmk_status_timeout] |pcmk_status_retries |integer |2 |'Advanced use only.' The maximum number of times to retry the `status` command within the timeout period. Some devices do not support multiple connections, and operations may fail if the device is busy with another task, so Pacemaker will automatically retry the operation, if there is time remaining. Use this option to alter the number of times Pacemaker retries before giving up. indexterm:[pcmk_status_retries,Fencing] indexterm:[Fencing,Property,pcmk_status_retries] |========================================================= [[s-unfencing]] == Unfencing == Most fence devices cut the power to the target. By contrast, fence devices that perform 'fabric fencing' cut off a node's access to some critical resource, such as a shared disk or a network switch. With fabric fencing, it is expected that the cluster will fence the node, and then a system administrator must manually investigate what went wrong, correct any issues found, then reboot (or restart the cluster services on) the node. Once the node reboots and rejoins the cluster, some fabric fencing devices require that an explicit command to restore the node's access to the critical resource. This capability is called 'unfencing' and is typically implemented as the fence agent's +on+ command. If any cluster resource has +requires+ set to +unfencing+, then that resource will not be probed or started on a node until that node has been unfenced. == Configuring STONITH == [NOTE] =========== Higher-level configuration shells include functionality to simplify the process below, particularly the step for deciding which parameters are required. However since this document deals only with core components, you should refer to the STONITH chapter of the http://www.clusterlabs.org/doc/[Clusters from Scratch] guide for those details. =========== . Find the correct driver: + ---- # stonith_admin --list-installed ---- . Find the required parameters associated with the device (replacing $AGENT_NAME with the name obtained from the previous step): + ---- # stonith_admin --metadata --agent $AGENT_NAME ---- . Create a file called +stonith.xml+ containing a primitive resource with a class of +stonith+, a type equal to the agent name obtained earlier, and a parameter for each of the values returned in the previous step. . If the device does not know how to fence nodes based on their uname, you may also need to set the special +pcmk_host_map+ parameter. See `man pacemaker-fenced` for details. . If the device does not support the `list` command, you may also need to set the special +pcmk_host_list+ and/or +pcmk_host_check+ parameters. See `man pacemaker-fenced` for details. . If the device does not expect the victim to be specified with the `port` parameter, you may also need to set the special +pcmk_host_argument+ parameter. See `man pacemaker-fenced` for details. . Upload it into the CIB using cibadmin: + ---- # cibadmin -C -o resources --xml-file stonith.xml ---- . Set +stonith-enabled+ to true: + ---- # crm_attribute -t crm_config -n stonith-enabled -v true ---- . Once the stonith resource is running, you can test it by executing the following (although you might want to stop the cluster on that machine first): + ---- # stonith_admin --reboot nodename ---- === Example STONITH Configuration === Assume we have an chassis containing four nodes and an IPMI device active on 192.0.2.1. We would choose the `fence_ipmilan` driver, and obtain the following list of parameters: .Obtaining a list of STONITH Parameters ==== ---- # stonith_admin --metadata -a fence_ipmilan ---- [source,XML] ---- ---- ==== Based on that, we would create a STONITH resource fragment that might look like this: .An IPMI-based STONITH Resource ==== [source,XML] ---- ---- ==== Finally, we need to enable STONITH: ---- # crm_attribute -t crm_config -n stonith-enabled -v true ---- == Advanced STONITH Configurations == Some people consider that having one fencing device is a single point of failure footnote:[Not true, since a node or resource must fail before fencing even has a chance to]; others prefer removing the node from the storage and network instead of turning it off. Whatever the reason, Pacemaker supports fencing nodes with multiple devices through a feature called 'fencing topologies'. Simply create the individual devices as you normally would, then define one or more +fencing-level+ entries in the +fencing-topology+ section of the configuration. * Each fencing level is attempted in order of ascending +index+. Allowed values are 1 through 9. * If a device fails, processing terminates for the current level. No further devices in that level are exercised, and the next level is attempted instead. * If the operation succeeds for all the listed devices in a level, the level is deemed to have passed. * The operation is finished when a level has passed (success), or all levels have been attempted (failed). * If the operation failed, the next step is determined by the scheduler and/or the controller. Some possible uses of topologies include: * Try poison-pill and fail back to power * Try disk and network, and fall back to power if either fails * Initiate a kdump and then poweroff the node .Properties of Fencing Levels [width="95%",cols="1m,<3",options="header",align="center"] |========================================================= |Field |Description |id |A unique name for the level indexterm:[id,fencing-level] indexterm:[Fencing,fencing-level,id] |target |The name of a single node to which this level applies indexterm:[target,fencing-level] indexterm:[Fencing,fencing-level,target] |target-pattern |An extended regular expression (as defined in http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04[POSIX]) matching the names of nodes to which this level applies indexterm:[target-pattern,fencing-level] indexterm:[Fencing,fencing-level,target-pattern] |target-attribute |The name of a node attribute that is set (to +target-value+) for nodes to which this level applies indexterm:[target-attribute,fencing-level] indexterm:[Fencing,fencing-level,target-attribute] |target-value |The node attribute value (of +target-attribute+) that is set for nodes to which this level applies indexterm:[target-attribute,fencing-level] indexterm:[Fencing,fencing-level,target-attribute] |index |The order in which to attempt the levels. Levels are attempted in ascending order 'until one succeeds'. Valid values are 1 through 9. indexterm:[index,fencing-level] indexterm:[Fencing,fencing-level,index] |devices |A comma-separated list of devices that must all be tried for this level indexterm:[devices,fencing-level] indexterm:[Fencing,fencing-level,devices] |========================================================= .Fencing topology with different devices for different nodes ==== [source,XML] ---- ... ... ---- ==== === Example Dual-Layer, Dual-Device Fencing Topologies === The following example illustrates an advanced use of +fencing-topology+ in a cluster with the following properties: * 3 nodes (2 active prod-mysql nodes, 1 prod_mysql-rep in standby for quorum purposes) * the active nodes have an IPMI-controlled power board reached at 192.0.2.1 and 192.0.2.2 * the active nodes also have two independent PSUs (Power Supply Units) connected to two independent PDUs (Power Distribution Units) reached at 198.51.100.1 (port 10 and port 11) and 203.0.113.1 (port 10 and port 11) * the first fencing method uses the `fence_ipmi` agent * the second fencing method uses the `fence_apc_snmp` agent targetting 2 fencing devices (one per PSU, either port 10 or 11) * fencing is only implemented for the active nodes and has location constraints * fencing topology is set to try IPMI fencing first then default to a "sure-kill" dual PDU fencing In a normal failure scenario, STONITH will first select +fence_ipmi+ to try to kill the faulty node. Using a fencing topology, if that first method fails, STONITH will then move on to selecting +fence_apc_snmp+ twice: * once for the first PDU * again for the second PDU The fence action is considered successful only if both PDUs report the required status. If any of them fails, STONITH loops back to the first fencing method, +fence_ipmi+, and so on until the node is fenced or fencing action is cancelled. .First fencing method: single IPMI device Each cluster node has it own dedicated IPMI channel that can be called for fencing using the following primitives: [source,XML] ---- ---- .Second fencing method: dual PDU devices Each cluster node also has two distinct power channels controlled by two distinct PDUs. That means a total of 4 fencing devices configured as follows: - Node 1, PDU 1, PSU 1 @ port 10 - Node 1, PDU 2, PSU 2 @ port 10 - Node 2, PDU 1, PSU 1 @ port 11 - Node 2, PDU 2, PSU 2 @ port 11 The matching fencing agents are configured as follows: [source,XML] ---- ---- .Location Constraints To prevent STONITH from trying to run a fencing agent on the same node it is supposed to fence, constraints are placed on all the fencing primitives: [source,XML] ---- ---- .Fencing topology Now that all the fencing resources are defined, it's time to create the right topology. We want to first fence using IPMI and if that does not work, fence both PDUs to effectively and surely kill the node. [source,XML] ---- ---- Please note, in +fencing-topology+, the lowest +index+ value determines the priority of the first fencing method. .Final configuration Put together, the configuration looks like this: [source,XML] ---- ... ... ---- == Remapping Reboots == When the cluster needs to reboot a node, whether because +stonith-action+ is +reboot+ or because a reboot was manually requested (such as by `stonith_admin --reboot`), it will remap that to other commands in two cases: . If the chosen fencing device does not support the +reboot+ command, the cluster will ask it to perform +off+ instead. . If a fencing topology level with multiple devices must be executed, the cluster will ask all the devices to perform +off+, then ask the devices to perform +on+. To understand the second case, consider the example of a node with redundant power supplies connected to intelligent power switches. Rebooting one switch and then the other would have no effect on the node. Turning both switches off, and then on, actually reboots the node. In such a case, the fencing operation will be treated as successful as long as the +off+ commands succeed, because then it is safe for the cluster to recover any resources that were on the node. Timeouts and errors in the +on+ phase will be logged but ignored. When a reboot operation is remapped, any action-specific timeout for the remapped action will be used (for example, +pcmk_off_timeout+ will be used when executing the +off+ command, not +pcmk_reboot_timeout+).