diff --git a/daemons/fenced/pacemaker-fenced.c b/daemons/fenced/pacemaker-fenced.c index 1b954be5a4..12f331496c 100644 --- a/daemons/fenced/pacemaker-fenced.c +++ b/daemons/fenced/pacemaker-fenced.c @@ -1,1710 +1,1714 @@ /* * Copyright 2009-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #include #include #include #include #include #include #include #include #include // PRIu32, PRIx32 #include #include #include #include #include #include #include #include #include #include #include #include #include #include char *stonith_our_uname = NULL; long stonith_watchdog_timeout_ms = 0; GList *stonith_watchdog_targets = NULL; static GMainLoop *mainloop = NULL; gboolean stand_alone = FALSE; static gboolean no_cib_connect = FALSE; static gboolean stonith_shutdown_flag = FALSE; static qb_ipcs_service_t *ipcs = NULL; static xmlNode *local_cib = NULL; static pe_working_set_t *fenced_data_set = NULL; static cib_t *cib_api = NULL; static pcmk__output_t *out = NULL; pcmk__supported_format_t formats[] = { PCMK__SUPPORTED_FORMAT_LOG, PCMK__SUPPORTED_FORMAT_NONE, PCMK__SUPPORTED_FORMAT_TEXT, { NULL, NULL, NULL } }; static void stonith_shutdown(int nsig); static void stonith_cleanup(void); static int32_t st_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { if (stonith_shutdown_flag) { crm_info("Ignoring new client [%d] during shutdown", pcmk__client_pid(c)); return -EPERM; } if (pcmk__new_client(c, uid, gid) == NULL) { return -EIO; } return 0; } /* Exit code means? */ static int32_t st_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; int call_options = 0; xmlNode *request = NULL; pcmk__client_t *c = pcmk__find_client(qbc); const char *op = NULL; if (c == NULL) { crm_info("Invalid client: %p", qbc); return 0; } request = pcmk__client_data2xml(c, data, &id, &flags); if (request == NULL) { pcmk__ipc_send_ack(c, id, flags, "nack", CRM_EX_PROTOCOL); return 0; } op = crm_element_value(request, F_CRM_TASK); if(pcmk__str_eq(op, CRM_OP_RM_NODE_CACHE, pcmk__str_casei)) { crm_xml_add(request, F_TYPE, T_STONITH_NG); crm_xml_add(request, F_STONITH_OPERATION, op); crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); send_cluster_message(NULL, crm_msg_stonith_ng, request, FALSE); free_xml(request); return 0; } if (c->name == NULL) { const char *value = crm_element_value(request, F_STONITH_CLIENTNAME); if (value == NULL) { value = "unknown"; } c->name = crm_strdup_printf("%s.%u", value, c->pid); } crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options); crm_trace("Flags %#08" PRIx32 "/%#08x for command %" PRIu32 " from client %s", flags, call_options, id, pcmk__client_name(c)); if (pcmk_is_set(call_options, st_opt_sync_call)) { CRM_ASSERT(flags & crm_ipc_client_response); CRM_LOG_ASSERT(c->request_id == 0); /* This means the client has two synchronous events in-flight */ c->request_id = id; /* Reply only to the last one */ } crm_xml_add(request, F_STONITH_CLIENTID, c->id); crm_xml_add(request, F_STONITH_CLIENTNAME, pcmk__client_name(c)); crm_xml_add(request, F_STONITH_CLIENTNODE, stonith_our_uname); stonith_command(c, id, flags, request, NULL); free_xml(request); return 0; } /* Error code means? */ static int32_t st_ipc_closed(qb_ipcs_connection_t * c) { pcmk__client_t *client = pcmk__find_client(c); if (client == NULL) { return 0; } crm_trace("Connection %p closed", c); pcmk__free_client(client); /* 0 means: yes, go ahead and destroy the connection */ return 0; } static void st_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p destroyed", c); st_ipc_closed(c); } static void stonith_peer_callback(xmlNode * msg, void *private_data) { const char *remote_peer = crm_element_value(msg, F_ORIG); const char *op = crm_element_value(msg, F_STONITH_OPERATION); if (pcmk__str_eq(op, "poke", pcmk__str_none)) { return; } crm_log_xml_trace(msg, "Peer[inbound]"); stonith_command(NULL, 0, 0, msg, remote_peer); } #if SUPPORT_COROSYNC static void stonith_peer_ais_callback(cpg_handle_t handle, const struct cpg_name *groupName, uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len) { uint32_t kind = 0; xmlNode *xml = NULL; const char *from = NULL; char *data = pcmk_message_common_cs(handle, nodeid, pid, msg, &kind, &from); if(data == NULL) { return; } if (kind == crm_class_cluster) { xml = string2xml(data); if (xml == NULL) { crm_err("Invalid XML: '%.120s'", data); free(data); return; } crm_xml_add(xml, F_ORIG, from); /* crm_xml_add_int(xml, F_SEQ, wrapper->id); */ stonith_peer_callback(xml, NULL); } free_xml(xml); free(data); return; } static void stonith_peer_cs_destroy(gpointer user_data) { crm_crit("Lost connection to cluster layer, shutting down"); stonith_shutdown(0); } #endif void do_local_reply(xmlNode * notify_src, const char *client_id, gboolean sync_reply, gboolean from_peer) { /* send callback to originating child */ pcmk__client_t *client_obj = NULL; int local_rc = pcmk_rc_ok; crm_trace("Sending response"); client_obj = pcmk__find_client_by_id(client_id); crm_trace("Sending callback to request originator"); if (client_obj == NULL) { local_rc = EPROTO; crm_trace("No client to sent the response to. F_STONITH_CLIENTID not set."); } else { int rid = 0; if (sync_reply) { CRM_LOG_ASSERT(client_obj->request_id); rid = client_obj->request_id; client_obj->request_id = 0; crm_trace("Sending response %d to client %s%s", rid, pcmk__client_name(client_obj), (from_peer? " (originator of delegated request)" : "")); } else { crm_trace("Sending an event to client %s%s", pcmk__client_name(client_obj), (from_peer? " (originator of delegated request)" : "")); } local_rc = pcmk__ipc_send_xml(client_obj, rid, notify_src, (sync_reply? crm_ipc_flags_none : crm_ipc_server_event)); } if ((local_rc != pcmk_rc_ok) && (client_obj != NULL)) { crm_warn("%s reply to client %s failed: %s", (sync_reply? "Synchronous" : "Asynchronous"), pcmk__client_name(client_obj), pcmk_rc_str(local_rc)); } } uint64_t get_stonith_flag(const char *name) { if (pcmk__str_eq(name, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) { return st_callback_notify_fence; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_ADD, pcmk__str_casei)) { return st_callback_device_add; } else if (pcmk__str_eq(name, STONITH_OP_DEVICE_DEL, pcmk__str_casei)) { return st_callback_device_del; } else if (pcmk__str_eq(name, T_STONITH_NOTIFY_HISTORY, pcmk__str_casei)) { return st_callback_notify_history; } else if (pcmk__str_eq(name, T_STONITH_NOTIFY_HISTORY_SYNCED, pcmk__str_casei)) { return st_callback_notify_history_synced; } return st_callback_unknown; } static void stonith_notify_client(gpointer key, gpointer value, gpointer user_data) { xmlNode *update_msg = user_data; pcmk__client_t *client = value; const char *type = NULL; CRM_CHECK(client != NULL, return); CRM_CHECK(update_msg != NULL, return); type = crm_element_value(update_msg, F_SUBTYPE); CRM_CHECK(type != NULL, crm_log_xml_err(update_msg, "notify"); return); if (client->ipcs == NULL) { crm_trace("Skipping client with NULL channel"); return; } if (pcmk_is_set(client->flags, get_stonith_flag(type))) { int rc = pcmk__ipc_send_xml(client, 0, update_msg, crm_ipc_server_event|crm_ipc_server_error); if (rc != pcmk_rc_ok) { crm_warn("%s notification of client %s failed: %s " CRM_XS " id=%.8s rc=%d", type, pcmk__client_name(client), pcmk_rc_str(rc), client->id, rc); } else { crm_trace("Sent %s notification to client %s", type, pcmk__client_name(client)); } } } void do_stonith_async_timeout_update(const char *client_id, const char *call_id, int timeout) { pcmk__client_t *client = NULL; xmlNode *notify_data = NULL; if (!timeout || !call_id || !client_id) { return; } client = pcmk__find_client_by_id(client_id); if (!client) { return; } notify_data = create_xml_node(NULL, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_TYPE, T_STONITH_TIMEOUT_VALUE); crm_xml_add(notify_data, F_STONITH_CALLID, call_id); crm_xml_add_int(notify_data, F_STONITH_TIMEOUT, timeout); crm_trace("timeout update is %d for client %s and call id %s", timeout, client_id, call_id); if (client) { pcmk__ipc_send_xml(client, 0, notify_data, crm_ipc_server_event); } free_xml(notify_data); } /*! * \internal * \brief Notify relevant IPC clients of a fencing operation result * * \param[in] type Notification type * \param[in] result Result of fencing operation (assume success if NULL) * \param[in] data If not NULL, add to notification as call data */ void fenced_send_notification(const char *type, const pcmk__action_result_t *result, xmlNode *data) { /* TODO: Standardize the contents of data */ xmlNode *update_msg = create_xml_node(NULL, "notify"); CRM_LOG_ASSERT(type != NULL); crm_xml_add(update_msg, F_TYPE, T_STONITH_NOTIFY); crm_xml_add(update_msg, F_SUBTYPE, type); crm_xml_add(update_msg, F_STONITH_OPERATION, type); stonith__xe_set_result(update_msg, result); if (data != NULL) { add_message_xml(update_msg, F_STONITH_CALLDATA, data); } crm_trace("Notifying clients"); pcmk__foreach_ipc_client(stonith_notify_client, update_msg); free_xml(update_msg); crm_trace("Notify complete"); } /*! * \internal * \brief Send notifications for a configuration change to subscribed clients * * \param[in] op Notification type (STONITH_OP_DEVICE_ADD, * STONITH_OP_DEVICE_DEL, STONITH_OP_LEVEL_ADD, or * STONITH_OP_LEVEL_DEL) * \param[in] result Operation result * \param[in] desc Description of what changed * \param[in] active Current number of devices or topologies in use */ static void send_config_notification(const char *op, const pcmk__action_result_t *result, const char *desc, int active) { xmlNode *notify_data = create_xml_node(NULL, op); CRM_CHECK(notify_data != NULL, return); crm_xml_add(notify_data, F_STONITH_DEVICE, desc); crm_xml_add_int(notify_data, F_STONITH_ACTIVE, active); fenced_send_notification(op, result, notify_data); free_xml(notify_data); } /*! * \internal * \brief Send notifications for a device change to subscribed clients * * \param[in] op Notification type (STONITH_OP_DEVICE_ADD or * STONITH_OP_DEVICE_DEL) * \param[in] result Operation result * \param[in] desc ID of device that changed */ void fenced_send_device_notification(const char *op, const pcmk__action_result_t *result, const char *desc) { send_config_notification(op, result, desc, g_hash_table_size(device_list)); } /*! * \internal * \brief Send notifications for a topology level change to subscribed clients * * \param[in] op Notification type (STONITH_OP_LEVEL_ADD or * STONITH_OP_LEVEL_DEL) * \param[in] result Operation result * \param[in] desc String representation of level ([]) */ void fenced_send_level_notification(const char *op, const pcmk__action_result_t *result, const char *desc) { send_config_notification(op, result, desc, g_hash_table_size(topology)); } static void topology_remove_helper(const char *node, int level) { char *desc = NULL; pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; xmlNode *data = create_xml_node(NULL, XML_TAG_FENCING_LEVEL); crm_xml_add(data, F_STONITH_ORIGIN, __func__); crm_xml_add_int(data, XML_ATTR_STONITH_INDEX, level); crm_xml_add(data, XML_ATTR_STONITH_TARGET, node); fenced_unregister_level(data, &desc, &result); fenced_send_level_notification(STONITH_OP_LEVEL_DEL, &result, desc); pcmk__reset_result(&result); free_xml(data); free(desc); } static void remove_cib_device(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if(match != NULL) { standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); } if (!pcmk__str_eq(standard, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { continue; } rsc_id = crm_element_value(match, XML_ATTR_ID); stonith_device_remove(rsc_id, true); } } static void handle_topology_change(xmlNode *match, bool remove) { char *desc = NULL; pcmk__action_result_t result = PCMK__UNKNOWN_RESULT; CRM_CHECK(match != NULL, return); crm_trace("Updating %s", ID(match)); if(remove) { int index = 0; char *key = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); topology_remove_helper(key, index); free(key); } fenced_register_level(match, &desc, &result); fenced_send_level_notification(STONITH_OP_LEVEL_ADD, &result, desc); pcmk__reset_result(&result); free(desc); } static void remove_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); CRM_LOG_ASSERT(match != NULL); if (match && crm_element_value(match, XML_DIFF_MARKER)) { /* Deletion */ int index = 0; char *target = stonith_level_key(match, -1); crm_element_value_int(match, XML_ATTR_STONITH_INDEX, &index); if (target == NULL) { crm_err("Invalid fencing target in element %s", ID(match)); } else if (index <= 0) { crm_err("Invalid level for %s in element %s", target, ID(match)); } else { topology_remove_helper(target, index); } /* } else { Deal with modifications during the 'addition' stage */ } } } static void register_fencing_topology(xmlXPathObjectPtr xpathObj) { int max = numXpathResults(xpathObj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpathObj, lpc); handle_topology_change(match, TRUE); } } /* Fencing */ static void fencing_topology_init(void) { xmlXPathObjectPtr xpathObj = NULL; const char *xpath = "//" XML_TAG_FENCING_LEVEL; crm_trace("Full topology refresh"); free_topology_list(); init_topology_list(); /* Grab everything */ xpathObj = xpath_search(local_cib, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } #define rsc_name(x) x->clone_name?x->clone_name:x->id /*! * \internal * \brief Check whether our uname is in a resource's allowed node list * * \param[in] rsc Resource to check * * \return Pointer to node object if found, NULL otherwise */ static pe_node_t * our_node_allowed_for(pe_resource_t *rsc) { GHashTableIter iter; pe_node_t *node = NULL; if (rsc && stonith_our_uname) { g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { if (node && strcmp(node->details->uname, stonith_our_uname) == 0) { break; } node = NULL; } } return node; } static void watchdog_device_update(void) { if (stonith_watchdog_timeout_ms > 0) { if (!g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID) && !stonith_watchdog_targets) { /* getting here watchdog-fencing enabled, no device there yet and reason isn't stonith_watchdog_targets preventing that */ int rc; xmlNode *xml; xml = create_device_registration_xml( STONITH_WATCHDOG_ID, st_namespace_internal, STONITH_WATCHDOG_AGENT, NULL, /* stonith_device_register will add our own name as PCMK_STONITH_HOST_LIST param so we can skip that here */ NULL); rc = stonith_device_register(xml, NULL, TRUE); free_xml(xml); if (rc != pcmk_ok) { crm_crit("Cannot register watchdog pseudo fence agent"); crm_exit(CRM_EX_FATAL); } } } else { /* be silent if no device - todo parameter to stonith_device_remove */ if (g_hash_table_lookup(device_list, STONITH_WATCHDOG_ID)) { stonith_device_remove(STONITH_WATCHDOG_ID, true); } } } static void update_stonith_watchdog_timeout_ms(xmlNode *cib) { xmlNode *stonith_enabled_xml = NULL; bool stonith_enabled = false; int rc = pcmk_rc_ok; long timeout_ms = 0; stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", cib, LOG_NEVER); rc = pcmk__xe_get_bool_attr(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE, &stonith_enabled); if (rc != pcmk_rc_ok || stonith_enabled) { xmlNode *stonith_watchdog_xml = NULL; const char *value = NULL; stonith_watchdog_xml = get_xpath_object("//nvpair[@name='stonith-watchdog-timeout']", cib, LOG_NEVER); if (stonith_watchdog_xml) { value = crm_element_value(stonith_watchdog_xml, XML_NVPAIR_ATTR_VALUE); } if (value) { timeout_ms = crm_get_msec(value); } if (timeout_ms < 0) { timeout_ms = pcmk__auto_watchdog_timeout(); } } stonith_watchdog_timeout_ms = timeout_ms; } /*! * \internal * \brief If a resource or any of its children are STONITH devices, update their * definitions given a cluster working set. * * \param[in] rsc Resource to check * \param[in] data_set Cluster working set with device information */ static void cib_device_update(pe_resource_t *rsc, pe_working_set_t *data_set) { pe_node_t *node = NULL; const char *value = NULL; const char *rclass = NULL; pe_node_t *parent = NULL; /* If this is a complex resource, check children rather than this resource itself. */ if(rsc->children) { GList *gIter = NULL; for (gIter = rsc->children; gIter != NULL; gIter = gIter->next) { cib_device_update(gIter->data, data_set); if(pe_rsc_is_clone(rsc)) { crm_trace("Only processing one copy of the clone %s", rsc->id); break; } } return; } /* We only care about STONITH resources. */ rclass = crm_element_value(rsc->xml, XML_AGENT_ATTR_CLASS); if (!pcmk__str_eq(rclass, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { return; } /* If this STONITH resource is disabled, remove it. */ if (pe__resource_is_disabled(rsc)) { crm_info("Device %s has been disabled", rsc->id); return; } /* if watchdog-fencing is disabled handle any watchdog-fence resource as if it was disabled */ if ((stonith_watchdog_timeout_ms <= 0) && pcmk__str_eq(rsc->id, STONITH_WATCHDOG_ID, pcmk__str_none)) { crm_info("Watchdog-fencing disabled thus handling " "device %s as disabled", rsc->id); return; } /* Check whether our node is allowed for this resource (and its parent if in a group) */ node = our_node_allowed_for(rsc); if (rsc->parent && (rsc->parent->variant == pe_group)) { parent = our_node_allowed_for(rsc->parent); } if(node == NULL) { /* Our node is disallowed, so remove the device */ GHashTableIter iter; crm_info("Device %s has been disabled on %s: unknown", rsc->id, stonith_our_uname); g_hash_table_iter_init(&iter, rsc->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (void **)&node)) { crm_trace("Available: %s = %d", node->details->uname, node->weight); } return; } else if(node->weight < 0 || (parent && parent->weight < 0)) { /* Our node (or its group) is disallowed by score, so remove the device */ char *score = score2char((node->weight < 0) ? node->weight : parent->weight); crm_info("Device %s has been disabled on %s: score=%s", rsc->id, stonith_our_uname, score); free(score); return; } else { /* Our node is allowed, so update the device information */ int rc; xmlNode *data; GHashTable *rsc_params = NULL; GHashTableIter gIter; stonith_key_value_t *params = NULL; const char *name = NULL; const char *agent = crm_element_value(rsc->xml, XML_EXPR_ATTR_TYPE); const char *rsc_provides = NULL; crm_debug("Device %s is allowed on %s: score=%d", rsc->id, stonith_our_uname, node->weight); rsc_params = pe_rsc_params(rsc, node, data_set); get_meta_attributes(rsc->meta, rsc, node, data_set); rsc_provides = g_hash_table_lookup(rsc->meta, PCMK_STONITH_PROVIDES); g_hash_table_iter_init(&gIter, rsc_params); while (g_hash_table_iter_next(&gIter, (gpointer *) & name, (gpointer *) & value)) { if (!name || !value) { continue; } params = stonith_key_value_add(params, name, value); crm_trace(" %s=%s", name, value); } data = create_device_registration_xml(rsc_name(rsc), st_namespace_any, agent, params, rsc_provides); stonith_key_value_freeall(params, 1, 1); rc = stonith_device_register(data, NULL, TRUE); CRM_ASSERT(rc == pcmk_ok); free_xml(data); } } /*! * \internal * \brief Update all STONITH device definitions based on current CIB */ static void cib_devices_update(void) { GHashTableIter iter; stonith_device_t *device = NULL; crm_info("Updating devices to version %s.%s.%s", crm_element_value(local_cib, XML_ATTR_GENERATION_ADMIN), crm_element_value(local_cib, XML_ATTR_GENERATION), crm_element_value(local_cib, XML_ATTR_NUMUPDATES)); CRM_ASSERT(fenced_data_set != NULL); fenced_data_set->input = local_cib; fenced_data_set->now = crm_time_new(NULL); fenced_data_set->localhost = stonith_our_uname; pe__set_working_set_flags(fenced_data_set, pe_flag_quick_location); cluster_status(fenced_data_set); pcmk__schedule_actions(fenced_data_set, NULL, NULL); g_hash_table_iter_init(&iter, device_list); while (g_hash_table_iter_next(&iter, NULL, (void **)&device)) { if (device->cib_registered) { device->dirty = TRUE; } } /* have list repopulated if cib has a watchdog-fencing-resource TODO: keep a cached list for queries happening while we are refreshing */ g_list_free_full(stonith_watchdog_targets, free); stonith_watchdog_targets = NULL; g_list_foreach(fenced_data_set->resources, (GFunc) cib_device_update, fenced_data_set); g_hash_table_iter_init(&iter, device_list); while (g_hash_table_iter_next(&iter, NULL, (void **)&device)) { if (device->dirty) { g_hash_table_iter_remove(&iter); } } fenced_data_set->input = NULL; // Wasn't a copy, so don't let API free it pe_reset_working_set(fenced_data_set); } static void update_cib_stonith_devices_v2(const char *event, xmlNode * msg) { xmlNode *change = NULL; char *reason = NULL; bool needs_update = FALSE; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); for (change = pcmk__xml_first_child(patchset); change != NULL; change = pcmk__xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); const char *shortpath = NULL; if ((op == NULL) || (strcmp(op, "move") == 0) || strstr(xpath, "/"XML_CIB_TAG_STATUS)) { continue; } else if (pcmk__str_eq(op, "delete", pcmk__str_casei) && strstr(xpath, "/"XML_CIB_TAG_RESOURCE)) { const char *rsc_id = NULL; char *search = NULL; char *mutable = NULL; if (strstr(xpath, XML_TAG_ATTR_SETS) || strstr(xpath, XML_TAG_META_SETS)) { needs_update = TRUE; reason = strdup("(meta) attribute deleted from resource"); break; } mutable = strdup(xpath); rsc_id = strstr(mutable, "primitive[@id=\'"); if (rsc_id != NULL) { rsc_id += strlen("primitive[@id=\'"); search = strchr(rsc_id, '\''); } if (search != NULL) { *search = 0; stonith_device_remove(rsc_id, true); /* watchdog_device_update called afterwards to fall back to implicit definition if needed */ } else { crm_warn("Ignoring malformed CIB update (resource deletion)"); } free(mutable); } else if (strstr(xpath, "/"XML_CIB_TAG_RESOURCES) || strstr(xpath, "/"XML_CIB_TAG_CONSTRAINTS) || strstr(xpath, "/"XML_CIB_TAG_RSCCONFIG)) { shortpath = strrchr(xpath, '/'); CRM_ASSERT(shortpath); reason = crm_strdup_printf("%s %s", op, shortpath+1); needs_update = TRUE; break; } } if(needs_update) { crm_info("Updating device list from CIB: %s", reason); cib_devices_update(); } else { crm_trace("No updates for device list found in CIB"); } free(reason); } static void update_cib_stonith_devices_v1(const char *event, xmlNode * msg) { const char *reason = "none"; gboolean needs_update = FALSE; xmlXPathObjectPtr xpath_obj = NULL; /* process new constraints */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_CONS_TAG_RSC_LOCATION); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; /* Safest and simplest to always recompute */ needs_update = TRUE; reason = "new location constraint"; for (lpc = 0; lpc < max; lpc++) { xmlNode *match = getXpathResult(xpath_obj, lpc); crm_log_xml_trace(match, "new constraint"); } } freeXpathObject(xpath_obj); /* process deletions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { remove_cib_device(xpath_obj); } freeXpathObject(xpath_obj); /* process additions */ xpath_obj = xpath_search(msg, "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_CIB_TAG_RESOURCE); if (numXpathResults(xpath_obj) > 0) { int max = numXpathResults(xpath_obj), lpc = 0; for (lpc = 0; lpc < max; lpc++) { const char *rsc_id = NULL; const char *standard = NULL; xmlNode *match = getXpathResult(xpath_obj, lpc); rsc_id = crm_element_value(match, XML_ATTR_ID); standard = crm_element_value(match, XML_AGENT_ATTR_CLASS); if (!pcmk__str_eq(standard, PCMK_RESOURCE_CLASS_STONITH, pcmk__str_casei)) { continue; } crm_trace("Fencing resource %s was added or modified", rsc_id); reason = "new resource"; needs_update = TRUE; } } freeXpathObject(xpath_obj); if(needs_update) { crm_info("Updating device list from CIB: %s", reason); cib_devices_update(); } } static void update_cib_stonith_devices(const char *event, xmlNode * msg) { int format = 1; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); switch(format) { case 1: update_cib_stonith_devices_v1(event, msg); break; case 2: update_cib_stonith_devices_v2(event, msg); break; default: crm_warn("Unknown patch format: %d", format); } } /* Needs to hold node name + attribute name + attribute value + 75 */ #define XPATH_MAX 512 /*! * \internal * \brief Check whether a node has a specific attribute name/value * * \param[in] node Name of node to check * \param[in] name Name of an attribute to look for * \param[in] value The value the named attribute needs to be set to in order to be considered a match * * \return TRUE if the locally cached CIB has the specified node attribute */ gboolean node_has_attr(const char *node, const char *name, const char *value) { char xpath[XPATH_MAX]; xmlNode *match; int n; CRM_CHECK(local_cib != NULL, return FALSE); /* Search for the node's attributes in the CIB. While the schema allows * multiple sets of instance attributes, and allows instance attributes to * use id-ref to reference values elsewhere, that is intended for resources, * so we ignore that here. */ n = snprintf(xpath, XPATH_MAX, "//" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE "[@uname='%s']/" XML_TAG_ATTR_SETS "/" XML_CIB_TAG_NVPAIR "[@name='%s' and @value='%s']", node, name, value); match = get_xpath_object(xpath, local_cib, LOG_NEVER); CRM_CHECK(n < XPATH_MAX, return FALSE); return (match != NULL); } /*! * \internal * \brief Check whether a node does watchdog-fencing * * \param[in] node Name of node to check * * \return TRUE if node found in stonith_watchdog_targets * or stonith_watchdog_targets is empty indicating * all nodes are doing watchdog-fencing */ gboolean node_does_watchdog_fencing(const char *node) { return ((stonith_watchdog_targets == NULL) || pcmk__str_in_list(node, stonith_watchdog_targets, pcmk__str_casei)); } static void update_fencing_topology(const char *event, xmlNode * msg) { int format = 1; const char *xpath; xmlXPathObjectPtr xpathObj = NULL; xmlNode *patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); CRM_ASSERT(patchset); crm_element_value_int(patchset, "format", &format); if(format == 1) { /* Process deletions (only) */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_REMOVED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); remove_fencing_topology(xpathObj); freeXpathObject(xpathObj); /* Process additions and changes */ xpath = "//" F_CIB_UPDATE_RESULT "//" XML_TAG_DIFF_ADDED "//" XML_TAG_FENCING_LEVEL; xpathObj = xpath_search(msg, xpath); register_fencing_topology(xpathObj); freeXpathObject(xpathObj); } else if(format == 2) { xmlNode *change = NULL; int add[] = { 0, 0, 0 }; int del[] = { 0, 0, 0 }; xml_patch_versions(patchset, add, del); for (change = pcmk__xml_first_child(patchset); change != NULL; change = pcmk__xml_next(change)) { const char *op = crm_element_value(change, XML_DIFF_OP); const char *xpath = crm_element_value(change, XML_DIFF_PATH); if(op == NULL) { continue; } else if(strstr(xpath, "/" XML_TAG_FENCING_LEVEL) != NULL) { /* Change to a specific entry */ crm_trace("Handling %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); if(strcmp(op, "move") == 0) { continue; } else if(strcmp(op, "create") == 0) { handle_topology_change(change->children, FALSE); } else if(strcmp(op, "modify") == 0) { xmlNode *match = first_named_child(change, XML_DIFF_RESULT); if(match) { handle_topology_change(match->children, TRUE); } } else if(strcmp(op, "delete") == 0) { /* Nuclear option, all we have is the path and an id... not enough to remove a specific entry */ crm_info("Re-initializing fencing topology after %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } } else if (strstr(xpath, "/" XML_TAG_FENCING_TOPOLOGY) != NULL) { /* Change to the topology in general */ crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } else if (strstr(xpath, "/" XML_CIB_TAG_CONFIGURATION)) { /* Changes to the whole config section, possibly including the topology as a whild */ if(first_named_child(change, XML_TAG_FENCING_TOPOLOGY) == NULL) { crm_trace("Nothing for us in %s operation %d.%d.%d for %s.", op, add[0], add[1], add[2], xpath); } else if(strcmp(op, "delete") == 0 || strcmp(op, "create") == 0) { crm_info("Re-initializing fencing topology after top-level %s operation %d.%d.%d for %s.", op, add[0], add[1], add[2], xpath); fencing_topology_init(); return; } } else { crm_trace("Nothing for us in %s operation %d.%d.%d for %s", op, add[0], add[1], add[2], xpath); } } } else { crm_warn("Unknown patch format: %d", format); } } static bool have_cib_devices = FALSE; static void update_cib_cache_cb(const char *event, xmlNode * msg) { int rc = pcmk_ok; xmlNode *stonith_enabled_xml = NULL; static gboolean stonith_enabled_saved = TRUE; long timeout_ms_saved = stonith_watchdog_timeout_ms; gboolean need_full_refresh = FALSE; bool value = false; if(!have_cib_devices) { crm_trace("Skipping updates until we get a full dump"); return; } else if(msg == NULL) { crm_trace("Missing %s update", event); return; } /* Maintain a local copy of the CIB so that we have full access * to device definitions, location constraints, and node attributes */ if (local_cib != NULL) { int rc = pcmk_ok; xmlNode *patchset = NULL; crm_element_value_int(msg, F_CIB_RC, &rc); if (rc != pcmk_ok) { return; } patchset = get_message_xml(msg, F_CIB_UPDATE_RESULT); xml_log_patchset(LOG_TRACE, "Config update", patchset); rc = xml_apply_patchset(local_cib, patchset, TRUE); switch (rc) { case pcmk_ok: case -pcmk_err_old_data: break; case -pcmk_err_diff_resync: case -pcmk_err_diff_failed: crm_notice("[%s] Patch aborted: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(local_cib); local_cib = NULL; break; default: crm_warn("[%s] ABORTED: %s (%d)", event, pcmk_strerror(rc), rc); free_xml(local_cib); local_cib = NULL; } } if (local_cib == NULL) { crm_trace("Re-requesting full CIB"); rc = cib_api->cmds->query(cib_api, NULL, &local_cib, cib_scope_local | cib_sync_call); if(rc != pcmk_ok) { crm_err("Couldn't retrieve the CIB: %s (%d)", pcmk_strerror(rc), rc); return; } CRM_ASSERT(local_cib != NULL); stonith_enabled_saved = FALSE; /* Trigger a full refresh below */ } pcmk__refresh_node_caches_from_cib(local_cib); update_stonith_watchdog_timeout_ms(local_cib); stonith_enabled_xml = get_xpath_object("//nvpair[@name='stonith-enabled']", local_cib, LOG_NEVER); if (pcmk__xe_get_bool_attr(stonith_enabled_xml, XML_NVPAIR_ATTR_VALUE, &value) == pcmk_rc_ok && !value) { crm_trace("Ignoring CIB updates while fencing is disabled"); stonith_enabled_saved = FALSE; } else if (stonith_enabled_saved == FALSE) { crm_info("Updating fencing device and topology lists " "now that fencing is enabled"); stonith_enabled_saved = TRUE; need_full_refresh = TRUE; } else { if (timeout_ms_saved != stonith_watchdog_timeout_ms) { need_full_refresh = TRUE; } else { update_fencing_topology(event, msg); update_cib_stonith_devices(event, msg); watchdog_device_update(); } } if (need_full_refresh) { fencing_topology_init(); cib_devices_update(); watchdog_device_update(); } } static void init_cib_cache_cb(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data) { crm_info("Updating device list from CIB"); have_cib_devices = TRUE; local_cib = copy_xml(output); pcmk__refresh_node_caches_from_cib(local_cib); update_stonith_watchdog_timeout_ms(local_cib); fencing_topology_init(); cib_devices_update(); watchdog_device_update(); } static void stonith_shutdown(int nsig) { crm_info("Terminating with %d clients", pcmk__ipc_client_count()); stonith_shutdown_flag = TRUE; if (mainloop != NULL && g_main_loop_is_running(mainloop)) { g_main_loop_quit(mainloop); } else { stonith_cleanup(); crm_exit(CRM_EX_OK); } } static void cib_connection_destroy(gpointer user_data) { if (stonith_shutdown_flag) { crm_info("Connection to the CIB manager closed"); return; } else { crm_crit("Lost connection to the CIB manager, shutting down"); } if (cib_api) { cib_api->cmds->signoff(cib_api); } stonith_shutdown(0); } static void stonith_cleanup(void) { if (cib_api) { cib_api->cmds->del_notify_callback(cib_api, T_CIB_DIFF_NOTIFY, update_cib_cache_cb); cib_api->cmds->signoff(cib_api); } if (ipcs) { qb_ipcs_destroy(ipcs); } crm_peer_destroy(); pcmk__client_cleanup(); free_stonith_remote_op_list(); free_topology_list(); free_device_list(); free_metadata_cache(); free(stonith_our_uname); stonith_our_uname = NULL; free_xml(local_cib); local_cib = NULL; } static pcmk__cli_option_t long_options[] = { // long option, argument type, storage, short option, description, flags { "stand-alone", no_argument, 0, 's', NULL, pcmk__option_default }, { "stand-alone-w-cpg", no_argument, 0, 'c', NULL, pcmk__option_default }, { "logfile", required_argument, 0, 'l', NULL, pcmk__option_default }, { "verbose", no_argument, 0, 'V', NULL, pcmk__option_default }, { "version", no_argument, 0, '$', NULL, pcmk__option_default }, { "help", no_argument, 0, '?', NULL, pcmk__option_default }, { 0, 0, 0, 0 } }; static void setup_cib(void) { int rc, retries = 0; cib_api = cib_new(); if (cib_api == NULL) { crm_err("No connection to the CIB manager"); return; } do { sleep(retries); rc = cib_api->cmds->signon(cib_api, CRM_SYSTEM_STONITHD, cib_command); } while (rc == -ENOTCONN && ++retries < 5); if (rc != pcmk_ok) { crm_err("Could not connect to the CIB manager: %s (%d)", pcmk_strerror(rc), rc); } else if (pcmk_ok != cib_api->cmds->add_notify_callback(cib_api, T_CIB_DIFF_NOTIFY, update_cib_cache_cb)) { crm_err("Could not set CIB notification callback"); } else { rc = cib_api->cmds->query(cib_api, NULL, NULL, cib_scope_local); cib_api->cmds->register_callback(cib_api, rc, 120, FALSE, NULL, "init_cib_cache_cb", init_cib_cache_cb); cib_api->cmds->set_connection_dnotify(cib_api, cib_connection_destroy); crm_info("Watching for fencing topology changes"); } } struct qb_ipcs_service_handlers ipc_callbacks = { .connection_accept = st_ipc_accept, .connection_created = NULL, .msg_process = st_ipc_dispatch, .connection_closed = st_ipc_closed, .connection_destroyed = st_ipc_destroy }; /*! * \internal * \brief Callback for peer status changes * * \param[in] type What changed * \param[in] node What peer had the change * \param[in] data Previous value of what changed */ static void st_peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data) { if ((type != crm_status_processes) && !pcmk_is_set(node->flags, crm_remote_node)) { /* * This is a hack until we can send to a nodeid and/or we fix node name lookups * These messages are ignored in stonith_peer_callback() */ xmlNode *query = create_xml_node(NULL, "stonith_command"); crm_xml_add(query, F_XML_TAGNAME, "stonith_command"); crm_xml_add(query, F_TYPE, T_STONITH_NG); crm_xml_add(query, F_STONITH_OPERATION, "poke"); crm_debug("Broadcasting our uname because of node %u", node->id); send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE); free_xml(query); } } int main(int argc, char **argv) { int flag; int lpc = 0; int argerr = 0; int option_index = 0; crm_cluster_t *cluster = NULL; const char *actions[] = { "reboot", "off", "on", "list", "monitor", "status" }; crm_ipc_t *old_instance = NULL; int rc = pcmk_rc_ok; crm_log_preinit(NULL, argc, argv); pcmk__set_cli_options(NULL, "[options]", long_options, "daemon for executing fencing devices in a " "Pacemaker cluster"); while (1) { flag = pcmk__next_cli_option(argc, argv, &option_index, NULL); if (flag == -1) { break; } switch (flag) { case 'V': crm_bump_log_level(argc, argv); break; case 'l': { int rc = pcmk__add_logfile(optarg); if (rc != pcmk_rc_ok) { /* Logging has not yet been initialized, so stderr is * the only way to get information out */ fprintf(stderr, "Logging to %s is disabled: %s\n", optarg, pcmk_rc_str(rc)); } } break; case 's': stand_alone = TRUE; break; case 'c': stand_alone = FALSE; no_cib_connect = TRUE; break; case '$': case '?': pcmk__cli_help(flag, CRM_EX_OK); break; default: ++argerr; break; } } if (argc - optind == 1 && pcmk__str_eq("metadata", argv[optind], pcmk__str_casei)) { printf("\n"); printf("\n"); printf(" 1.0\n"); printf(" Instance attributes available for all \"stonith\"-class resources" " and used by Pacemaker's fence daemon, formerly known as stonithd\n"); printf(" Instance attributes available for all \"stonith\"-class resources\n"); printf(" \n"); #if 0 // priority is not implemented yet printf(" \n"); printf(" Devices that are not in a topology " "are tried in order of highest to lowest integer priority\n"); printf(" \n"); printf(" \n"); #endif printf(" \n", PCMK_STONITH_HOST_ARGUMENT); printf (" Advanced use only: An alternate parameter to supply instead of 'port'\n"); printf (" Some devices do not support the standard 'port' parameter or may provide additional ones.\n" "Use this to specify an alternate, device-specific, parameter that should indicate the machine to be fenced.\n" "A value of 'none' can be used to tell the cluster not to supply any additional parameters.\n" " \n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_HOST_MAP); printf (" A mapping of host names to ports numbers for devices that do not support host names.\n"); printf (" Eg. node1:1;node2:2,3 would tell the cluster to use port 1 for node1 and ports 2 and 3 for node2\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_HOST_LIST); printf(" A list of machines controlled by " "this device (Optional unless %s=static-list).\n", PCMK_STONITH_HOST_CHECK); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_HOST_CHECK); printf (" How to determine which machines are controlled by the device.\n"); printf(" Allowed values: dynamic-list " "(query the device via the 'list' command), static-list " "(check the " PCMK_STONITH_HOST_LIST " attribute), status " "(query the device via the 'status' command), none (assume " "every device can fence every machine)\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_DELAY_MAX); printf(" Enable a delay of no more than the " "time specified before executing fencing actions. Pacemaker " "derives the overall delay by taking the value of " PCMK_STONITH_DELAY_BASE " and adding a random delay value such " "that the sum is kept below this maximum.\n"); printf(" This prevents double fencing when " "using slow devices such as sbd.\nUse this to enable a random " "delay for fencing actions.\nThe overall delay is derived from " "this random delay value adding a static delay so that the sum " "is kept below the maximum delay.\n"); printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_DELAY_BASE); printf(" Enable a base delay for " "fencing actions and specify base delay value.\n"); - printf(" This prevents double fencing when " - "different delays are configured on the nodes.\nUse this to " - "enable a static delay for fencing actions.\nThe overall delay " - "is derived from a random delay value adding this static delay " - "so that the sum is kept below the maximum delay.\nSet to eg. " - "node1:1s;node2:5 to set different value per node.\n"); - printf(" \n"); + printf(" This enables a static delay for " + "fencing actions, which can help avoid \"death matches\" where " + "two nodes try to fence each other at the same time. If " + PCMK_STONITH_DELAY_MAX " is also used, a random delay will be " + "added such that the total delay is kept below that value.\n" + "This can be set to a single time value to apply to any node " + "targeted by this device (useful if a separate device is " + "configured for each target), or to a node map (for example, " + "\"node1:1s;node2:5\") to set a different value per target.\n" + " \n"); + printf(" \n"); printf(" \n"); printf(" \n", PCMK_STONITH_ACTION_LIMIT); printf (" The maximum number of actions can be performed in parallel on this device\n"); printf (" Cluster property concurrent-fencing=true needs to be configured first.\n" "Then use this to specify the maximum number of actions can be performed in parallel on this device. -1 is unlimited.\n"); printf(" \n"); printf(" \n"); for (lpc = 0; lpc < PCMK__NELEM(actions); lpc++) { printf(" \n", actions[lpc]); printf (" Advanced use only: An alternate command to run instead of '%s'\n", actions[lpc]); printf (" Some devices do not support the standard commands or may provide additional ones.\n" "Use this to specify an alternate, device-specific, command that implements the '%s' action.\n", actions[lpc]); printf(" \n", actions[lpc]); printf(" \n"); printf(" \n", actions[lpc]); printf (" Advanced use only: Specify an alternate timeout to use for %s actions instead of stonith-timeout\n", actions[lpc]); printf (" Some devices need much more/less time to complete than normal.\n" "Use this to specify an alternate, device-specific, timeout for '%s' actions.\n", actions[lpc]); printf(" \n"); printf(" \n"); printf(" \n", actions[lpc]); printf (" Advanced use only: The maximum number of times to retry the '%s' command within the timeout period\n", actions[lpc]); printf(" Some devices do not support multiple connections." " Operations may 'fail' if the device is busy with another task so Pacemaker will automatically retry the operation, if there is time remaining." " Use this option to alter the number of times Pacemaker retries '%s' actions before giving up." "\n", actions[lpc]); printf(" \n"); printf(" \n"); } printf(" \n"); printf("\n"); return CRM_EX_OK; } if (optind != argc) { ++argerr; } if (argerr) { pcmk__cli_help('?', CRM_EX_USAGE); } crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE); crm_notice("Starting Pacemaker fencer"); old_instance = crm_ipc_new("stonith-ng", 0); if (crm_ipc_connect(old_instance)) { /* IPC end-point already up */ crm_ipc_close(old_instance); crm_ipc_destroy(old_instance); crm_err("pacemaker-fenced is already active, aborting startup"); crm_exit(CRM_EX_OK); } else { /* not up or not authentic, we'll proceed either way */ crm_ipc_destroy(old_instance); old_instance = NULL; } mainloop_add_signal(SIGTERM, stonith_shutdown); crm_peer_init(); fenced_data_set = pe_new_working_set(); CRM_ASSERT(fenced_data_set != NULL); pe__set_working_set_flags(fenced_data_set, pe_flag_no_counts|pe_flag_no_compat); pe__set_working_set_flags(fenced_data_set, pe_flag_show_utilization); cluster = calloc(1, sizeof(crm_cluster_t)); CRM_ASSERT(cluster != NULL); if (stand_alone == FALSE) { if (is_corosync_cluster()) { #if SUPPORT_COROSYNC cluster->destroy = stonith_peer_cs_destroy; cluster->cpg.cpg_deliver_fn = stonith_peer_ais_callback; cluster->cpg.cpg_confchg_fn = pcmk_cpg_membership; #endif } crm_set_status_callback(&st_peer_update_callback); if (crm_cluster_connect(cluster) == FALSE) { crm_crit("Cannot sign in to the cluster... terminating"); crm_exit(CRM_EX_FATAL); } stonith_our_uname = strdup(cluster->uname); if (no_cib_connect == FALSE) { setup_cib(); } } else { stonith_our_uname = strdup("localhost"); } init_device_list(); init_topology_list(); pcmk__serve_fenced_ipc(&ipcs, &ipc_callbacks); pcmk__register_formats(NULL, formats); rc = pcmk__output_new(&out, "log", NULL, argv); if ((rc != pcmk_rc_ok) || (out == NULL)) { crm_err("Can't log resource details due to internal error: %s\n", pcmk_rc_str(rc)); crm_exit(CRM_EX_FATAL); } pe__register_messages(out); pcmk__register_lib_messages(out); pcmk__output_set_log_level(out, LOG_TRACE); fenced_data_set->priv = out; /* Create the mainloop and run it... */ mainloop = g_main_loop_new(NULL, FALSE); crm_notice("Pacemaker fencer successfully started and accepting connections"); g_main_loop_run(mainloop); stonith_cleanup(); free(cluster->uuid); free(cluster->uname); free(cluster); pe_free_working_set(fenced_data_set); out->finish(out, CRM_EX_OK, true, NULL); pcmk__output_free(out); pcmk__unregister_formats(); crm_exit(CRM_EX_OK); } diff --git a/doc/sphinx/Pacemaker_Development/components.rst b/doc/sphinx/Pacemaker_Development/components.rst index 760da77c9b..42b8a5eecf 100644 --- a/doc/sphinx/Pacemaker_Development/components.rst +++ b/doc/sphinx/Pacemaker_Development/components.rst @@ -1,383 +1,383 @@ Coding Particular Pacemaker Components -------------------------------------- The Pacemaker code can be intricate and difficult to follow. This chapter has some high-level descriptions of how individual components work. .. index:: single: fencer single: pacemaker-fenced Fencer ###### ``pacemaker-fenced`` is the Pacemaker daemon that handles fencing requests. In the broadest terms, fencing works like this: #. The initiator (an external program such as ``stonith_admin``, or the cluster itself via the controller) asks the local fencer, "Hey, could you please fence this node?" #. The local fencer asks all the fencers in the cluster (including itself), "Hey, what fencing devices do you have access to that can fence this node?" #. Each fencer in the cluster replies with a list of available devices that it knows about. #. Once the original fencer gets all the replies, it asks the most appropriate fencer peer to actually carry out the fencing. It may send out more than one such request if the target node must be fenced with multiple devices. #. The chosen fencer(s) call the appropriate fencing resource agent(s) to do the fencing, then reply to the original fencer with the result. #. The original fencer broadcasts the result to all fencers. #. Each fencer sends the result to each of its local clients (including, at some point, the initiator). A more detailed description follows. .. index:: single: libstonithd Initiating a fencing request ____________________________ A fencing request can be initiated by the cluster or externally, using the libstonithd API. * The cluster always initiates fencing via ``daemons/controld/controld_fencing.c:te_fence_node()`` (which calls the ``fence()`` API method). This occurs when a transition graph synapse contains a ``CRM_OP_FENCE`` XML operation. * The main external clients are ``stonith_admin`` and ``cts-fence-helper``. The ``DLM`` project also uses Pacemaker for fencing. Highlights of the fencing API: * ``stonith_api_new()`` creates and returns a new ``stonith_t`` object, whose ``cmds`` member has methods for connect, disconnect, fence, etc. * the ``fence()`` method creates and sends a ``STONITH_OP_FENCE XML`` request with the desired action and target node. Callers do not have to choose or even have any knowledge about particular fencing devices. Fencing queries _______________ The function calls for a fencing request go something like this: The local fencer receives the client's request via an IPC or messaging layer callback, which calls * ``stonith_command()``, which (for requests) calls * ``handle_request()``, which (for ``STONITH_OP_FENCE`` from a client) calls * ``initiate_remote_stonith_op()``, which creates a ``STONITH_OP_QUERY`` XML request with the target, desired action, timeout, etc. then broadcasts the operation to the cluster group (i.e. all fencer instances) and starts a timer. The query is broadcast because (1) location constraints might prevent the local node from accessing the stonith device directly, and (2) even if the local node does have direct access, another node might be preferred to carry out the fencing. Each fencer receives the original fencer's ``STONITH_OP_QUERY`` broadcast request via IPC or messaging layer callback, which calls: * ``stonith_command()``, which (for requests) calls * ``handle_request()``, which (for ``STONITH_OP_QUERY`` from a peer) calls * ``stonith_query()``, which calls - * ``get_capable_devices()`` with ``stonith_query_capable_device_db()`` to add + * ``get_capable_devices()`` with ``stonith_query_capable_device_cb()`` to add device information to an XML reply and send it. (A message is considered a reply if it contains ``T_STONITH_REPLY``, which is only set by fencer peers, not clients.) The original fencer receives all peers' ``STONITH_OP_QUERY`` replies via IPC or messaging layer callback, which calls: * ``stonith_command()``, which (for replies) calls * ``handle_reply()`` which (for ``STONITH_OP_QUERY``) calls * ``process_remote_stonith_query()``, which allocates a new query result structure, parses device information into it, and adds it to the operation object. It increments the number of replies received for this operation, and compares it against the expected number of replies (i.e. the number of active peers), and if this is the last expected reply, calls * ``request_peer_fencing()``, which calculates the timeout and sends ``STONITH_OP_FENCE`` request(s) to carry out the fencing. If the target node has a fencing "topology" (which allows specifications such as "this node can be fenced either with device A, or devices B and C in combination"), it will choose the device(s), and send out as many requests as needed. If it chooses a device, it will choose the peer; a peer is preferred if it has "verified" access to the desired device, meaning that it has the device "running" on it and thus has a monitor operation ensuring reachability. Fencing operations __________________ Each ``STONITH_OP_FENCE`` request goes something like this: The chosen peer fencer receives the ``STONITH_OP_FENCE`` request via IPC or messaging layer callback, which calls: * ``stonith_command()``, which (for requests) calls * ``handle_request()``, which (for ``STONITH_OP_FENCE`` from a peer) calls * ``stonith_fence()``, which calls * ``schedule_stonith_command()`` (using supplied device if ``F_STONITH_DEVICE`` was set, otherwise the highest-priority capable device obtained via ``get_capable_devices()`` with ``stonith_fence_get_devices_cb()``), which adds the operation to the device's pending operations list and triggers processing. The chosen peer fencer's mainloop is triggered and calls * ``stonith_device_dispatch()``, which calls * ``stonith_device_execute()``, which pops off the next item from the device's pending operations list. If acting as the (internally implemented) watchdog agent, it panics the node, otherwise it calls * ``stonith_action_create()`` and ``stonith_action_execute_async()`` to call the fencing agent. The chosen peer fencer's mainloop is triggered again once the fencing agent returns, and calls * ``stonith_action_async_done()`` which adds the results to an action object then calls its * done callback (``st_child_done()``), which calls ``schedule_stonith_command()`` for a new device if there are further required actions to execute or if the original action failed, then builds and sends an XML reply to the original fencer (via ``send_async_reply()``), then checks whether any pending actions are the same as the one just executed and merges them if so. Fencing replies _______________ The original fencer receives the ``STONITH_OP_FENCE`` reply via IPC or messaging layer callback, which calls: * ``stonith_command()``, which (for replies) calls * ``handle_reply()``, which calls * ``fenced_process_fencing_reply()``, which calls either ``request_peer_fencing()`` (to retry a failed operation, or try the next device in a topology if appropriate, which issues a new ``STONITH_OP_FENCE`` request, proceeding as before) or ``finalize_op()`` (if the operation is definitively failed or successful). * ``finalize_op()`` broadcasts the result to all peers. Finally, all peers receive the broadcast result and call * ``finalize_op()``, which sends the result to all local clients. .. index:: single: fence history Fencing History _______________ The fencer keeps a running history of all fencing operations. The bulk of the relevant code is in `fenced_history.c` and ensures the history is synchronized across all nodes even if a node leaves and rejoins the cluster. In libstonithd, this information is represented by `stonith_history_t` and is queryable by the `stonith_api_operations_t:history()` method. `crm_mon` and `stonith_admin` use this API to display the history. .. index:: single: scheduler single: pacemaker-schedulerd single: libpe_status single: libpe_rules single: libpacemaker Scheduler ######### ``pacemaker-schedulerd`` is the Pacemaker daemon that runs the Pacemaker scheduler for the controller, but "the scheduler" in general refers to related library code in ``libpe_status`` and ``libpe_rules`` (``lib/pengine/*.c``), and some of ``libpacemaker`` (``lib/pacemaker/pcmk_sched_*.c``). The purpose of the scheduler is to take a CIB as input and generate a transition graph (list of actions that need to be taken) as output. The controller invokes the scheduler by contacting the scheduler daemon via local IPC. Tools such as ``crm_simulate``, ``crm_mon``, and ``crm_resource`` can also invoke the scheduler, but do so by calling the library functions directly. This allows them to run using a ``CIB_file`` without the cluster needing to be active. The main entry point for the scheduler code is ``lib/pacemaker/pcmk_sched_messages.c:pcmk__schedule_actions()``. It sets defaults and calls a series of functions for each "stage" of the scheduling. (Some of the functions are named like ``stageN()`` but the code has evolved over time to where the numbers no longer make sense. A project is in progress to reorganize and rename them.) * ``stage0()`` "unpacks" most of the CIB XML into data structures, and determines the current cluster status. It also creates implicit location constraints for the node health feature. * ``stage2()`` applies factors that make resources prefer certain nodes (such as shutdown locks, location constraints, and stickiness). * ``pcmk__create_internal_constraints()`` creates internal constraints (such as the implicit ordering for group members, or start actions being implicitly ordered before promote actions). * ``stage4()`` "checks actions", which means processing resource history entries in the CIB status section. This is used to decide whether certain actions need to be done, such as deleting orphan resources, forcing a restart when a resource definition changes, etc. * ``stage5()`` allocates resources to nodes and creates actions (which might or might not end up in the final graph). * ``stage6()`` creates implicit ordering constraints for resources running across remote connections, and schedules fencing actions and shutdowns. * ``stage7()`` "updates actions", which means applying ordering constraints in order to modify action attributes such as optional or required. * ``pcmk__create_graph()`` creates the transition graph. Challenges __________ Working with the scheduler is difficult. Challenges include: * It is far too much code to keep more than a small portion in your head at one time. * Small changes can have large (and unexpected) effects. This is why we have a large number of regression tests (``cts/cts-scheduler``), which should be run after making code changes. * It produces an insane amount of log messages at debug and trace levels. You can put resource ID(s) in the ``PCMK_trace_tags`` environment variable to enable trace-level messages only when related to specific resources. * Different parts of the main ``pe_working_set_t`` structure are finalized at different points in the scheduling process, so you have to keep in mind whether information you're using at one point of the code can possibly change later. For example, data unpacked from the CIB can safely be used anytime after ``stage0(),`` but actions may become optional or required anytime before ``pcmk__create_graph()``. There's no easy way to deal with this. * Many names of struct members, functions, etc., are suboptimal, but are part of the public API and cannot be changed until an API backward compatibility break. .. index:: single: pe_working_set_t Cluster Working Set ___________________ The main data object for the scheduler is ``pe_working_set_t``, which contains all information needed about nodes, resources, constraints, etc., both as the raw CIB XML and parsed into more usable data structures, plus the resulting transition graph XML. The variable name is usually ``data_set``. .. index:: single: pe_resource_t Resources _________ ``pe_resource_t`` is the data object representing cluster resources. A resource has a variant: primitive (a.k.a. native), group, clone, or bundle. The resource object has members for two sets of methods, ``resource_object_functions_t`` from the ``libpe_status`` public API, and ``resource_alloc_functions_t`` whose implementation is internal to ``libpacemaker``. The actual functions vary by variant. The object functions have basic capabilities such as unpacking the resource XML, and determining the current or planned location of the resource. The allocation functions have more obscure capabilities needed for scheduling, such as processing location and ordering constraints. For example, ``stage3()``, which creates internal constraints, simply calls the ``internal_constraints()`` method for each top-level resource in the working set. .. index:: single: pe_node_t Nodes _____ Allocation of resources to nodes is done by choosing the node with the highest score for a given resource. The scheduler does a bunch of processing to generate the scores, then the actual allocation is straightforward. Node lists are frequently used. For example, ``pe_working_set_t`` has a ``nodes`` member which is a list of all nodes in the cluster, and ``pe_resource_t`` has a ``running_on`` member which is a list of all nodes on which the resource is (or might be) active. These are lists of ``pe_node_t`` objects. The ``pe_node_t`` object contains a ``struct pe_node_shared_s *details`` member with all node information that is independent of resource allocation (the node name, etc.). The working set's ``nodes`` member contains the original of this information. All other node lists contain copies of ``pe_node_t`` where only the ``details`` member points to the originals in the working set's ``nodes`` list. In this way, the other members of ``pe_node_t`` (such as ``weight``, which is the node score) may vary by node list, while the common details are shared. .. index:: single: pe_action_t single: pe_action_flags Actions _______ ``pe_action_t`` is the data object representing actions that might need to be taken. These could be resource actions, cluster-wide actions such as fencing a node, or "pseudo-actions" which are abstractions used as convenient points for ordering other actions against. It has a ``flags`` member which is a bitmask of ``enum pe_action_flags``. The most important of these are ``pe_action_runnable`` (if not set, the action is "blocked" and cannot be added to the transition graph) and ``pe_action_optional`` (actions with this set will not be added to the transition graph; actions often start out as optional, and may become required later). .. index:: single: pe__ordering_t single: pe_ordering Orderings _________ Ordering constraints are simple in concept, but they are one of the most important, powerful, and difficult to follow aspects of the scheduler code. ``pe__ordering_t`` is the data object representing an ordering, better thought of as a relationship between two actions, since the relation can be more complex than just "this one runs after that one". For an ordering "A then B", the code generally refers to A as "first" or "before", and B as "then" or "after". Much of the power comes from ``enum pe_ordering``, which are flags that determine how an ordering behaves. There are many obscure flags with big effects. A few examples: * ``pe_order_none`` means the ordering is disabled and will be ignored. It's 0, meaning no flags set, so it must be compared with equality rather than ``pcmk_is_set()``. * ``pe_order_optional`` means the ordering does not make either action required, so it only applies if they both become required for other reasons. * ``pe_order_implies_first`` means that if action B becomes required for any reason, then action A will become required as well.