Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F4837900
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
161 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/daemons/controld/controld_cib.c b/daemons/controld/controld_cib.c
index 22ac42486f..c9dde0b748 100644
--- a/daemons/controld/controld_cib.c
+++ b/daemons/controld/controld_cib.c
@@ -1,1163 +1,1170 @@
/*
* Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <unistd.h> /* sleep */
#include <crm/common/alerts_internal.h>
#include <crm/common/xml.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/lrmd_internal.h>
#include <pacemaker-controld.h>
// Call ID of the most recent in-progress CIB resource update (or 0 if none)
static int pending_rsc_update = 0;
// Call IDs of requested CIB replacements that won't trigger a new election
// (used as a set of gint values)
static GHashTable *cib_replacements = NULL;
/*!
* \internal
* \brief Store the call ID of a CIB replacement that the controller requested
*
* The \p do_cib_replaced() callback function will avoid triggering a new
* election when we're notified of one of these expected replacements.
*
* \param[in] call_id CIB call ID (or 0 for a synchronous call)
*
* \note This function should be called after making any asynchronous CIB
* request (or before making any synchronous CIB request) that may replace
* part of the nodes or status section. This may include CIB sync calls.
*/
void
controld_record_cib_replace_call(int call_id)
{
CRM_CHECK(call_id >= 0, return);
if (cib_replacements == NULL) {
cib_replacements = g_hash_table_new(NULL, NULL);
}
/* If the call ID is already present in the table, then it's old. We may not
* be removing them properly, and we could improperly ignore replacement
* notifications if cib_t:call_id wraps around.
*/
CRM_LOG_ASSERT(g_hash_table_add(cib_replacements,
GINT_TO_POINTER((gint) call_id)));
}
/*!
* \internal
* \brief Remove the call ID of a CIB replacement from the replacements table
*
* \param[in] call_id CIB call ID (or 0 for a synchronous call)
*
* \return \p true if \p call_id was found in the table, or \p false otherwise
*
* \note CIB notifications run before CIB callbacks. If this function is called
* from within a callback, \p do_cib_replaced() will have removed
* \p call_id from the table first if relevant changes triggered a
* notification.
*/
bool
controld_forget_cib_replace_call(int call_id)
{
CRM_CHECK(call_id >= 0, return false);
if (cib_replacements == NULL) {
return false;
}
return g_hash_table_remove(cib_replacements,
GINT_TO_POINTER((gint) call_id));
}
/*!
* \internal
* \brief Empty the hash table containing call IDs of CIB replacement requests
*/
void
controld_forget_all_cib_replace_calls(void)
{
if (cib_replacements != NULL) {
g_hash_table_remove_all(cib_replacements);
}
}
/*!
* \internal
* \brief Free the hash table containing call IDs of CIB replacement requests
*/
void
controld_destroy_cib_replacements_table(void)
{
if (cib_replacements != NULL) {
g_hash_table_destroy(cib_replacements);
cib_replacements = NULL;
}
}
/*!
* \internal
* \brief Respond to a dropped CIB connection
*
* \param[in] user_data CIB connection that dropped
*/
static void
handle_cib_disconnect(gpointer user_data)
{
CRM_LOG_ASSERT(user_data == controld_globals.cib_conn);
controld_trigger_fsa();
controld_globals.cib_conn->state = cib_disconnected;
if (pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) {
// @TODO This should trigger a reconnect, not a shutdown
crm_crit("Lost connection to the CIB manager, shutting down");
register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
controld_clear_fsa_input_flags(R_CIB_CONNECTED);
} else { // Expected
crm_info("Connection to the CIB manager terminated");
}
}
static void
do_cib_updated(const char *event, xmlNode * msg)
{
if (pcmk__alert_in_patchset(msg, TRUE)) {
controld_trigger_config();
}
}
static void
do_cib_replaced(const char *event, xmlNode * msg)
{
int call_id = 0;
const char *client_id = crm_element_value(msg, F_CIB_CLIENTID);
uint32_t change_section = cib_change_section_nodes
|cib_change_section_status;
long long value = 0;
crm_debug("Updating the CIB after a replace: DC=%s", pcmk__btoa(AM_I_DC));
if (!AM_I_DC) {
return;
}
if ((crm_element_value_int(msg, F_CIB_CALLID, &call_id) == 0)
&& pcmk__str_eq(client_id, controld_globals.cib_client_id,
pcmk__str_none)
&& controld_forget_cib_replace_call(call_id)) {
// We requested this replace op. No need to restart the join.
return;
}
if ((crm_element_value_ll(msg, F_CIB_CHANGE_SECTION, &value) < 0)
|| (value < 0) || (value > UINT32_MAX)) {
crm_trace("Couldn't parse '%s' from message", F_CIB_CHANGE_SECTION);
} else {
change_section = (uint32_t) value;
}
if (pcmk_any_flags_set(change_section, cib_change_section_nodes
|cib_change_section_status)) {
/* start the join process again so we get everyone's LRM status */
populate_cib_nodes(node_update_quick|node_update_all, __func__);
register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
}
}
void
controld_disconnect_cib_manager(void)
{
cib_t *cib_conn = controld_globals.cib_conn;
CRM_ASSERT(cib_conn != NULL);
crm_info("Disconnecting from the CIB manager");
controld_clear_fsa_input_flags(R_CIB_CONNECTED);
cib_conn->cmds->del_notify_callback(cib_conn, T_CIB_REPLACE_NOTIFY,
do_cib_replaced);
cib_conn->cmds->del_notify_callback(cib_conn, T_CIB_DIFF_NOTIFY,
do_cib_updated);
cib_free_callbacks(cib_conn);
if (cib_conn->state != cib_disconnected) {
cib_conn->cmds->set_secondary(cib_conn,
cib_scope_local|cib_discard_reply);
cib_conn->cmds->signoff(cib_conn);
}
crm_notice("Disconnected from the CIB manager");
}
/* A_CIB_STOP, A_CIB_START, O_CIB_RESTART */
void
do_cib_control(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
static int cib_retries = 0;
cib_t *cib_conn = controld_globals.cib_conn;
void (*dnotify_fn) (gpointer user_data) = handle_cib_disconnect;
void (*replace_cb) (const char *event, xmlNodePtr msg) = do_cib_replaced;
void (*update_cb) (const char *event, xmlNodePtr msg) = do_cib_updated;
int rc = pcmk_ok;
CRM_ASSERT(cib_conn != NULL);
if (pcmk_is_set(action, A_CIB_STOP)) {
if ((cib_conn->state != cib_disconnected)
&& (pending_rsc_update != 0)) {
crm_info("Waiting for resource update %d to complete",
pending_rsc_update);
crmd_fsa_stall(FALSE);
return;
}
controld_disconnect_cib_manager();
}
if (!pcmk_is_set(action, A_CIB_START)) {
return;
}
if (cur_state == S_STOPPING) {
crm_err("Ignoring request to connect to the CIB manager after "
"shutdown");
return;
}
rc = cib_conn->cmds->signon(cib_conn, CRM_SYSTEM_CRMD,
cib_command_nonblocking);
if (rc != pcmk_ok) {
// A short wait that usually avoids stalling the FSA
sleep(1);
rc = cib_conn->cmds->signon(cib_conn, CRM_SYSTEM_CRMD,
cib_command_nonblocking);
}
if (rc != pcmk_ok) {
crm_info("Could not connect to the CIB manager: %s", pcmk_strerror(rc));
} else if (cib_conn->cmds->set_connection_dnotify(cib_conn,
dnotify_fn) != pcmk_ok) {
crm_err("Could not set dnotify callback");
} else if (cib_conn->cmds->add_notify_callback(cib_conn,
T_CIB_REPLACE_NOTIFY,
replace_cb) != pcmk_ok) {
crm_err("Could not set CIB notification callback (replace)");
} else if (cib_conn->cmds->add_notify_callback(cib_conn,
T_CIB_DIFF_NOTIFY,
update_cb) != pcmk_ok) {
crm_err("Could not set CIB notification callback (update)");
} else {
controld_set_fsa_input_flags(R_CIB_CONNECTED);
cib_retries = 0;
cib_conn->cmds->client_id(cib_conn, &controld_globals.cib_client_id,
NULL);
}
if (!pcmk_is_set(controld_globals.fsa_input_register, R_CIB_CONNECTED)) {
cib_retries++;
if (cib_retries < 30) {
crm_warn("Couldn't complete CIB registration %d times... "
"pause and retry", cib_retries);
controld_start_wait_timer();
crmd_fsa_stall(FALSE);
} else {
crm_err("Could not complete CIB registration %d times... "
"hard error", cib_retries);
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
}
}
#define MIN_CIB_OP_TIMEOUT (30)
/*!
* \internal
* \brief Get the timeout (in seconds) that should be used with CIB operations
*
* \return The maximum of 30 seconds, the value of the PCMK_cib_timeout
* environment variable, or 10 seconds times one more than the number of
* nodes in the cluster.
*/
unsigned int
cib_op_timeout(void)
{
static int env_timeout = -1;
unsigned int calculated_timeout = 0;
if (env_timeout == -1) {
const char *env = getenv("PCMK_cib_timeout");
pcmk__scan_min_int(env, &env_timeout, MIN_CIB_OP_TIMEOUT);
crm_trace("Minimum CIB op timeout: %ds (environment: %s)",
env_timeout, (env? env : "none"));
}
calculated_timeout = 1 + crm_active_peers();
if (crm_remote_peer_cache) {
calculated_timeout += g_hash_table_size(crm_remote_peer_cache);
}
calculated_timeout *= 10;
calculated_timeout = QB_MAX(calculated_timeout, env_timeout);
crm_trace("Calculated timeout: %us", calculated_timeout);
if (controld_globals.cib_conn) {
controld_globals.cib_conn->call_timeout = calculated_timeout;
}
return calculated_timeout;
}
/*!
* \internal
* \brief Get CIB call options to use local scope if primary is unavailable
*
* \return CIB call options
*/
int
crmd_cib_smart_opt(void)
{
int call_opt = cib_none;
if ((controld_globals.fsa_state == S_ELECTION)
|| (controld_globals.fsa_state == S_PENDING)) {
crm_info("Sending update to local CIB in state: %s",
fsa_state2string(controld_globals.fsa_state));
cib__set_call_options(call_opt, "update", cib_scope_local);
}
return call_opt;
}
static void
cib_delete_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
void *user_data)
{
char *desc = user_data;
if (rc == 0) {
crm_debug("Deletion of %s (via CIB call %d) succeeded", desc, call_id);
} else {
crm_warn("Deletion of %s (via CIB call %d) failed: %s " CRM_XS " rc=%d",
desc, call_id, pcmk_strerror(rc), rc);
}
}
// Searches for various portions of node_state to delete
// Match a particular node's node_state (takes node name 1x)
#define XPATH_NODE_STATE "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']"
// Node's lrm section (name 1x)
#define XPATH_NODE_LRM XPATH_NODE_STATE "/" XML_CIB_TAG_LRM
/* Node's lrm_rsc_op entries and lrm_resource entries without unexpired lock
* (name 2x, (seconds_since_epoch - XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT) 1x)
*/
#define XPATH_NODE_LRM_UNLOCKED XPATH_NODE_STATE "//" XML_LRM_TAG_RSC_OP \
"|" XPATH_NODE_STATE \
"//" XML_LRM_TAG_RESOURCE \
"[not(@" XML_CONFIG_ATTR_SHUTDOWN_LOCK ") " \
"or " XML_CONFIG_ATTR_SHUTDOWN_LOCK "<%lld]"
// Node's transient_attributes section (name 1x)
#define XPATH_NODE_ATTRS XPATH_NODE_STATE "/" XML_TAG_TRANSIENT_NODEATTRS
// Everything under node_state (name 1x)
#define XPATH_NODE_ALL XPATH_NODE_STATE "/*"
/* Unlocked history + transient attributes
* (name 2x, (seconds_since_epoch - XML_CONFIG_ATTR_SHUTDOWN_LOCK_LIMIT) 1x,
* name 1x)
*/
#define XPATH_NODE_ALL_UNLOCKED XPATH_NODE_LRM_UNLOCKED "|" XPATH_NODE_ATTRS
/*!
* \internal
* \brief Get the XPath and description of a node state section to be deleted
*
* \param[in] uname Desired node
* \param[in] section Subsection of node_state to be deleted
* \param[out] xpath Where to store XPath of \p section
* \param[out] desc If not \c NULL, where to store description of \p section
*/
void
controld_node_state_deletion_strings(const char *uname,
enum controld_section_e section,
char **xpath, char **desc)
{
const char *desc_pre = NULL;
// Shutdown locks that started before this time are expired
long long expire = (long long) time(NULL)
- controld_globals.shutdown_lock_limit;
switch (section) {
case controld_section_lrm:
*xpath = crm_strdup_printf(XPATH_NODE_LRM, uname);
desc_pre = "resource history";
break;
case controld_section_lrm_unlocked:
*xpath = crm_strdup_printf(XPATH_NODE_LRM_UNLOCKED,
uname, uname, expire);
desc_pre = "resource history (other than shutdown locks)";
break;
case controld_section_attrs:
*xpath = crm_strdup_printf(XPATH_NODE_ATTRS, uname);
desc_pre = "transient attributes";
break;
case controld_section_all:
*xpath = crm_strdup_printf(XPATH_NODE_ALL, uname);
desc_pre = "all state";
break;
case controld_section_all_unlocked:
*xpath = crm_strdup_printf(XPATH_NODE_ALL_UNLOCKED,
uname, uname, expire, uname);
desc_pre = "all state (other than shutdown locks)";
break;
default:
// We called this function incorrectly
CRM_ASSERT(false);
break;
}
if (desc != NULL) {
*desc = crm_strdup_printf("%s for node %s", desc_pre, uname);
}
}
/*!
* \internal
* \brief Delete subsection of a node's CIB node_state
*
* \param[in] uname Desired node
* \param[in] section Subsection of node_state to delete
* \param[in] options CIB call options to use
*/
void
controld_delete_node_state(const char *uname, enum controld_section_e section,
int options)
{
cib_t *cib = controld_globals.cib_conn;
char *xpath = NULL;
char *desc = NULL;
int cib_rc = pcmk_ok;
CRM_ASSERT((uname != NULL) && (cib != NULL));
controld_node_state_deletion_strings(uname, section, &xpath, &desc);
cib__set_call_options(options, "node state deletion",
cib_xpath|cib_multiple);
cib_rc = cib->cmds->remove(cib, xpath, NULL, options);
fsa_register_cib_callback(cib_rc, desc, cib_delete_callback);
crm_info("Deleting %s (via CIB call %d) " CRM_XS " xpath=%s",
desc, cib_rc, xpath);
// CIB library handles freeing desc
free(xpath);
}
// Takes node name and resource ID
#define XPATH_RESOURCE_HISTORY "//" XML_CIB_TAG_STATE \
"[@" XML_ATTR_UNAME "='%s']/" \
XML_CIB_TAG_LRM "/" XML_LRM_TAG_RESOURCES \
"/" XML_LRM_TAG_RESOURCE \
"[@" XML_ATTR_ID "='%s']"
// @TODO could add "and @XML_CONFIG_ATTR_SHUTDOWN_LOCK" to limit to locks
/*!
* \internal
* \brief Clear resource history from CIB for a given resource and node
*
* \param[in] rsc_id ID of resource to be cleared
* \param[in] node Node whose resource history should be cleared
* \param[in] user_name ACL user name to use
* \param[in] call_options CIB call options
*
* \return Standard Pacemaker return code
*/
int
controld_delete_resource_history(const char *rsc_id, const char *node,
const char *user_name, int call_options)
{
char *desc = NULL;
char *xpath = NULL;
int rc = pcmk_rc_ok;
CRM_CHECK((rsc_id != NULL) && (node != NULL), return EINVAL);
desc = crm_strdup_printf("resource history for %s on %s", rsc_id, node);
if (controld_globals.cib_conn == NULL) {
crm_err("Unable to clear %s: no CIB connection", desc);
free(desc);
return ENOTCONN;
}
// Ask CIB to delete the entry
xpath = crm_strdup_printf(XPATH_RESOURCE_HISTORY, node, rsc_id);
rc = cib_internal_op(controld_globals.cib_conn, PCMK__CIB_REQUEST_DELETE,
NULL, xpath, NULL, NULL, call_options|cib_xpath,
user_name);
if (rc < 0) {
rc = pcmk_legacy2rc(rc);
crm_err("Could not delete resource status of %s on %s%s%s: %s "
CRM_XS " rc=%d", rsc_id, node,
(user_name? " for user " : ""), (user_name? user_name : ""),
pcmk_rc_str(rc), rc);
free(desc);
free(xpath);
return rc;
}
if (pcmk_is_set(call_options, cib_sync_call)) {
if (pcmk_is_set(call_options, cib_dryrun)) {
crm_debug("Deletion of %s would succeed", desc);
} else {
crm_debug("Deletion of %s succeeded", desc);
}
free(desc);
} else {
crm_info("Clearing %s (via CIB call %d) " CRM_XS " xpath=%s",
desc, rc, xpath);
fsa_register_cib_callback(rc, desc, cib_delete_callback);
// CIB library handles freeing desc
}
free(xpath);
return pcmk_rc_ok;
}
/*!
* \internal
* \brief Build XML and string of parameters meeting some criteria, for digest
*
* \param[in] op Executor event with parameter table to use
* \param[in] metadata Parsed meta-data for executed resource agent
* \param[in] param_type Flag used for selection criteria
* \param[out] result Will be set to newly created XML with selected
* parameters as attributes
*
* \return Newly allocated space-separated string of parameter names
* \note Selection criteria varies by param_type: for the restart digest, we
* want parameters that are *not* marked reloadable (OCF 1.1) or that
* *are* marked unique (pre-1.1), for both string and XML results; for the
* secure digest, we want parameters that *are* marked private for the
* string, but parameters that are *not* marked private for the XML.
* \note It is the caller's responsibility to free the string return value with
* \p g_string_free() and the XML result with \p free_xml().
*/
static GString *
build_parameter_list(const lrmd_event_data_t *op,
const struct ra_metadata_s *metadata,
enum ra_param_flags_e param_type, xmlNode **result)
{
GString *list = NULL;
*result = create_xml_node(NULL, XML_TAG_PARAMS);
/* Consider all parameters only except private ones to be consistent with
* what scheduler does with calculate_secure_digest().
*/
if (param_type == ra_param_private
&& compare_version(controld_globals.dc_version, "3.16.0") >= 0) {
g_hash_table_foreach(op->params, hash2field, *result);
pcmk__filter_op_for_digest(*result);
}
for (GList *iter = metadata->ra_params; iter != NULL; iter = iter->next) {
struct ra_param_s *param = (struct ra_param_s *) iter->data;
bool accept_for_list = false;
bool accept_for_xml = false;
switch (param_type) {
case ra_param_reloadable:
accept_for_list = !pcmk_is_set(param->rap_flags, param_type);
accept_for_xml = accept_for_list;
break;
case ra_param_unique:
accept_for_list = pcmk_is_set(param->rap_flags, param_type);
accept_for_xml = accept_for_list;
break;
case ra_param_private:
accept_for_list = pcmk_is_set(param->rap_flags, param_type);
accept_for_xml = !accept_for_list;
break;
}
if (accept_for_list) {
crm_trace("Attr %s is %s", param->rap_name, ra_param_flag2text(param_type));
if (list == NULL) {
// We will later search for " WORD ", so start list with a space
pcmk__add_word(&list, 256, " ");
}
pcmk__add_word(&list, 0, param->rap_name);
} else {
crm_trace("Rejecting %s for %s", param->rap_name, ra_param_flag2text(param_type));
}
if (accept_for_xml) {
const char *v = g_hash_table_lookup(op->params, param->rap_name);
if (v != NULL) {
crm_trace("Adding attr %s=%s to the xml result", param->rap_name, v);
crm_xml_add(*result, param->rap_name, v);
}
} else {
crm_trace("Removing attr %s from the xml result", param->rap_name);
xml_remove_prop(*result, param->rap_name);
}
}
if (list != NULL) {
// We will later search for " WORD ", so end list with a space
pcmk__add_word(&list, 0, " ");
}
return list;
}
static void
append_restart_list(lrmd_event_data_t *op, struct ra_metadata_s *metadata,
xmlNode *update, const char *version)
{
GString *list = NULL;
char *digest = NULL;
xmlNode *restart = NULL;
CRM_LOG_ASSERT(op->params != NULL);
if (op->interval_ms > 0) {
/* monitors are not reloadable */
return;
}
if (pcmk_is_set(metadata->ra_flags, ra_supports_reload_agent)) {
// Add parameters not marked reloadable to the "op-force-restart" list
list = build_parameter_list(op, metadata, ra_param_reloadable,
&restart);
} else if (pcmk_is_set(metadata->ra_flags, ra_supports_legacy_reload)) {
/* @COMPAT pre-OCF-1.1 resource agents
*
* Before OCF 1.1, Pacemaker abused "unique=0" to indicate
* reloadability. Add any parameters with unique="1" to the
* "op-force-restart" list.
*/
list = build_parameter_list(op, metadata, ra_param_unique, &restart);
} else {
// Resource does not support agent reloads
return;
}
digest = calculate_operation_digest(restart, version);
/* Add "op-force-restart" and "op-restart-digest" to indicate the resource supports reload,
* no matter if it actually supports any parameters with unique="1"). */
crm_xml_add(update, XML_LRM_ATTR_OP_RESTART,
(list == NULL)? "" : (const char *) list->str);
crm_xml_add(update, XML_LRM_ATTR_RESTART_DIGEST, digest);
if ((list != NULL) && (list->len > 0)) {
crm_trace("%s: %s, %s", op->rsc_id, digest, (const char *) list->str);
} else {
crm_trace("%s: %s", op->rsc_id, digest);
}
if (list != NULL) {
g_string_free(list, TRUE);
}
free_xml(restart);
free(digest);
}
static void
append_secure_list(lrmd_event_data_t *op, struct ra_metadata_s *metadata,
xmlNode *update, const char *version)
{
GString *list = NULL;
char *digest = NULL;
xmlNode *secure = NULL;
CRM_LOG_ASSERT(op->params != NULL);
/*
* To keep XML_LRM_ATTR_OP_SECURE short, we want it to contain the
* secure parameters but XML_LRM_ATTR_SECURE_DIGEST to be based on
* the insecure ones
*/
list = build_parameter_list(op, metadata, ra_param_private, &secure);
if (list != NULL) {
digest = calculate_operation_digest(secure, version);
crm_xml_add(update, XML_LRM_ATTR_OP_SECURE, (const char *) list->str);
crm_xml_add(update, XML_LRM_ATTR_SECURE_DIGEST, digest);
crm_trace("%s: %s, %s", op->rsc_id, digest, (const char *) list->str);
g_string_free(list, TRUE);
} else {
crm_trace("%s: no secure parameters", op->rsc_id);
}
free_xml(secure);
free(digest);
}
/*!
* \internal
* \brief Create XML for a resource history entry
*
* \param[in] func Function name of caller
* \param[in,out] parent XML to add entry to
* \param[in] rsc Affected resource
* \param[in,out] op Action to add an entry for (or NULL to do nothing)
* \param[in] node_name Node where action occurred
*/
void
controld_add_resource_history_xml_as(const char *func, xmlNode *parent,
const lrmd_rsc_info_t *rsc,
lrmd_event_data_t *op,
const char *node_name)
{
int target_rc = 0;
xmlNode *xml_op = NULL;
struct ra_metadata_s *metadata = NULL;
const char *caller_version = NULL;
lrm_state_t *lrm_state = NULL;
if (op == NULL) {
return;
}
target_rc = rsc_op_expected_rc(op);
caller_version = g_hash_table_lookup(op->params, XML_ATTR_CRM_VERSION);
CRM_CHECK(caller_version != NULL, caller_version = CRM_FEATURE_SET);
xml_op = pcmk__create_history_xml(parent, op, caller_version, target_rc,
controld_globals.our_nodename, func);
if (xml_op == NULL) {
return;
}
if ((rsc == NULL) || (op->params == NULL)
|| !crm_op_needs_metadata(rsc->standard, op->op_type)) {
crm_trace("No digests needed for %s action on %s (params=%p rsc=%p)",
op->op_type, op->rsc_id, op->params, rsc);
return;
}
lrm_state = lrm_state_find(node_name);
if (lrm_state == NULL) {
crm_warn("Cannot calculate digests for operation " PCMK__OP_FMT
" because we have no connection to executor for %s",
op->rsc_id, op->op_type, op->interval_ms, node_name);
return;
}
/* Ideally the metadata is cached, and the agent is just a fallback.
*
* @TODO Go through all callers and ensure they get metadata asynchronously
* first.
*/
metadata = controld_get_rsc_metadata(lrm_state, rsc,
controld_metadata_from_agent
|controld_metadata_from_cache);
if (metadata == NULL) {
return;
}
crm_trace("Including additional digests for %s:%s:%s",
rsc->standard, rsc->provider, rsc->type);
append_restart_list(op, metadata, xml_op, caller_version);
append_secure_list(op, metadata, xml_op, caller_version);
return;
}
/*!
* \internal
* \brief Record an action as pending in the CIB, if appropriate
*
* \param[in] node_name Node where the action is pending
* \param[in] rsc Resource that action is for
* \param[in,out] op Pending action
*
* \return true if action was recorded in CIB, otherwise false
*/
bool
controld_record_pending_op(const char *node_name, const lrmd_rsc_info_t *rsc,
lrmd_event_data_t *op)
{
const char *record_pending = NULL;
CRM_CHECK((node_name != NULL) && (rsc != NULL) && (op != NULL),
return false);
// Never record certain operation types as pending
if ((op->op_type == NULL) || (op->params == NULL)
|| !controld_action_is_recordable(op->op_type)) {
return false;
}
// Check action's record-pending meta-attribute (defaults to true)
record_pending = crm_meta_value(op->params, XML_OP_ATTR_PENDING);
if ((record_pending != NULL) && !crm_is_true(record_pending)) {
return false;
}
op->call_id = -1;
op->t_run = time(NULL);
op->t_rcchange = op->t_run;
lrmd__set_result(op, PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL);
crm_debug("Recording pending %s-interval %s for %s on %s in the CIB",
pcmk__readable_interval(op->interval_ms), op->op_type, op->rsc_id,
node_name);
controld_update_resource_history(node_name, rsc, op, 0);
return true;
}
static void
cib_rsc_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
switch (rc) {
case pcmk_ok:
case -pcmk_err_diff_failed:
case -pcmk_err_diff_resync:
- crm_trace("Resource update %d complete: rc=%d", call_id, rc);
+ crm_trace("Resource history update completed (call=%d rc=%d)",
+ call_id, rc);
break;
default:
- crm_warn("Resource update %d failed: (rc=%d) %s", call_id, rc, pcmk_strerror(rc));
+ if (call_id > 0) {
+ crm_warn("Resource history update %d failed: %s "
+ CRM_XS " rc=%d", call_id, pcmk_strerror(rc), rc);
+ } else {
+ crm_warn("Resource history update failed: %s " CRM_XS " rc=%d",
+ pcmk_strerror(rc), rc);
+ }
}
if (call_id == pending_rsc_update) {
pending_rsc_update = 0;
controld_trigger_fsa();
}
}
/* Only successful stops, and probes that found the resource inactive, get locks
* recorded in the history. This ensures the resource stays locked to the node
* until it is active there again after the node comes back up.
*/
static bool
should_preserve_lock(lrmd_event_data_t *op)
{
if (!pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
return false;
}
if (!strcmp(op->op_type, PCMK_ACTION_STOP) && (op->rc == PCMK_OCF_OK)) {
return true;
}
if (!strcmp(op->op_type, PCMK_ACTION_MONITOR)
&& (op->rc == PCMK_OCF_NOT_RUNNING)) {
return true;
}
return false;
}
/*!
* \internal
* \brief Request a CIB update
*
* \param[in] section Section of CIB to update
* \param[in] data New XML of CIB section to update
* \param[in] options CIB call options
* \param[in] callback If not \c NULL, set this as the operation callback
* \param[in,out] user_data Data to pass to \p callback (must be freeable using
* \c free())
*
* \return Standard Pacemaker return code
*
* \note If \p callback is \p cib_rsc_callback(), the CIB update's call ID is
* stored in \p pending_rsc_update on success.
*/
int
controld_update_cib(const char *section, xmlNode *data, int options,
void (*callback)(xmlNode *, int, int, xmlNode *, void *),
void *user_data)
{
int cib_rc = -ENOTCONN;
CRM_ASSERT((data != NULL) && ((callback != NULL) || (user_data == NULL)));
if (controld_globals.cib_conn != NULL) {
cib_rc = cib_internal_op(controld_globals.cib_conn,
PCMK__CIB_REQUEST_MODIFY, NULL, section,
data, NULL, options, NULL);
if (cib_rc >= 0) {
crm_debug("Submitted CIB update %d for %s section",
cib_rc, section);
}
}
if (callback == NULL) {
if (cib_rc < 0) {
crm_err("Failed to update CIB %s section: %s",
section, pcmk_rc_str(pcmk_legacy2rc(cib_rc)));
}
} else {
if ((cib_rc >= 0) && (callback == cib_rsc_callback)) {
/* Checking for a particular callback is a little hacky, but it
* didn't seem worth adding an output argument for cib_rc for just
* one use case.
*/
pending_rsc_update = cib_rc;
}
fsa_register_cib_callback(cib_rc, user_data, callback);
}
return (cib_rc >= 0)? pcmk_rc_ok : pcmk_legacy2rc(cib_rc);
}
/*!
* \internal
* \brief Update resource history entry in CIB
*
* \param[in] node_name Node where action occurred
* \param[in] rsc Resource that action is for
* \param[in,out] op Action to record
* \param[in] lock_time If nonzero, when resource was locked to node
*
* \note On success, the CIB update's call ID will be stored in
* pending_rsc_update.
*/
void
controld_update_resource_history(const char *node_name,
const lrmd_rsc_info_t *rsc,
lrmd_event_data_t *op, time_t lock_time)
{
xmlNode *update = NULL;
xmlNode *xml = NULL;
int call_opt = crmd_cib_smart_opt();
const char *node_id = NULL;
const char *container = NULL;
CRM_CHECK((node_name != NULL) && (op != NULL), return);
if (rsc == NULL) {
crm_warn("Resource %s no longer exists in the executor", op->rsc_id);
controld_ack_event_directly(NULL, NULL, rsc, op, op->rsc_id);
return;
}
// <status>
update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
// <node_state ...>
xml = create_xml_node(update, XML_CIB_TAG_STATE);
if (pcmk__str_eq(node_name, controld_globals.our_nodename,
pcmk__str_casei)) {
node_id = controld_globals.our_uuid;
} else {
node_id = node_name;
pcmk__xe_set_bool_attr(xml, XML_NODE_IS_REMOTE, true);
}
crm_xml_add(xml, XML_ATTR_ID, node_id);
crm_xml_add(xml, XML_ATTR_UNAME, node_name);
crm_xml_add(xml, XML_ATTR_ORIGIN, __func__);
// <lrm ...>
xml = create_xml_node(xml, XML_CIB_TAG_LRM);
crm_xml_add(xml, XML_ATTR_ID, node_id);
// <lrm_resources>
xml = create_xml_node(xml, XML_LRM_TAG_RESOURCES);
// <lrm_resource ...>
xml = create_xml_node(xml, XML_LRM_TAG_RESOURCE);
crm_xml_add(xml, XML_ATTR_ID, op->rsc_id);
crm_xml_add(xml, XML_AGENT_ATTR_CLASS, rsc->standard);
crm_xml_add(xml, XML_AGENT_ATTR_PROVIDER, rsc->provider);
crm_xml_add(xml, XML_ATTR_TYPE, rsc->type);
if (lock_time != 0) {
/* Actions on a locked resource should either preserve the lock by
* recording it with the action result, or clear it.
*/
if (!should_preserve_lock(op)) {
lock_time = 0;
}
crm_xml_add_ll(xml, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
(long long) lock_time);
}
if (op->params != NULL) {
container = g_hash_table_lookup(op->params,
CRM_META "_" XML_RSC_ATTR_CONTAINER);
if (container != NULL) {
crm_trace("Resource %s is a part of container resource %s",
op->rsc_id, container);
crm_xml_add(xml, XML_RSC_ATTR_CONTAINER, container);
}
}
// <lrm_resource_op ...> (possibly more than one)
controld_add_resource_history_xml(xml, rsc, op, node_name);
/* Update CIB asynchronously. Even if it fails, the resource state should be
* discovered during the next election. Worst case, the node is wrongly
* fenced for running a resource it isn't.
*/
crm_log_xml_trace(update, __func__);
controld_update_cib(XML_CIB_TAG_STATUS, update, call_opt, cib_rsc_callback,
NULL);
free_xml(update);
}
/*!
* \internal
* \brief Erase an LRM history entry from the CIB, given the operation data
*
* \param[in] op Operation whose history should be deleted
*/
void
controld_delete_action_history(const lrmd_event_data_t *op)
{
xmlNode *xml_top = NULL;
CRM_CHECK(op != NULL, return);
xml_top = create_xml_node(NULL, XML_LRM_TAG_RSC_OP);
crm_xml_add_int(xml_top, XML_LRM_ATTR_CALLID, op->call_id);
crm_xml_add(xml_top, XML_ATTR_TRANSITION_KEY, op->user_data);
if (op->interval_ms > 0) {
char *op_id = pcmk__op_key(op->rsc_id, op->op_type, op->interval_ms);
/* Avoid deleting last_failure too (if it was a result of this recurring op failing) */
crm_xml_add(xml_top, XML_ATTR_ID, op_id);
free(op_id);
}
crm_debug("Erasing resource operation history for " PCMK__OP_FMT " (call=%d)",
op->rsc_id, op->op_type, op->interval_ms, op->call_id);
controld_globals.cib_conn->cmds->remove(controld_globals.cib_conn,
XML_CIB_TAG_STATUS, xml_top,
cib_none);
crm_log_xml_trace(xml_top, "op:cancel");
free_xml(xml_top);
}
/* Define xpath to find LRM resource history entry by node and resource */
#define XPATH_HISTORY \
"/" XML_TAG_CIB "/" XML_CIB_TAG_STATUS \
"/" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
"/" XML_CIB_TAG_LRM "/" XML_LRM_TAG_RESOURCES \
"/" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \
"/" XML_LRM_TAG_RSC_OP
/* ... and also by operation key */
#define XPATH_HISTORY_ID XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s']"
/* ... and also by operation key and operation call ID */
#define XPATH_HISTORY_CALL XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s' and @" XML_LRM_ATTR_CALLID "='%d']"
/* ... and also by operation key and original operation key */
#define XPATH_HISTORY_ORIG XPATH_HISTORY \
"[@" XML_ATTR_ID "='%s' and @" XML_LRM_ATTR_TASK_KEY "='%s']"
/*!
* \internal
* \brief Delete a last_failure resource history entry from the CIB
*
* \param[in] rsc_id Name of resource to clear history for
* \param[in] node Name of node to clear history for
* \param[in] action If specified, delete only if this was failed action
* \param[in] interval_ms If \p action is specified, it has this interval
*/
void
controld_cib_delete_last_failure(const char *rsc_id, const char *node,
const char *action, guint interval_ms)
{
char *xpath = NULL;
char *last_failure_key = NULL;
CRM_CHECK((rsc_id != NULL) && (node != NULL), return);
// Generate XPath to match desired entry
last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
if (action == NULL) {
xpath = crm_strdup_printf(XPATH_HISTORY_ID, node, rsc_id,
last_failure_key);
} else {
char *action_key = pcmk__op_key(rsc_id, action, interval_ms);
xpath = crm_strdup_printf(XPATH_HISTORY_ORIG, node, rsc_id,
last_failure_key, action_key);
free(action_key);
}
free(last_failure_key);
controld_globals.cib_conn->cmds->remove(controld_globals.cib_conn, xpath,
NULL, cib_xpath);
free(xpath);
}
/*!
* \internal
* \brief Delete resource history entry from the CIB, given operation key
*
* \param[in] rsc_id Name of resource to clear history for
* \param[in] node Name of node to clear history for
* \param[in] key Operation key of operation to clear history for
* \param[in] call_id If specified, delete entry only if it has this call ID
*/
void
controld_delete_action_history_by_key(const char *rsc_id, const char *node,
const char *key, int call_id)
{
char *xpath = NULL;
CRM_CHECK((rsc_id != NULL) && (node != NULL) && (key != NULL), return);
if (call_id > 0) {
xpath = crm_strdup_printf(XPATH_HISTORY_CALL, node, rsc_id, key,
call_id);
} else {
xpath = crm_strdup_printf(XPATH_HISTORY_ID, node, rsc_id, key);
}
controld_globals.cib_conn->cmds->remove(controld_globals.cib_conn, xpath,
NULL, cib_xpath);
free(xpath);
}
diff --git a/daemons/controld/controld_execd.c b/daemons/controld/controld_execd.c
index 530e4346c8..a90e8d833e 100644
--- a/daemons/controld/controld_execd.c
+++ b/daemons/controld/controld_execd.c
@@ -1,2445 +1,2447 @@
/*
* Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <regex.h>
#include <sys/param.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <crm/crm.h>
#include <crm/lrmd.h> // lrmd_event_data_t, lrmd_rsc_info_t, etc.
#include <crm/services.h>
#include <crm/msg_xml.h>
#include <crm/common/xml.h>
#include <crm/pengine/rules.h>
#include <crm/lrmd_internal.h>
#include <pacemaker-internal.h>
#include <pacemaker-controld.h>
#define START_DELAY_THRESHOLD 5 * 60 * 1000
#define MAX_LRM_REG_FAILS 30
struct delete_event_s {
int rc;
const char *rsc;
lrm_state_t *lrm_state;
};
static gboolean is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id);
static gboolean build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list);
static gboolean stop_recurring_actions(gpointer key, gpointer value, gpointer user_data);
static lrmd_event_data_t *construct_op(const lrm_state_t *lrm_state,
const xmlNode *rsc_op,
const char *rsc_id,
const char *operation);
static void do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc,
xmlNode *msg, struct ra_metadata_s *md);
static gboolean lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state,
int log_level);
static void
lrm_connection_destroy(void)
{
if (pcmk_is_set(controld_globals.fsa_input_register, R_LRM_CONNECTED)) {
crm_crit("Connection to executor failed");
register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
controld_clear_fsa_input_flags(R_LRM_CONNECTED);
} else {
crm_info("Disconnected from executor");
}
}
static char *
make_stop_id(const char *rsc, int call_id)
{
return crm_strdup_printf("%s:%d", rsc, call_id);
}
static void
copy_instance_keys(gpointer key, gpointer value, gpointer user_data)
{
if (strstr(key, CRM_META "_") == NULL) {
g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value));
}
}
static void
copy_meta_keys(gpointer key, gpointer value, gpointer user_data)
{
if (strstr(key, CRM_META "_") != NULL) {
g_hash_table_replace(user_data, strdup((const char *)key), strdup((const char *)value));
}
}
/*!
* \internal
* \brief Remove a recurring operation from a resource's history
*
* \param[in,out] history Resource history to modify
* \param[in] op Operation to remove
*
* \return TRUE if the operation was found and removed, FALSE otherwise
*/
static gboolean
history_remove_recurring_op(rsc_history_t *history, const lrmd_event_data_t *op)
{
GList *iter;
for (iter = history->recurring_op_list; iter != NULL; iter = iter->next) {
lrmd_event_data_t *existing = iter->data;
if ((op->interval_ms == existing->interval_ms)
&& pcmk__str_eq(op->rsc_id, existing->rsc_id, pcmk__str_none)
&& pcmk__str_eq(op->op_type, existing->op_type, pcmk__str_casei)) {
history->recurring_op_list = g_list_delete_link(history->recurring_op_list, iter);
lrmd_free_event(existing);
return TRUE;
}
}
return FALSE;
}
/*!
* \internal
* \brief Free all recurring operations in resource history
*
* \param[in,out] history Resource history to modify
*/
static void
history_free_recurring_ops(rsc_history_t *history)
{
GList *iter;
for (iter = history->recurring_op_list; iter != NULL; iter = iter->next) {
lrmd_free_event(iter->data);
}
g_list_free(history->recurring_op_list);
history->recurring_op_list = NULL;
}
/*!
* \internal
* \brief Free resource history
*
* \param[in,out] history Resource history to free
*/
void
history_free(gpointer data)
{
rsc_history_t *history = (rsc_history_t*)data;
if (history->stop_params) {
g_hash_table_destroy(history->stop_params);
}
/* Don't need to free history->rsc.id because it's set to history->id */
free(history->rsc.type);
free(history->rsc.standard);
free(history->rsc.provider);
lrmd_free_event(history->failed);
lrmd_free_event(history->last);
free(history->id);
history_free_recurring_ops(history);
free(history);
}
static void
update_history_cache(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, lrmd_event_data_t * op)
{
int target_rc = 0;
rsc_history_t *entry = NULL;
if (op->rsc_deleted) {
crm_debug("Purged history for '%s' after %s", op->rsc_id, op->op_type);
controld_delete_resource_history(op->rsc_id, lrm_state->node_name,
NULL, crmd_cib_smart_opt());
return;
}
if (pcmk__str_eq(op->op_type, PCMK_ACTION_NOTIFY, pcmk__str_casei)) {
return;
}
crm_debug("Updating history for '%s' with %s op", op->rsc_id, op->op_type);
entry = g_hash_table_lookup(lrm_state->resource_history, op->rsc_id);
if (entry == NULL && rsc) {
entry = calloc(1, sizeof(rsc_history_t));
entry->id = strdup(op->rsc_id);
g_hash_table_insert(lrm_state->resource_history, entry->id, entry);
entry->rsc.id = entry->id;
entry->rsc.type = strdup(rsc->type);
entry->rsc.standard = strdup(rsc->standard);
pcmk__str_update(&entry->rsc.provider, rsc->provider);
} else if (entry == NULL) {
crm_info("Resource %s no longer exists, not updating cache", op->rsc_id);
return;
}
entry->last_callid = op->call_id;
target_rc = rsc_op_expected_rc(op);
if (op->op_status == PCMK_EXEC_CANCELLED) {
if (op->interval_ms > 0) {
crm_trace("Removing cancelled recurring op: " PCMK__OP_FMT,
op->rsc_id, op->op_type, op->interval_ms);
history_remove_recurring_op(entry, op);
return;
} else {
crm_trace("Skipping " PCMK__OP_FMT " rc=%d, status=%d",
op->rsc_id, op->op_type, op->interval_ms, op->rc,
op->op_status);
}
} else if (did_rsc_op_fail(op, target_rc)) {
/* Store failed monitors here, otherwise the block below will cause them
* to be forgotten when a stop happens.
*/
if (entry->failed) {
lrmd_free_event(entry->failed);
}
entry->failed = lrmd_copy_event(op);
} else if (op->interval_ms == 0) {
if (entry->last) {
lrmd_free_event(entry->last);
}
entry->last = lrmd_copy_event(op);
if (op->params && pcmk__strcase_any_of(op->op_type, PCMK_ACTION_START,
PCMK_ACTION_RELOAD,
PCMK_ACTION_RELOAD_AGENT,
PCMK_ACTION_MONITOR, NULL)) {
if (entry->stop_params) {
g_hash_table_destroy(entry->stop_params);
}
entry->stop_params = pcmk__strkey_table(free, free);
g_hash_table_foreach(op->params, copy_instance_keys, entry->stop_params);
}
}
if (op->interval_ms > 0) {
/* Ensure there are no duplicates */
history_remove_recurring_op(entry, op);
crm_trace("Adding recurring op: " PCMK__OP_FMT,
op->rsc_id, op->op_type, op->interval_ms);
entry->recurring_op_list = g_list_prepend(entry->recurring_op_list, lrmd_copy_event(op));
} else if ((entry->recurring_op_list != NULL)
&& !pcmk__str_eq(op->op_type, PCMK_ACTION_MONITOR,
pcmk__str_casei)) {
crm_trace("Dropping %d recurring ops because of: " PCMK__OP_FMT,
g_list_length(entry->recurring_op_list), op->rsc_id,
op->op_type, op->interval_ms);
history_free_recurring_ops(entry);
}
}
/*!
* \internal
* \brief Send a direct OK ack for a resource task
*
* \param[in] lrm_state LRM connection
* \param[in] input Input message being ack'ed
* \param[in] rsc_id ID of affected resource
* \param[in] rsc Affected resource (if available)
* \param[in] task Operation task being ack'ed
* \param[in] ack_host Name of host to send ack to
* \param[in] ack_sys IPC system name to ack
*/
static void
send_task_ok_ack(const lrm_state_t *lrm_state, const ha_msg_input_t *input,
const char *rsc_id, const lrmd_rsc_info_t *rsc,
const char *task, const char *ack_host, const char *ack_sys)
{
lrmd_event_data_t *op = construct_op(lrm_state, input->xml, rsc_id, task);
lrmd__set_result(op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
controld_ack_event_directly(ack_host, ack_sys, rsc, op, rsc_id);
lrmd_free_event(op);
}
static inline const char *
op_node_name(lrmd_event_data_t *op)
{
return pcmk__s(op->remote_nodename, controld_globals.our_nodename);
}
void
lrm_op_callback(lrmd_event_data_t * op)
{
CRM_CHECK(op != NULL, return);
switch (op->type) {
case lrmd_event_disconnect:
if (op->remote_nodename == NULL) {
/* If this is the local executor IPC connection, set the right
* bits in the controller when the connection goes down.
*/
lrm_connection_destroy();
}
break;
case lrmd_event_exec_complete:
{
lrm_state_t *lrm_state = lrm_state_find(op_node_name(op));
CRM_ASSERT(lrm_state != NULL);
process_lrm_event(lrm_state, op, NULL, NULL);
}
break;
default:
break;
}
}
static void
try_local_executor_connect(long long action, fsa_data_t *msg_data,
lrm_state_t *lrm_state)
{
int rc = pcmk_rc_ok;
crm_debug("Connecting to the local executor");
// If we can connect, great
rc = controld_connect_local_executor(lrm_state);
if (rc == pcmk_rc_ok) {
controld_set_fsa_input_flags(R_LRM_CONNECTED);
crm_info("Connection to the local executor established");
return;
}
// Otherwise, if we can try again, set a timer to do so
if (lrm_state->num_lrm_register_fails < MAX_LRM_REG_FAILS) {
crm_warn("Failed to connect to the local executor %d time%s "
"(%d max): %s", lrm_state->num_lrm_register_fails,
pcmk__plural_s(lrm_state->num_lrm_register_fails),
MAX_LRM_REG_FAILS, pcmk_rc_str(rc));
controld_start_wait_timer();
crmd_fsa_stall(FALSE);
return;
}
// Otherwise give up
crm_err("Failed to connect to the executor the max allowed "
"%d time%s: %s", lrm_state->num_lrm_register_fails,
pcmk__plural_s(lrm_state->num_lrm_register_fails),
pcmk_rc_str(rc));
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
/* A_LRM_CONNECT */
void
do_lrm_control(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
/* This only pertains to local executor connections. Remote connections are
* handled as resources within the scheduler. Connecting and disconnecting
* from remote executor instances is handled differently.
*/
lrm_state_t *lrm_state = NULL;
if (controld_globals.our_nodename == NULL) {
return; /* Nothing to do */
}
lrm_state = lrm_state_find_or_create(controld_globals.our_nodename);
if (lrm_state == NULL) {
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}
if (action & A_LRM_DISCONNECT) {
if (lrm_state_verify_stopped(lrm_state, cur_state, LOG_INFO) == FALSE) {
if (action == A_LRM_DISCONNECT) {
crmd_fsa_stall(FALSE);
return;
}
}
controld_clear_fsa_input_flags(R_LRM_CONNECTED);
crm_info("Disconnecting from the executor");
lrm_state_disconnect(lrm_state);
lrm_state_reset_tables(lrm_state, FALSE);
crm_notice("Disconnected from the executor");
}
if (action & A_LRM_CONNECT) {
try_local_executor_connect(action, msg_data, lrm_state);
}
if (action & ~(A_LRM_CONNECT | A_LRM_DISCONNECT)) {
crm_err("Unexpected action %s in %s", fsa_action2string(action),
__func__);
}
}
static gboolean
lrm_state_verify_stopped(lrm_state_t * lrm_state, enum crmd_fsa_state cur_state, int log_level)
{
int counter = 0;
gboolean rc = TRUE;
const char *when = "lrm disconnect";
GHashTableIter gIter;
const char *key = NULL;
rsc_history_t *entry = NULL;
active_op_t *pending = NULL;
crm_debug("Checking for active resources before exit");
if (cur_state == S_TERMINATE) {
log_level = LOG_ERR;
when = "shutdown";
} else if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
when = "shutdown... waiting";
}
if ((lrm_state->active_ops != NULL) && lrm_state_is_connected(lrm_state)) {
guint removed = g_hash_table_foreach_remove(lrm_state->active_ops,
stop_recurring_actions,
lrm_state);
guint nremaining = g_hash_table_size(lrm_state->active_ops);
if (removed || nremaining) {
crm_notice("Stopped %u recurring operation%s at %s (%u remaining)",
removed, pcmk__plural_s(removed), when, nremaining);
}
}
if (lrm_state->active_ops != NULL) {
g_hash_table_iter_init(&gIter, lrm_state->active_ops);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&pending)) {
/* Ignore recurring actions in the shutdown calculations */
if (pending->interval_ms == 0) {
counter++;
}
}
}
if (counter > 0) {
do_crm_log(log_level, "%d pending executor operation%s at %s",
counter, pcmk__plural_s(counter), when);
if ((cur_state == S_TERMINATE)
|| !pcmk_is_set(controld_globals.fsa_input_register,
R_SENT_RSC_STOP)) {
g_hash_table_iter_init(&gIter, lrm_state->active_ops);
while (g_hash_table_iter_next(&gIter, (gpointer*)&key, (gpointer*)&pending)) {
do_crm_log(log_level, "Pending action: %s (%s)", key, pending->op_key);
}
} else {
rc = FALSE;
}
return rc;
}
if (lrm_state->resource_history == NULL) {
return rc;
}
if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
/* At this point we're not waiting, we're just shutting down */
when = "shutdown";
}
counter = 0;
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer*)&entry)) {
if (is_rsc_active(lrm_state, entry->id) == FALSE) {
continue;
}
counter++;
if (log_level == LOG_ERR) {
crm_info("Found %s active at %s", entry->id, when);
} else {
crm_trace("Found %s active at %s", entry->id, when);
}
if (lrm_state->active_ops != NULL) {
GHashTableIter hIter;
g_hash_table_iter_init(&hIter, lrm_state->active_ops);
while (g_hash_table_iter_next(&hIter, (gpointer*)&key, (gpointer*)&pending)) {
if (pcmk__str_eq(entry->id, pending->rsc_id, pcmk__str_none)) {
crm_notice("%sction %s (%s) incomplete at %s",
pending->interval_ms == 0 ? "A" : "Recurring a",
key, pending->op_key, when);
}
}
}
}
if (counter) {
crm_err("%d resource%s active at %s",
counter, (counter == 1)? " was" : "s were", when);
}
return rc;
}
static gboolean
is_rsc_active(lrm_state_t * lrm_state, const char *rsc_id)
{
rsc_history_t *entry = NULL;
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
if (entry == NULL || entry->last == NULL) {
return FALSE;
}
crm_trace("Processing %s: %s.%d=%d", rsc_id, entry->last->op_type,
entry->last->interval_ms, entry->last->rc);
if ((entry->last->rc == PCMK_OCF_OK)
&& pcmk__str_eq(entry->last->op_type, PCMK_ACTION_STOP,
pcmk__str_casei)) {
return FALSE;
} else if (entry->last->rc == PCMK_OCF_OK
&& pcmk__str_eq(entry->last->op_type, PCMK_ACTION_MIGRATE_TO,
pcmk__str_casei)) {
// A stricter check is too complex ... leave that to the scheduler
return FALSE;
} else if (entry->last->rc == PCMK_OCF_NOT_RUNNING) {
return FALSE;
} else if ((entry->last->interval_ms == 0)
&& (entry->last->rc == PCMK_OCF_NOT_CONFIGURED)) {
/* Badly configured resources can't be reliably stopped */
return FALSE;
}
return TRUE;
}
static gboolean
build_active_RAs(lrm_state_t * lrm_state, xmlNode * rsc_list)
{
GHashTableIter iter;
rsc_history_t *entry = NULL;
g_hash_table_iter_init(&iter, lrm_state->resource_history);
while (g_hash_table_iter_next(&iter, NULL, (void **)&entry)) {
GList *gIter = NULL;
xmlNode *xml_rsc = create_xml_node(rsc_list, XML_LRM_TAG_RESOURCE);
crm_xml_add(xml_rsc, XML_ATTR_ID, entry->id);
crm_xml_add(xml_rsc, XML_ATTR_TYPE, entry->rsc.type);
crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, entry->rsc.standard);
crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, entry->rsc.provider);
if (entry->last && entry->last->params) {
const char *container = g_hash_table_lookup(entry->last->params, CRM_META"_"XML_RSC_ATTR_CONTAINER);
if (container) {
crm_trace("Resource %s is a part of container resource %s", entry->id, container);
crm_xml_add(xml_rsc, XML_RSC_ATTR_CONTAINER, container);
}
}
controld_add_resource_history_xml(xml_rsc, &(entry->rsc), entry->failed,
lrm_state->node_name);
controld_add_resource_history_xml(xml_rsc, &(entry->rsc), entry->last,
lrm_state->node_name);
for (gIter = entry->recurring_op_list; gIter != NULL; gIter = gIter->next) {
controld_add_resource_history_xml(xml_rsc, &(entry->rsc), gIter->data,
lrm_state->node_name);
}
}
return FALSE;
}
xmlNode *
controld_query_executor_state(void)
{
xmlNode *xml_state = NULL;
xmlNode *xml_data = NULL;
xmlNode *rsc_list = NULL;
crm_node_t *peer = NULL;
lrm_state_t *lrm_state = lrm_state_find(controld_globals.our_nodename);
if (!lrm_state) {
crm_err("Could not find executor state for node %s",
controld_globals.our_nodename);
return NULL;
}
peer = crm_get_peer_full(0, lrm_state->node_name, CRM_GET_PEER_ANY);
CRM_CHECK(peer != NULL, return NULL);
xml_state = create_node_state_update(peer,
node_update_cluster|node_update_peer,
NULL, __func__);
if (xml_state == NULL) {
return NULL;
}
xml_data = create_xml_node(xml_state, XML_CIB_TAG_LRM);
crm_xml_add(xml_data, XML_ATTR_ID, peer->uuid);
rsc_list = create_xml_node(xml_data, XML_LRM_TAG_RESOURCES);
/* Build a list of active (not always running) resources */
build_active_RAs(lrm_state, rsc_list);
crm_log_xml_trace(xml_state, "Current executor state");
return xml_state;
}
/*!
* \internal
* \brief Map standard Pacemaker return code to operation status and OCF code
*
* \param[out] event Executor event whose status and return code should be set
* \param[in] rc Standard Pacemaker return code
*/
void
controld_rc2event(lrmd_event_data_t *event, int rc)
{
/* This is called for cleanup requests from controller peers/clients, not
* for resource actions, so no exit reason is needed.
*/
switch (rc) {
case pcmk_rc_ok:
lrmd__set_result(event, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
break;
case EACCES:
lrmd__set_result(event, PCMK_OCF_INSUFFICIENT_PRIV,
PCMK_EXEC_ERROR, NULL);
break;
default:
lrmd__set_result(event, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_ERROR,
NULL);
break;
}
}
/*!
* \internal
* \brief Trigger a new transition after CIB status was deleted
*
* If a CIB status delete was not expected (as part of the transition graph),
* trigger a new transition by updating the (arbitrary) "last-lrm-refresh"
* cluster property.
*
* \param[in] from_sys IPC name that requested the delete
* \param[in] rsc_id Resource whose status was deleted (for logging only)
*/
void
controld_trigger_delete_refresh(const char *from_sys, const char *rsc_id)
{
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_casei)) {
char *now_s = crm_strdup_printf("%lld", (long long) time(NULL));
crm_debug("Triggering a refresh after %s cleaned %s", from_sys, rsc_id);
cib__update_node_attr(controld_globals.logger_out,
controld_globals.cib_conn, cib_none,
XML_CIB_TAG_CRMCONFIG, NULL, NULL, NULL, NULL,
"last-lrm-refresh", now_s, NULL, NULL);
free(now_s);
}
}
static void
notify_deleted(lrm_state_t * lrm_state, ha_msg_input_t * input, const char *rsc_id, int rc)
{
lrmd_event_data_t *op = NULL;
const char *from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM);
const char *from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
crm_info("Notifying %s on %s that %s was%s deleted",
from_sys, (from_host? from_host : "localhost"), rsc_id,
((rc == pcmk_ok)? "" : " not"));
op = construct_op(lrm_state, input->xml, rsc_id, PCMK_ACTION_DELETE);
controld_rc2event(op, pcmk_legacy2rc(rc));
controld_ack_event_directly(from_host, from_sys, NULL, op, rsc_id);
lrmd_free_event(op);
controld_trigger_delete_refresh(from_sys, rsc_id);
}
static gboolean
lrm_remove_deleted_rsc(gpointer key, gpointer value, gpointer user_data)
{
struct delete_event_s *event = user_data;
struct pending_deletion_op_s *op = value;
if (pcmk__str_eq(event->rsc, op->rsc, pcmk__str_none)) {
notify_deleted(event->lrm_state, op->input, event->rsc, event->rc);
return TRUE;
}
return FALSE;
}
static gboolean
lrm_remove_deleted_op(gpointer key, gpointer value, gpointer user_data)
{
const char *rsc = user_data;
active_op_t *pending = value;
if (pcmk__str_eq(rsc, pending->rsc_id, pcmk__str_none)) {
crm_info("Removing op %s:%d for deleted resource %s",
pending->op_key, pending->call_id, rsc);
return TRUE;
}
return FALSE;
}
static void
delete_rsc_entry(lrm_state_t *lrm_state, ha_msg_input_t *input,
const char *rsc_id, GHashTableIter *rsc_iter, int rc,
const char *user_name, bool from_cib)
{
struct delete_event_s event;
CRM_CHECK(rsc_id != NULL, return);
if (rc == pcmk_ok) {
char *rsc_id_copy = strdup(rsc_id);
if (rsc_iter) {
g_hash_table_iter_remove(rsc_iter);
} else {
g_hash_table_remove(lrm_state->resource_history, rsc_id_copy);
}
if (from_cib) {
controld_delete_resource_history(rsc_id_copy, lrm_state->node_name,
user_name, crmd_cib_smart_opt());
}
g_hash_table_foreach_remove(lrm_state->active_ops,
lrm_remove_deleted_op, rsc_id_copy);
free(rsc_id_copy);
}
if (input) {
notify_deleted(lrm_state, input, rsc_id, rc);
}
event.rc = rc;
event.rsc = rsc_id;
event.lrm_state = lrm_state;
g_hash_table_foreach_remove(lrm_state->deletion_ops, lrm_remove_deleted_rsc, &event);
}
static inline gboolean
last_failed_matches_op(rsc_history_t *entry, const char *op, guint interval_ms)
{
if (entry == NULL) {
return FALSE;
}
if (op == NULL) {
return TRUE;
}
return (pcmk__str_eq(op, entry->failed->op_type, pcmk__str_casei)
&& (interval_ms == entry->failed->interval_ms));
}
/*!
* \internal
* \brief Clear a resource's last failure
*
* Erase a resource's last failure on a particular node from both the
* LRM resource history in the CIB, and the resource history remembered
* for the LRM state.
*
* \param[in] rsc_id Resource name
* \param[in] node_name Node name
* \param[in] operation If specified, only clear if matching this operation
* \param[in] interval_ms If operation is specified, it has this interval
*/
void
lrm_clear_last_failure(const char *rsc_id, const char *node_name,
const char *operation, guint interval_ms)
{
lrm_state_t *lrm_state = lrm_state_find(node_name);
if (lrm_state == NULL) {
return;
}
if (lrm_state->resource_history != NULL) {
rsc_history_t *entry = g_hash_table_lookup(lrm_state->resource_history,
rsc_id);
if (last_failed_matches_op(entry, operation, interval_ms)) {
lrmd_free_event(entry->failed);
entry->failed = NULL;
}
}
}
/* Returns: gboolean - cancellation is in progress */
static gboolean
cancel_op(lrm_state_t * lrm_state, const char *rsc_id, const char *key, int op, gboolean remove)
{
int rc = pcmk_ok;
char *local_key = NULL;
active_op_t *pending = NULL;
CRM_CHECK(op != 0, return FALSE);
CRM_CHECK(rsc_id != NULL, return FALSE);
if (key == NULL) {
local_key = make_stop_id(rsc_id, op);
key = local_key;
}
pending = g_hash_table_lookup(lrm_state->active_ops, key);
if (pending) {
if (remove && !pcmk_is_set(pending->flags, active_op_remove)) {
controld_set_active_op_flags(pending, active_op_remove);
crm_debug("Scheduling %s for removal", key);
}
if (pcmk_is_set(pending->flags, active_op_cancelled)) {
crm_debug("Operation %s already cancelled", key);
free(local_key);
return FALSE;
}
controld_set_active_op_flags(pending, active_op_cancelled);
} else {
crm_info("No pending op found for %s", key);
free(local_key);
return FALSE;
}
crm_debug("Cancelling op %d for %s (%s)", op, rsc_id, key);
rc = lrm_state_cancel(lrm_state, pending->rsc_id, pending->op_type,
pending->interval_ms);
if (rc == pcmk_ok) {
crm_debug("Op %d for %s (%s): cancelled", op, rsc_id, key);
free(local_key);
return TRUE;
}
crm_debug("Op %d for %s (%s): Nothing to cancel", op, rsc_id, key);
/* The caller needs to make sure the entry is
* removed from the active operations list
*
* Usually by returning TRUE inside the worker function
* supplied to g_hash_table_foreach_remove()
*
* Not removing the entry from active operations will block
* the node from shutting down
*/
free(local_key);
return FALSE;
}
struct cancel_data {
gboolean done;
gboolean remove;
const char *key;
lrmd_rsc_info_t *rsc;
lrm_state_t *lrm_state;
};
static gboolean
cancel_action_by_key(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
struct cancel_data *data = user_data;
active_op_t *op = value;
if (pcmk__str_eq(op->op_key, data->key, pcmk__str_none)) {
data->done = TRUE;
remove = !cancel_op(data->lrm_state, data->rsc->id, key, op->call_id, data->remove);
}
return remove;
}
static gboolean
cancel_op_key(lrm_state_t * lrm_state, lrmd_rsc_info_t * rsc, const char *key, gboolean remove)
{
guint removed = 0;
struct cancel_data data;
CRM_CHECK(rsc != NULL, return FALSE);
CRM_CHECK(key != NULL, return FALSE);
data.key = key;
data.rsc = rsc;
data.done = FALSE;
data.remove = remove;
data.lrm_state = lrm_state;
removed = g_hash_table_foreach_remove(lrm_state->active_ops,
cancel_action_by_key, &data);
crm_trace("Removed %u op cache entries, new size: %u",
removed, g_hash_table_size(lrm_state->active_ops));
return data.done;
}
/*!
* \internal
* \brief Retrieve resource information from LRM
*
* \param[in,out] lrm_state Executor connection state to use
* \param[in] rsc_xml XML containing resource configuration
* \param[in] do_create If true, register resource if not already
* \param[out] rsc_info Where to store information obtained from executor
*
* \retval pcmk_ok Success (and rsc_info holds newly allocated result)
* \retval -EINVAL Required information is missing from arguments
* \retval -ENOTCONN No active connection to LRM
* \retval -ENODEV Resource not found
* \retval -errno Error communicating with executor when registering resource
*
* \note Caller is responsible for freeing result on success.
*/
static int
get_lrm_resource(lrm_state_t *lrm_state, const xmlNode *rsc_xml,
gboolean do_create, lrmd_rsc_info_t **rsc_info)
{
const char *id = ID(rsc_xml);
CRM_CHECK(lrm_state && rsc_xml && rsc_info, return -EINVAL);
CRM_CHECK(id, return -EINVAL);
if (lrm_state_is_connected(lrm_state) == FALSE) {
return -ENOTCONN;
}
crm_trace("Retrieving resource information for %s from the executor", id);
*rsc_info = lrm_state_get_rsc_info(lrm_state, id, 0);
// If resource isn't known by ID, try clone name, if provided
if (!*rsc_info) {
const char *long_id = crm_element_value(rsc_xml, XML_ATTR_ID_LONG);
if (long_id) {
*rsc_info = lrm_state_get_rsc_info(lrm_state, long_id, 0);
}
}
if ((*rsc_info == NULL) && do_create) {
const char *class = crm_element_value(rsc_xml, XML_AGENT_ATTR_CLASS);
const char *provider = crm_element_value(rsc_xml, XML_AGENT_ATTR_PROVIDER);
const char *type = crm_element_value(rsc_xml, XML_ATTR_TYPE);
int rc;
crm_trace("Registering resource %s with the executor", id);
rc = lrm_state_register_rsc(lrm_state, id, class, provider, type,
lrmd_opt_drop_recurring);
if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL;
crm_err("Could not register resource %s with the executor on %s: %s "
CRM_XS " rc=%d",
id, lrm_state->node_name, pcmk_strerror(rc), rc);
/* Register this as an internal error if this involves the local
* executor. Otherwise, we're likely dealing with an unresponsive
* remote node, which is not an FSA failure.
*/
if (lrm_state_is_local(lrm_state) == TRUE) {
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
}
return rc;
}
*rsc_info = lrm_state_get_rsc_info(lrm_state, id, 0);
}
return *rsc_info? pcmk_ok : -ENODEV;
}
static void
delete_resource(lrm_state_t *lrm_state, const char *id, lrmd_rsc_info_t *rsc,
GHashTableIter *iter, const char *sys, const char *user,
ha_msg_input_t *request, bool unregister, bool from_cib)
{
int rc = pcmk_ok;
crm_info("Removing resource %s from executor for %s%s%s",
id, sys, (user? " as " : ""), (user? user : ""));
if (rsc && unregister) {
rc = lrm_state_unregister_rsc(lrm_state, id, 0);
}
if (rc == pcmk_ok) {
crm_trace("Resource %s deleted from executor", id);
} else if (rc == -EINPROGRESS) {
crm_info("Deletion of resource '%s' from executor is pending", id);
if (request) {
struct pending_deletion_op_s *op = NULL;
char *ref = crm_element_value_copy(request->msg, XML_ATTR_REFERENCE);
op = calloc(1, sizeof(struct pending_deletion_op_s));
op->rsc = strdup(rsc->id);
op->input = copy_ha_msg_input(request);
g_hash_table_insert(lrm_state->deletion_ops, ref, op);
}
return;
} else {
crm_warn("Could not delete '%s' from executor for %s%s%s: %s "
CRM_XS " rc=%d", id, sys, (user? " as " : ""),
(user? user : ""), pcmk_strerror(rc), rc);
}
delete_rsc_entry(lrm_state, request, id, iter, rc, user, from_cib);
}
static int
get_fake_call_id(lrm_state_t *lrm_state, const char *rsc_id)
{
int call_id = 999999999;
rsc_history_t *entry = NULL;
if(lrm_state) {
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
}
/* Make sure the call id is greater than the last successful operation,
* otherwise the failure will not result in a possible recovery of the resource
* as it could appear the failure occurred before the successful start */
if (entry) {
call_id = entry->last_callid + 1;
}
if (call_id < 0) {
call_id = 1;
}
return call_id;
}
static void
fake_op_status(lrm_state_t *lrm_state, lrmd_event_data_t *op, int op_status,
enum ocf_exitcode op_exitcode, const char *exit_reason)
{
op->call_id = get_fake_call_id(lrm_state, op->rsc_id);
op->t_run = time(NULL);
op->t_rcchange = op->t_run;
lrmd__set_result(op, op_exitcode, op_status, exit_reason);
}
static void
force_reprobe(lrm_state_t *lrm_state, const char *from_sys,
const char *from_host, const char *user_name,
gboolean is_remote_node, bool reprobe_all_nodes)
{
GHashTableIter gIter;
rsc_history_t *entry = NULL;
crm_info("Clearing resource history on node %s", lrm_state->node_name);
g_hash_table_iter_init(&gIter, lrm_state->resource_history);
while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
/* only unregister the resource during a reprobe if it is not a remote connection
* resource. otherwise unregistering the connection will terminate remote-node
* membership */
bool unregister = true;
if (is_remote_lrmd_ra(NULL, NULL, entry->id)) {
unregister = false;
if (reprobe_all_nodes) {
lrm_state_t *remote_lrm_state = lrm_state_find(entry->id);
if (remote_lrm_state != NULL) {
/* If reprobing all nodes, be sure to reprobe the remote
* node before clearing its connection resource
*/
force_reprobe(remote_lrm_state, from_sys, from_host,
user_name, TRUE, reprobe_all_nodes);
}
}
}
/* Don't delete from the CIB, since we'll delete the whole node's LRM
* state from the CIB soon
*/
delete_resource(lrm_state, entry->id, &entry->rsc, &gIter, from_sys,
user_name, NULL, unregister, false);
}
/* Now delete the copy in the CIB */
controld_delete_node_state(lrm_state->node_name, controld_section_lrm,
cib_scope_local);
// @COMPAT DCs < 1.1.14 need this deleted (in case it was explicitly false)
update_attrd(lrm_state->node_name, CRM_OP_PROBED, NULL, user_name, is_remote_node);
}
/*!
* \internal
* \brief Fail a requested action without actually executing it
*
* For an action that can't be executed, process it similarly to an actual
* execution result, with specified error status (except for notify actions,
* which will always be treated as successful).
*
* \param[in,out] lrm_state Executor connection that action is for
* \param[in] action Action XML from request
* \param[in] rc Desired return code to use
* \param[in] op_status Desired operation status to use
* \param[in] exit_reason Human-friendly detail, if error
*/
static void
synthesize_lrmd_failure(lrm_state_t *lrm_state, const xmlNode *action,
int op_status, enum ocf_exitcode rc,
const char *exit_reason)
{
lrmd_event_data_t *op = NULL;
const char *operation = crm_element_value(action, XML_LRM_ATTR_TASK);
const char *target_node = crm_element_value(action, XML_LRM_ATTR_TARGET);
xmlNode *xml_rsc = find_xml_node(action, XML_CIB_TAG_RESOURCE, TRUE);
if ((xml_rsc == NULL) || (ID(xml_rsc) == NULL)) {
/* @TODO Should we do something else, like direct ack? */
crm_info("Can't fake %s failure (%d) on %s without resource configuration",
crm_element_value(action, XML_LRM_ATTR_TASK_KEY), rc,
target_node);
return;
} else if(operation == NULL) {
/* This probably came from crm_resource -C, nothing to do */
crm_info("Can't fake %s failure (%d) on %s without operation",
ID(xml_rsc), rc, target_node);
return;
}
op = construct_op(lrm_state, action, ID(xml_rsc), operation);
if (pcmk__str_eq(operation, PCMK_ACTION_NOTIFY, pcmk__str_casei)) {
// Notifications can't fail
fake_op_status(lrm_state, op, PCMK_EXEC_DONE, PCMK_OCF_OK, NULL);
} else {
fake_op_status(lrm_state, op, op_status, rc, exit_reason);
}
crm_info("Faking " PCMK__OP_FMT " result (%d) on %s",
op->rsc_id, op->op_type, op->interval_ms, op->rc, target_node);
// Process the result as if it came from the LRM
process_lrm_event(lrm_state, op, NULL, action);
lrmd_free_event(op);
}
/*!
* \internal
* \brief Get target of an LRM operation (replacing \p NULL with local node
* name)
*
* \param[in] xml LRM operation data XML
*
* \return LRM operation target node name (local node or Pacemaker Remote node)
*/
static const char *
lrm_op_target(const xmlNode *xml)
{
const char *target = NULL;
if (xml) {
target = crm_element_value(xml, XML_LRM_ATTR_TARGET);
}
if (target == NULL) {
target = controld_globals.our_nodename;
}
return target;
}
static void
fail_lrm_resource(xmlNode *xml, lrm_state_t *lrm_state, const char *user_name,
const char *from_host, const char *from_sys)
{
lrmd_event_data_t *op = NULL;
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(xml, XML_CIB_TAG_RESOURCE, TRUE);
CRM_CHECK(xml_rsc != NULL, return);
/* The executor simply executes operations and reports the results, without
* any concept of success or failure, so to fail a resource, we must fake
* what a failure looks like.
*
* To do this, we create a fake executor operation event for the resource,
* and pass that event to the executor client callback so it will be
* processed as if it came from the executor.
*/
op = construct_op(lrm_state, xml, ID(xml_rsc), "asyncmon");
free((char*) op->user_data);
op->user_data = NULL;
op->interval_ms = 0;
if (user_name && !pcmk__is_privileged(user_name)) {
crm_err("%s does not have permission to fail %s", user_name, ID(xml_rsc));
fake_op_status(lrm_state, op, PCMK_EXEC_ERROR,
PCMK_OCF_INSUFFICIENT_PRIV,
"Unprivileged user cannot fail resources");
controld_ack_event_directly(from_host, from_sys, NULL, op, ID(xml_rsc));
lrmd_free_event(op);
return;
}
if (get_lrm_resource(lrm_state, xml_rsc, TRUE, &rsc) == pcmk_ok) {
crm_info("Failing resource %s...", rsc->id);
fake_op_status(lrm_state, op, PCMK_EXEC_DONE, PCMK_OCF_UNKNOWN_ERROR,
"Simulated failure");
process_lrm_event(lrm_state, op, NULL, xml);
op->rc = PCMK_OCF_OK; // The request to fail the resource succeeded
lrmd_free_rsc_info(rsc);
} else {
crm_info("Cannot find/create resource in order to fail it...");
crm_log_xml_warn(xml, "bad input");
fake_op_status(lrm_state, op, PCMK_EXEC_ERROR, PCMK_OCF_UNKNOWN_ERROR,
"Cannot fail unknown resource");
}
controld_ack_event_directly(from_host, from_sys, NULL, op, ID(xml_rsc));
lrmd_free_event(op);
}
static void
handle_reprobe_op(lrm_state_t *lrm_state, const char *from_sys,
const char *from_host, const char *user_name,
gboolean is_remote_node, bool reprobe_all_nodes)
{
crm_notice("Forcing the status of all resources to be redetected");
force_reprobe(lrm_state, from_sys, from_host, user_name, is_remote_node,
reprobe_all_nodes);
if (!pcmk__strcase_any_of(from_sys, CRM_SYSTEM_PENGINE, CRM_SYSTEM_TENGINE, NULL)) {
xmlNode *reply = create_request(CRM_OP_INVOKE_LRM, NULL, from_host,
from_sys, CRM_SYSTEM_LRMD,
controld_globals.our_uuid);
crm_debug("ACK'ing re-probe from %s (%s)", from_sys, from_host);
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(reply);
}
}
static bool do_lrm_cancel(ha_msg_input_t *input, lrm_state_t *lrm_state,
lrmd_rsc_info_t *rsc, const char *from_host, const char *from_sys)
{
char *op_key = NULL;
char *meta_key = NULL;
int call = 0;
const char *call_id = NULL;
const char *op_task = NULL;
guint interval_ms = 0;
gboolean in_progress = FALSE;
xmlNode *params = find_xml_node(input->xml, XML_TAG_ATTRS, TRUE);
CRM_CHECK(params != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_TASK);
op_task = crm_element_value(params, meta_key);
free(meta_key);
CRM_CHECK(op_task != NULL, return FALSE);
meta_key = crm_meta_name(XML_LRM_ATTR_INTERVAL_MS);
if (crm_element_value_ms(params, meta_key, &interval_ms) != pcmk_ok) {
free(meta_key);
return FALSE;
}
free(meta_key);
op_key = pcmk__op_key(rsc->id, op_task, interval_ms);
meta_key = crm_meta_name(XML_LRM_ATTR_CALLID);
call_id = crm_element_value(params, meta_key);
free(meta_key);
crm_debug("Scheduler requested op %s (call=%s) be cancelled",
op_key, (call_id? call_id : "NA"));
pcmk__scan_min_int(call_id, &call, 0);
if (call == 0) {
// Normal case when the scheduler cancels a recurring op
in_progress = cancel_op_key(lrm_state, rsc, op_key, TRUE);
} else {
// Normal case when the scheduler cancels an orphan op
in_progress = cancel_op(lrm_state, rsc->id, NULL, call, TRUE);
}
// Acknowledge cancellation operation if for a remote connection resource
if (!in_progress || is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
char *op_id = make_stop_id(rsc->id, call);
if (is_remote_lrmd_ra(NULL, NULL, rsc->id) == FALSE) {
crm_info("Nothing known about operation %d for %s", call, op_key);
}
controld_delete_action_history_by_key(rsc->id, lrm_state->node_name,
op_key, call);
send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
from_host, from_sys);
/* needed at least for cancellation of a remote operation */
if (lrm_state->active_ops != NULL) {
g_hash_table_remove(lrm_state->active_ops, op_id);
}
free(op_id);
} else {
/* No ack is needed since abcdaa8, but peers with older versions
* in a rolling upgrade need one. We didn't bump the feature set
* at that commit, so we can only compare against the previous
* CRM version (3.0.8). If any peers have feature set 3.0.9 but
* not abcdaa8, they will time out waiting for the ack (no
* released versions of Pacemaker are affected).
*/
const char *peer_version = crm_element_value(params, XML_ATTR_CRM_VERSION);
if (compare_version(peer_version, "3.0.8") <= 0) {
crm_info("Sending compatibility ack for %s cancellation to %s (CRM version %s)",
op_key, from_host, peer_version);
send_task_ok_ack(lrm_state, input, rsc->id, rsc, op_task,
from_host, from_sys);
}
}
free(op_key);
return TRUE;
}
static void
do_lrm_delete(ha_msg_input_t *input, lrm_state_t *lrm_state,
lrmd_rsc_info_t *rsc, const char *from_sys, const char *from_host,
bool crm_rsc_delete, const char *user_name)
{
bool unregister = true;
int cib_rc = controld_delete_resource_history(rsc->id, lrm_state->node_name,
user_name,
cib_dryrun|cib_sync_call);
if (cib_rc != pcmk_rc_ok) {
lrmd_event_data_t *op = NULL;
op = construct_op(lrm_state, input->xml, rsc->id, PCMK_ACTION_DELETE);
/* These are resource clean-ups, not actions, so no exit reason is
* needed.
*/
lrmd__set_result(op, pcmk_rc2ocf(cib_rc), PCMK_EXEC_ERROR, NULL);
controld_ack_event_directly(from_host, from_sys, NULL, op, rsc->id);
lrmd_free_event(op);
return;
}
if (crm_rsc_delete && is_remote_lrmd_ra(NULL, NULL, rsc->id)) {
unregister = false;
}
delete_resource(lrm_state, rsc->id, rsc, NULL, from_sys,
user_name, input, unregister, true);
}
// User data for asynchronous metadata execution
struct metadata_cb_data {
lrmd_rsc_info_t *rsc; // Copy of resource information
xmlNode *input_xml; // Copy of FSA input XML
};
static struct metadata_cb_data *
new_metadata_cb_data(lrmd_rsc_info_t *rsc, xmlNode *input_xml)
{
struct metadata_cb_data *data = NULL;
data = calloc(1, sizeof(struct metadata_cb_data));
CRM_ASSERT(data != NULL);
data->input_xml = copy_xml(input_xml);
data->rsc = lrmd_copy_rsc_info(rsc);
return data;
}
static void
free_metadata_cb_data(struct metadata_cb_data *data)
{
lrmd_free_rsc_info(data->rsc);
free_xml(data->input_xml);
free(data);
}
/*!
* \internal
* \brief Execute an action after metadata has been retrieved
*
* \param[in] pid Ignored
* \param[in] result Result of metadata action
* \param[in] user_data Metadata callback data
*/
static void
metadata_complete(int pid, const pcmk__action_result_t *result, void *user_data)
{
struct metadata_cb_data *data = (struct metadata_cb_data *) user_data;
struct ra_metadata_s *md = NULL;
lrm_state_t *lrm_state = lrm_state_find(lrm_op_target(data->input_xml));
if ((lrm_state != NULL) && pcmk__result_ok(result)) {
md = controld_cache_metadata(lrm_state->metadata_cache, data->rsc,
result->action_stdout);
}
- do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ if (!pcmk_is_set(controld_globals.fsa_input_register, R_HA_DISCONNECTED)) {
+ do_lrm_rsc_op(lrm_state, data->rsc, data->input_xml, md);
+ }
free_metadata_cb_data(data);
}
/* A_LRM_INVOKE */
void
do_lrm_invoke(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state,
enum crmd_fsa_input current_input, fsa_data_t * msg_data)
{
lrm_state_t *lrm_state = NULL;
const char *crm_op = NULL;
const char *from_sys = NULL;
const char *from_host = NULL;
const char *operation = NULL;
ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
const char *user_name = NULL;
const char *target_node = lrm_op_target(input->xml);
gboolean is_remote_node = FALSE;
bool crm_rsc_delete = FALSE;
// Message routed to the local node is targeting a specific, non-local node
is_remote_node = !pcmk__str_eq(target_node, controld_globals.our_nodename,
pcmk__str_casei);
lrm_state = lrm_state_find(target_node);
if ((lrm_state == NULL) && is_remote_node) {
crm_err("Failing action because local node has never had connection to remote node %s",
target_node);
synthesize_lrmd_failure(NULL, input->xml, PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR,
"Local node has no connection to remote");
return;
}
CRM_ASSERT(lrm_state != NULL);
user_name = pcmk__update_acl_user(input->msg, F_CRM_USER, NULL);
crm_op = crm_element_value(input->msg, F_CRM_TASK);
from_sys = crm_element_value(input->msg, F_CRM_SYS_FROM);
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) {
from_host = crm_element_value(input->msg, F_CRM_HOST_FROM);
}
if (pcmk__str_eq(crm_op, PCMK_ACTION_LRM_DELETE, pcmk__str_none)) {
if (!pcmk__str_eq(from_sys, CRM_SYSTEM_TENGINE, pcmk__str_none)) {
crm_rsc_delete = TRUE; // from crm_resource
}
operation = PCMK_ACTION_DELETE;
} else if (input->xml != NULL) {
operation = crm_element_value(input->xml, XML_LRM_ATTR_TASK);
}
CRM_CHECK(!pcmk__str_empty(crm_op) || !pcmk__str_empty(operation), return);
crm_trace("'%s' execution request from %s as %s user",
pcmk__s(crm_op, operation),
pcmk__s(from_sys, "unknown subsystem"),
pcmk__s(user_name, "current"));
if (pcmk__str_eq(crm_op, CRM_OP_LRM_FAIL, pcmk__str_none)) {
fail_lrm_resource(input->xml, lrm_state, user_name, from_host,
from_sys);
} else if (pcmk__str_eq(crm_op, CRM_OP_LRM_REFRESH, pcmk__str_none)) {
/* @COMPAT This can only be sent by crm_resource --refresh on a
* Pacemaker Remote node running Pacemaker 1.1.9, which is extremely
* unlikely. It previously would cause the controller to re-write its
* resource history to the CIB. Just ignore it.
*/
crm_notice("Ignoring refresh request from Pacemaker Remote 1.1.9 node");
// @COMPAT DCs <1.1.14 in a rolling upgrade might schedule this op
} else if (pcmk__str_eq(operation, CRM_OP_PROBED, pcmk__str_none)) {
update_attrd(lrm_state->node_name, CRM_OP_PROBED, XML_BOOLEAN_TRUE,
user_name, is_remote_node);
} else if (pcmk__str_eq(crm_op, CRM_OP_REPROBE, pcmk__str_none)
|| pcmk__str_eq(operation, CRM_OP_REPROBE, pcmk__str_none)) {
const char *raw_target = NULL;
if (input->xml != NULL) {
// For CRM_OP_REPROBE, a NULL target means we're targeting all nodes
raw_target = crm_element_value(input->xml, XML_LRM_ATTR_TARGET);
}
handle_reprobe_op(lrm_state, from_sys, from_host, user_name,
is_remote_node, (raw_target == NULL));
} else if (operation != NULL) {
lrmd_rsc_info_t *rsc = NULL;
xmlNode *xml_rsc = find_xml_node(input->xml, XML_CIB_TAG_RESOURCE, TRUE);
gboolean create_rsc = !pcmk__str_eq(operation, PCMK_ACTION_DELETE,
pcmk__str_none);
int rc;
// We can't return anything meaningful without a resource ID
CRM_CHECK(xml_rsc && ID(xml_rsc), return);
rc = get_lrm_resource(lrm_state, xml_rsc, create_rsc, &rsc);
if (rc == -ENOTCONN) {
synthesize_lrmd_failure(lrm_state, input->xml,
PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR,
"Not connected to remote executor");
return;
} else if ((rc < 0) && !create_rsc) {
/* Delete of malformed or nonexistent resource
* (deleting something that does not exist is a success)
*/
crm_notice("Not registering resource '%s' for a %s event "
CRM_XS " get-rc=%d (%s) transition-key=%s",
ID(xml_rsc), operation,
rc, pcmk_strerror(rc), ID(input->xml));
delete_rsc_entry(lrm_state, input, ID(xml_rsc), NULL, pcmk_ok,
user_name, true);
return;
} else if (rc == -EINVAL) {
// Resource operation on malformed resource
crm_err("Invalid resource definition for %s", ID(xml_rsc));
crm_log_xml_warn(input->msg, "invalid resource");
synthesize_lrmd_failure(lrm_state, input->xml, PCMK_EXEC_ERROR,
PCMK_OCF_NOT_CONFIGURED, // fatal error
"Invalid resource definition");
return;
} else if (rc < 0) {
// Error communicating with the executor
crm_err("Could not register resource '%s' with executor: %s "
CRM_XS " rc=%d",
ID(xml_rsc), pcmk_strerror(rc), rc);
crm_log_xml_warn(input->msg, "failed registration");
synthesize_lrmd_failure(lrm_state, input->xml, PCMK_EXEC_ERROR,
PCMK_OCF_INVALID_PARAM, // hard error
"Could not register resource with executor");
return;
}
if (pcmk__str_eq(operation, PCMK_ACTION_CANCEL, pcmk__str_none)) {
if (!do_lrm_cancel(input, lrm_state, rsc, from_host, from_sys)) {
crm_log_xml_warn(input->xml, "Bad command");
}
} else if (pcmk__str_eq(operation, PCMK_ACTION_DELETE,
pcmk__str_none)) {
do_lrm_delete(input, lrm_state, rsc, from_sys, from_host,
crm_rsc_delete, user_name);
} else {
struct ra_metadata_s *md = NULL;
/* Getting metadata from cache is OK except for start actions --
* always refresh from the agent for those, in case the resource
* agent was updated.
*
* @TODO Only refresh metadata for starts if the agent actually
* changed (using something like inotify, or a hash or modification
* time of the agent executable).
*/
if (strcmp(operation, PCMK_ACTION_START) != 0) {
md = controld_get_rsc_metadata(lrm_state, rsc,
controld_metadata_from_cache);
}
if ((md == NULL) && crm_op_needs_metadata(rsc->standard,
operation)) {
/* Most likely, we'll need the agent metadata to record the
* pending operation and the operation result. Get it now rather
* than wait until then, so the metadata action doesn't eat into
* the real action's timeout.
*
* @TODO Metadata is retrieved via direct execution of the
* agent, which has a couple of related issues: the executor
* should execute agents, not the controller; and metadata for
* Pacemaker Remote nodes should be collected on those nodes,
* not locally.
*/
struct metadata_cb_data *data = NULL;
data = new_metadata_cb_data(rsc, input->xml);
crm_info("Retrieving metadata for %s (%s%s%s:%s) asynchronously",
rsc->id, rsc->standard,
((rsc->provider == NULL)? "" : ":"),
((rsc->provider == NULL)? "" : rsc->provider),
rsc->type);
(void) lrmd__metadata_async(rsc, metadata_complete,
(void *) data);
} else {
do_lrm_rsc_op(lrm_state, rsc, input->xml, md);
}
}
lrmd_free_rsc_info(rsc);
} else {
crm_err("Invalid execution request: unknown command '%s' (bug?)",
crm_op);
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}
}
static lrmd_event_data_t *
construct_op(const lrm_state_t *lrm_state, const xmlNode *rsc_op,
const char *rsc_id, const char *operation)
{
lrmd_event_data_t *op = NULL;
const char *op_delay = NULL;
const char *op_timeout = NULL;
GHashTable *params = NULL;
xmlNode *primitive = NULL;
const char *class = NULL;
const char *transition = NULL;
CRM_ASSERT(rsc_id && operation);
op = lrmd_new_event(rsc_id, operation, 0);
op->type = lrmd_event_exec_complete;
op->timeout = 0;
op->start_delay = 0;
lrmd__set_result(op, PCMK_OCF_UNKNOWN, PCMK_EXEC_PENDING, NULL);
if (rsc_op == NULL) {
CRM_LOG_ASSERT(pcmk__str_eq(operation, PCMK_ACTION_STOP,
pcmk__str_casei));
op->user_data = NULL;
/* the stop_all_resources() case
* by definition there is no DC (or they'd be shutting
* us down).
* So we should put our version here.
*/
op->params = pcmk__strkey_table(free, free);
g_hash_table_insert(op->params, strdup(XML_ATTR_CRM_VERSION), strdup(CRM_FEATURE_SET));
crm_trace("Constructed %s op for %s", operation, rsc_id);
return op;
}
params = xml2list(rsc_op);
g_hash_table_remove(params, CRM_META "_op_target_rc");
op_delay = crm_meta_value(params, XML_OP_ATTR_START_DELAY);
pcmk__scan_min_int(op_delay, &op->start_delay, 0);
op_timeout = crm_meta_value(params, XML_ATTR_TIMEOUT);
pcmk__scan_min_int(op_timeout, &op->timeout, 0);
if (pcmk__guint_from_hash(params, CRM_META "_" XML_LRM_ATTR_INTERVAL_MS, 0,
&(op->interval_ms)) != pcmk_rc_ok) {
op->interval_ms = 0;
}
/* Use pcmk_monitor_timeout instead of meta timeout for stonith
recurring monitor, if set */
primitive = find_xml_node(rsc_op, XML_CIB_TAG_RESOURCE, FALSE);
class = crm_element_value(primitive, XML_AGENT_ATTR_CLASS);
if (pcmk_is_set(pcmk_get_ra_caps(class), pcmk_ra_cap_fence_params)
&& pcmk__str_eq(operation, PCMK_ACTION_MONITOR, pcmk__str_casei)
&& (op->interval_ms > 0)) {
op_timeout = g_hash_table_lookup(params, "pcmk_monitor_timeout");
if (op_timeout != NULL) {
op->timeout = crm_get_msec(op_timeout);
}
}
if (!pcmk__str_eq(operation, PCMK_ACTION_STOP, pcmk__str_casei)) {
op->params = params;
} else {
rsc_history_t *entry = NULL;
if (lrm_state) {
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
}
/* If we do not have stop parameters cached, use
* whatever we are given */
if (!entry || !entry->stop_params) {
op->params = params;
} else {
/* Copy the cached parameter list so that we stop the resource
* with the old attributes, not the new ones */
op->params = pcmk__strkey_table(free, free);
g_hash_table_foreach(params, copy_meta_keys, op->params);
g_hash_table_foreach(entry->stop_params, copy_instance_keys, op->params);
g_hash_table_destroy(params);
params = NULL;
}
}
/* sanity */
if (op->timeout <= 0) {
op->timeout = op->interval_ms;
}
if (op->start_delay < 0) {
op->start_delay = 0;
}
transition = crm_element_value(rsc_op, XML_ATTR_TRANSITION_KEY);
CRM_CHECK(transition != NULL, return op);
op->user_data = strdup(transition);
if (op->interval_ms != 0) {
if (pcmk__strcase_any_of(operation, PCMK_ACTION_START, PCMK_ACTION_STOP,
NULL)) {
crm_err("Start and Stop actions cannot have an interval: %u",
op->interval_ms);
op->interval_ms = 0;
}
}
crm_trace("Constructed %s op for %s: interval=%u",
operation, rsc_id, op->interval_ms);
return op;
}
/*!
* \internal
* \brief Send a (synthesized) event result
*
* Reply with a synthesized event result directly, as opposed to going through
* the executor.
*
* \param[in] to_host Host to send result to
* \param[in] to_sys IPC name to send result (NULL for transition engine)
* \param[in] rsc Type information about resource the result is for
* \param[in,out] op Event with result to send
* \param[in] rsc_id ID of resource the result is for
*/
void
controld_ack_event_directly(const char *to_host, const char *to_sys,
const lrmd_rsc_info_t *rsc, lrmd_event_data_t *op,
const char *rsc_id)
{
xmlNode *reply = NULL;
xmlNode *update, *iter;
crm_node_t *peer = NULL;
CRM_CHECK(op != NULL, return);
if (op->rsc_id == NULL) {
CRM_ASSERT(rsc_id != NULL);
op->rsc_id = strdup(rsc_id);
}
if (to_sys == NULL) {
to_sys = CRM_SYSTEM_TENGINE;
}
peer = crm_get_peer(0, controld_globals.our_nodename);
update = create_node_state_update(peer, node_update_none, NULL,
__func__);
iter = create_xml_node(update, XML_CIB_TAG_LRM);
crm_xml_add(iter, XML_ATTR_ID, controld_globals.our_uuid);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCES);
iter = create_xml_node(iter, XML_LRM_TAG_RESOURCE);
crm_xml_add(iter, XML_ATTR_ID, op->rsc_id);
controld_add_resource_history_xml(iter, rsc, op,
controld_globals.our_nodename);
reply = create_request(CRM_OP_INVOKE_LRM, update, to_host, to_sys, CRM_SYSTEM_LRMD, NULL);
crm_log_xml_trace(update, "[direct ACK]");
crm_debug("ACK'ing resource op " PCMK__OP_FMT " from %s: %s",
op->rsc_id, op->op_type, op->interval_ms, op->user_data,
crm_element_value(reply, XML_ATTR_REFERENCE));
if (relay_message(reply, TRUE) == FALSE) {
crm_log_xml_err(reply, "Unable to route reply");
}
free_xml(update);
free_xml(reply);
}
gboolean
verify_stopped(enum crmd_fsa_state cur_state, int log_level)
{
gboolean res = TRUE;
GList *lrm_state_list = lrm_state_get_list();
GList *state_entry;
for (state_entry = lrm_state_list; state_entry != NULL; state_entry = state_entry->next) {
lrm_state_t *lrm_state = state_entry->data;
if (!lrm_state_verify_stopped(lrm_state, cur_state, log_level)) {
/* keep iterating through all even when false is returned */
res = FALSE;
}
}
controld_set_fsa_input_flags(R_SENT_RSC_STOP);
g_list_free(lrm_state_list); lrm_state_list = NULL;
return res;
}
struct stop_recurring_action_s {
lrmd_rsc_info_t *rsc;
lrm_state_t *lrm_state;
};
static gboolean
stop_recurring_action_by_rsc(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
struct stop_recurring_action_s *event = user_data;
active_op_t *op = value;
if ((op->interval_ms != 0)
&& pcmk__str_eq(op->rsc_id, event->rsc->id, pcmk__str_none)) {
crm_debug("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id, (char*)key);
remove = !cancel_op(event->lrm_state, event->rsc->id, key, op->call_id, FALSE);
}
return remove;
}
static gboolean
stop_recurring_actions(gpointer key, gpointer value, gpointer user_data)
{
gboolean remove = FALSE;
lrm_state_t *lrm_state = user_data;
active_op_t *op = value;
if (op->interval_ms != 0) {
crm_info("Cancelling op %d for %s (%s)", op->call_id, op->rsc_id,
(const char *) key);
remove = !cancel_op(lrm_state, op->rsc_id, key, op->call_id, FALSE);
}
return remove;
}
/*!
* \internal
* \brief Check whether recurring actions should be cancelled before an action
*
* \param[in] rsc_id Resource that action is for
* \param[in] action Action being performed
* \param[in] interval_ms Operation interval of \p action (in milliseconds)
*
* \return true if recurring actions should be cancelled, otherwise false
*/
static bool
should_cancel_recurring(const char *rsc_id, const char *action, guint interval_ms)
{
if (is_remote_lrmd_ra(NULL, NULL, rsc_id) && (interval_ms == 0)
&& (strcmp(action, PCMK_ACTION_MIGRATE_TO) == 0)) {
/* Don't stop monitoring a migrating Pacemaker Remote connection
* resource until the entire migration has completed. We must detect if
* the connection is unexpectedly severed, even during a migration.
*/
return false;
}
// Cancel recurring actions before changing resource state
return (interval_ms == 0)
&& !pcmk__str_any_of(action, PCMK_ACTION_MONITOR,
PCMK_ACTION_NOTIFY, NULL);
}
/*!
* \internal
* \brief Check whether an action should not be performed at this time
*
* \param[in] operation Action to be performed
*
* \return Readable description of why action should not be performed,
* or NULL if it should be performed
*/
static const char *
should_nack_action(const char *action)
{
if (pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)
&& pcmk__str_eq(action, PCMK_ACTION_START, pcmk__str_none)) {
register_fsa_input(C_SHUTDOWN, I_SHUTDOWN, NULL);
return "Not attempting start due to shutdown in progress";
}
switch (controld_globals.fsa_state) {
case S_NOT_DC:
case S_POLICY_ENGINE: // Recalculating
case S_TRANSITION_ENGINE:
break;
default:
if (!pcmk__str_eq(action, PCMK_ACTION_STOP, pcmk__str_none)) {
return "Controller cannot attempt actions at this time";
}
break;
}
return NULL;
}
static void
do_lrm_rsc_op(lrm_state_t *lrm_state, lrmd_rsc_info_t *rsc, xmlNode *msg,
struct ra_metadata_s *md)
{
int rc;
int call_id = 0;
char *op_id = NULL;
lrmd_event_data_t *op = NULL;
fsa_data_t *msg_data = NULL;
const char *transition = NULL;
const char *operation = NULL;
const char *nack_reason = NULL;
CRM_CHECK((rsc != NULL) && (msg != NULL), return);
operation = crm_element_value(msg, XML_LRM_ATTR_TASK);
CRM_CHECK(!pcmk__str_empty(operation), return);
transition = crm_element_value(msg, XML_ATTR_TRANSITION_KEY);
if (pcmk__str_empty(transition)) {
crm_log_xml_err(msg, "Missing transition number");
}
if (lrm_state == NULL) {
// This shouldn't be possible, but provide a failsafe just in case
crm_err("Cannot execute %s of %s: No executor connection "
CRM_XS " transition_key=%s",
operation, rsc->id, pcmk__s(transition, ""));
synthesize_lrmd_failure(NULL, msg, PCMK_EXEC_INVALID,
PCMK_OCF_UNKNOWN_ERROR,
"No executor connection");
return;
}
if (pcmk__str_any_of(operation, PCMK_ACTION_RELOAD,
PCMK_ACTION_RELOAD_AGENT, NULL)) {
/* Pre-2.1.0 DCs will schedule reload actions only, and 2.1.0+ DCs
* will schedule reload-agent actions only. In either case, we need
* to map that to whatever the resource agent actually supports.
* Default to the OCF 1.1 name.
*/
if ((md != NULL)
&& pcmk_is_set(md->ra_flags, ra_supports_legacy_reload)) {
operation = PCMK_ACTION_RELOAD;
} else {
operation = PCMK_ACTION_RELOAD_AGENT;
}
}
op = construct_op(lrm_state, msg, rsc->id, operation);
CRM_CHECK(op != NULL, return);
if (should_cancel_recurring(rsc->id, operation, op->interval_ms)) {
guint removed = 0;
struct stop_recurring_action_s data;
data.rsc = rsc;
data.lrm_state = lrm_state;
removed = g_hash_table_foreach_remove(lrm_state->active_ops,
stop_recurring_action_by_rsc,
&data);
if (removed) {
crm_debug("Stopped %u recurring operation%s in preparation for "
PCMK__OP_FMT, removed, pcmk__plural_s(removed),
rsc->id, operation, op->interval_ms);
}
}
/* now do the op */
crm_notice("Requesting local execution of %s operation for %s on %s "
CRM_XS " transition_key=%s op_key=" PCMK__OP_FMT,
pcmk__readable_action(op->op_type, op->interval_ms), rsc->id,
lrm_state->node_name, pcmk__s(transition, ""), rsc->id,
operation, op->interval_ms);
nack_reason = should_nack_action(operation);
if (nack_reason != NULL) {
crm_notice("Discarding attempt to perform action %s on %s in state %s "
"(shutdown=%s)", operation, rsc->id,
fsa_state2string(controld_globals.fsa_state),
pcmk__btoa(pcmk_is_set(controld_globals.fsa_input_register,
R_SHUTDOWN)));
lrmd__set_result(op, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_INVALID,
nack_reason);
controld_ack_event_directly(NULL, NULL, rsc, op, rsc->id);
lrmd_free_event(op);
free(op_id);
return;
}
controld_record_pending_op(lrm_state->node_name, rsc, op);
op_id = pcmk__op_key(rsc->id, op->op_type, op->interval_ms);
if (op->interval_ms > 0) {
/* cancel it so we can then restart it without conflict */
cancel_op_key(lrm_state, rsc, op_id, FALSE);
}
rc = controld_execute_resource_agent(lrm_state, rsc->id, op->op_type,
op->user_data, op->interval_ms,
op->timeout, op->start_delay,
op->params, &call_id);
if (rc == pcmk_rc_ok) {
/* record all operations so we can wait
* for them to complete during shutdown
*/
char *call_id_s = make_stop_id(rsc->id, call_id);
active_op_t *pending = NULL;
pending = calloc(1, sizeof(active_op_t));
crm_trace("Recording pending op: %d - %s %s", call_id, op_id, call_id_s);
pending->call_id = call_id;
pending->interval_ms = op->interval_ms;
pending->op_type = strdup(operation);
pending->op_key = strdup(op_id);
pending->rsc_id = strdup(rsc->id);
pending->start_time = time(NULL);
pcmk__str_update(&pending->user_data, op->user_data);
if (crm_element_value_epoch(msg, XML_CONFIG_ATTR_SHUTDOWN_LOCK,
&(pending->lock_time)) != pcmk_ok) {
pending->lock_time = 0;
}
g_hash_table_replace(lrm_state->active_ops, call_id_s, pending);
if ((op->interval_ms > 0)
&& (op->start_delay > START_DELAY_THRESHOLD)) {
int target_rc = PCMK_OCF_OK;
crm_info("Faking confirmation of %s: execution postponed for over 5 minutes", op_id);
decode_transition_key(op->user_data, NULL, NULL, NULL, &target_rc);
lrmd__set_result(op, target_rc, PCMK_EXEC_DONE, NULL);
controld_ack_event_directly(NULL, NULL, rsc, op, rsc->id);
}
pending->params = op->params;
op->params = NULL;
} else if (lrm_state_is_local(lrm_state)) {
crm_err("Could not initiate %s action for resource %s locally: %s "
CRM_XS " rc=%d", operation, rsc->id, pcmk_rc_str(rc), rc);
fake_op_status(lrm_state, op, PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR, pcmk_rc_str(rc));
process_lrm_event(lrm_state, op, NULL, NULL);
register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
} else {
crm_err("Could not initiate %s action for resource %s remotely on %s: "
"%s " CRM_XS " rc=%d",
operation, rsc->id, lrm_state->node_name, pcmk_rc_str(rc), rc);
fake_op_status(lrm_state, op, PCMK_EXEC_NOT_CONNECTED,
PCMK_OCF_UNKNOWN_ERROR, pcmk_rc_str(rc));
process_lrm_event(lrm_state, op, NULL, NULL);
}
free(op_id);
lrmd_free_event(op);
}
void
do_lrm_event(long long action,
enum crmd_fsa_cause cause,
enum crmd_fsa_state cur_state, enum crmd_fsa_input cur_input, fsa_data_t * msg_data)
{
CRM_CHECK(FALSE, return);
}
static char *
unescape_newlines(const char *string)
{
char *pch = NULL;
char *ret = NULL;
static const char *escaped_newline = "\\n";
if (!string) {
return NULL;
}
ret = strdup(string);
pch = strstr(ret, escaped_newline);
while (pch != NULL) {
/* Replace newline escape pattern with actual newline (and a space so we
* don't have to shuffle the rest of the buffer)
*/
pch[0] = '\n';
pch[1] = ' ';
pch = strstr(pch, escaped_newline);
}
return ret;
}
static bool
did_lrm_rsc_op_fail(lrm_state_t *lrm_state, const char * rsc_id,
const char * op_type, guint interval_ms)
{
rsc_history_t *entry = NULL;
CRM_CHECK(lrm_state != NULL, return FALSE);
CRM_CHECK(rsc_id != NULL, return FALSE);
CRM_CHECK(op_type != NULL, return FALSE);
entry = g_hash_table_lookup(lrm_state->resource_history, rsc_id);
if (entry == NULL || entry->failed == NULL) {
return FALSE;
}
if (pcmk__str_eq(entry->failed->rsc_id, rsc_id, pcmk__str_none)
&& pcmk__str_eq(entry->failed->op_type, op_type, pcmk__str_casei)
&& entry->failed->interval_ms == interval_ms) {
return TRUE;
}
return FALSE;
}
/*!
* \internal
* \brief Log the result of an executor action (actual or synthesized)
*
* \param[in] op Executor action to log result for
* \param[in] op_key Operation key for action
* \param[in] node_name Name of node action was performed on, if known
* \param[in] confirmed Whether to log that graph action was confirmed
*/
static void
log_executor_event(const lrmd_event_data_t *op, const char *op_key,
const char *node_name, gboolean confirmed)
{
int log_level = LOG_ERR;
GString *str = g_string_sized_new(100); // reasonable starting size
pcmk__g_strcat(str,
"Result of ",
pcmk__readable_action(op->op_type, op->interval_ms),
" operation for ", op->rsc_id, NULL);
if (node_name != NULL) {
pcmk__g_strcat(str, " on ", node_name, NULL);
}
switch (op->op_status) {
case PCMK_EXEC_DONE:
log_level = LOG_NOTICE;
pcmk__g_strcat(str, ": ", services_ocf_exitcode_str(op->rc), NULL);
break;
case PCMK_EXEC_TIMEOUT:
pcmk__g_strcat(str,
": ", pcmk_exec_status_str(op->op_status), " after ",
pcmk__readable_interval(op->timeout), NULL);
break;
case PCMK_EXEC_CANCELLED:
log_level = LOG_INFO;
/* order of __attribute__ and Fall through comment is IMPORTANT!
* do not change it without proper testing with both clang and gcc
* in multiple versions.
* the clang check allows to build with all versions of clang.
* the has_c_attribute check is to workaround a bug in clang version
* in rhel7. has_attribute would happily return "YES SIR WE GOT IT"
* and fail the build the next line.
*/
#ifdef __clang__
#ifdef __has_c_attribute
#if __has_attribute(fallthrough)
__attribute__((fallthrough));
#endif
#endif
#endif
// Fall through
default:
pcmk__g_strcat(str, ": ", pcmk_exec_status_str(op->op_status),
NULL);
}
if ((op->exit_reason != NULL)
&& ((op->op_status != PCMK_EXEC_DONE) || (op->rc != PCMK_OCF_OK))) {
pcmk__g_strcat(str, " (", op->exit_reason, ")", NULL);
}
g_string_append(str, " " CRM_XS);
g_string_append_printf(str, " graph action %sconfirmed; call=%d key=%s",
(confirmed? "" : "un"), op->call_id, op_key);
if (op->op_status == PCMK_EXEC_DONE) {
g_string_append_printf(str, " rc=%d", op->rc);
}
do_crm_log(log_level, "%s", str->str);
g_string_free(str, TRUE);
/* The services library has already logged the output at info or debug
* level, so just raise to notice if it looks like a failure.
*/
if ((op->output != NULL) && (op->rc != PCMK_OCF_OK)) {
char *prefix = crm_strdup_printf(PCMK__OP_FMT "@%s output",
op->rsc_id, op->op_type,
op->interval_ms, node_name);
crm_log_output(LOG_NOTICE, prefix, op->output);
free(prefix);
}
}
void
process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
active_op_t *pending, const xmlNode *action_xml)
{
char *op_id = NULL;
char *op_key = NULL;
gboolean remove = FALSE;
gboolean removed = FALSE;
bool need_direct_ack = FALSE;
lrmd_rsc_info_t *rsc = NULL;
const char *node_name = NULL;
CRM_CHECK(op != NULL, return);
CRM_CHECK(op->rsc_id != NULL, return);
// Remap new status codes for older DCs
if (compare_version(controld_globals.dc_version, "3.2.0") < 0) {
switch (op->op_status) {
case PCMK_EXEC_NOT_CONNECTED:
lrmd__set_result(op, PCMK_OCF_CONNECTION_DIED,
PCMK_EXEC_ERROR, op->exit_reason);
break;
case PCMK_EXEC_INVALID:
lrmd__set_result(op, CRM_DIRECT_NACK_RC, PCMK_EXEC_ERROR,
op->exit_reason);
break;
default:
break;
}
}
op_id = make_stop_id(op->rsc_id, op->call_id);
op_key = pcmk__op_key(op->rsc_id, op->op_type, op->interval_ms);
// Get resource info if available (from executor state or action XML)
if (lrm_state) {
rsc = lrm_state_get_rsc_info(lrm_state, op->rsc_id, 0);
}
if ((rsc == NULL) && action_xml) {
xmlNode *xml = find_xml_node(action_xml, XML_CIB_TAG_RESOURCE, TRUE);
const char *standard = crm_element_value(xml, XML_AGENT_ATTR_CLASS);
const char *provider = crm_element_value(xml, XML_AGENT_ATTR_PROVIDER);
const char *type = crm_element_value(xml, XML_ATTR_TYPE);
if (standard && type) {
crm_info("%s agent information not cached, using %s%s%s:%s from action XML",
op->rsc_id, standard,
(provider? ":" : ""), (provider? provider : ""), type);
rsc = lrmd_new_rsc_info(op->rsc_id, standard, provider, type);
} else {
crm_err("Can't process %s result because %s agent information not cached or in XML",
op_key, op->rsc_id);
}
}
// Get node name if available (from executor state or action XML)
if (lrm_state) {
node_name = lrm_state->node_name;
} else if (action_xml) {
node_name = crm_element_value(action_xml, XML_LRM_ATTR_TARGET);
}
if(pending == NULL) {
remove = TRUE;
if (lrm_state) {
pending = g_hash_table_lookup(lrm_state->active_ops, op_id);
}
}
if (op->op_status == PCMK_EXEC_ERROR) {
switch(op->rc) {
case PCMK_OCF_NOT_RUNNING:
case PCMK_OCF_RUNNING_PROMOTED:
case PCMK_OCF_DEGRADED:
case PCMK_OCF_DEGRADED_PROMOTED:
// Leave it to the TE/scheduler to decide if this is an error
op->op_status = PCMK_EXEC_DONE;
break;
default:
/* Nothing to do */
break;
}
}
if (op->op_status != PCMK_EXEC_CANCELLED) {
/* We might not record the result, so directly acknowledge it to the
* originator instead, so it doesn't time out waiting for the result
* (especially important if part of a transition).
*/
need_direct_ack = TRUE;
if (controld_action_is_recordable(op->op_type)) {
if (node_name && rsc) {
// We should record the result, and happily, we can
time_t lock_time = (pending == NULL)? 0 : pending->lock_time;
controld_update_resource_history(node_name, rsc, op, lock_time);
need_direct_ack = FALSE;
} else if (op->rsc_deleted) {
/* We shouldn't record the result (likely the resource was
* refreshed, cleaned, or removed while this operation was
* in flight).
*/
crm_notice("Not recording %s result in CIB because "
"resource information was removed since it was initiated",
op_key);
} else {
/* This shouldn't be possible; the executor didn't consider the
* resource deleted, but we couldn't find resource or node
* information.
*/
crm_err("Unable to record %s result in CIB: %s", op_key,
(node_name? "No resource information" : "No node name"));
}
}
} else if (op->interval_ms == 0) {
/* A non-recurring operation was cancelled. Most likely, the
* never-initiated action was removed from the executor's pending
* operations list upon resource removal.
*/
need_direct_ack = TRUE;
} else if (pending == NULL) {
/* This recurring operation was cancelled, but was not pending. No
* transition actions are waiting on it, nothing needs to be done.
*/
} else if (op->user_data == NULL) {
/* This recurring operation was cancelled and pending, but we don't
* have a transition key. This should never happen.
*/
crm_err("Recurring operation %s was cancelled without transition information",
op_key);
} else if (pcmk_is_set(pending->flags, active_op_remove)) {
/* This recurring operation was cancelled (by us) and pending, and we
* have been waiting for it to finish.
*/
if (lrm_state) {
controld_delete_action_history(op);
}
/* Directly acknowledge failed recurring actions here. The above call to
* controld_delete_action_history() will not erase any corresponding
* last_failure entry, which means that the DC won't confirm the
* cancellation via process_op_deletion(), and the transition would
* otherwise wait for the action timer to pop.
*/
if (did_lrm_rsc_op_fail(lrm_state, pending->rsc_id,
pending->op_type, pending->interval_ms)) {
need_direct_ack = TRUE;
}
} else if (op->rsc_deleted) {
/* This recurring operation was cancelled (but not by us, and the
* executor does not have resource information, likely due to resource
* cleanup, refresh, or removal) and pending.
*/
crm_debug("Recurring op %s was cancelled due to resource deletion",
op_key);
need_direct_ack = TRUE;
} else {
/* This recurring operation was cancelled (but not by us, likely by the
* executor before stopping the resource) and pending. We don't need to
* do anything special.
*/
}
if (need_direct_ack) {
controld_ack_event_directly(NULL, NULL, NULL, op, op->rsc_id);
}
if(remove == FALSE) {
/* The caller will do this afterwards, but keep the logging consistent */
removed = TRUE;
} else if (lrm_state && ((op->interval_ms == 0)
|| (op->op_status == PCMK_EXEC_CANCELLED))) {
gboolean found = g_hash_table_remove(lrm_state->active_ops, op_id);
if (op->interval_ms != 0) {
removed = TRUE;
} else if (found) {
removed = TRUE;
crm_trace("Op %s (call=%d, stop-id=%s, remaining=%u): Confirmed",
op_key, op->call_id, op_id,
g_hash_table_size(lrm_state->active_ops));
}
}
log_executor_event(op, op_key, node_name, removed);
if (lrm_state) {
if (!pcmk__str_eq(op->op_type, PCMK_ACTION_META_DATA,
pcmk__str_casei)) {
crmd_alert_resource_op(lrm_state->node_name, op);
} else if (rsc && (op->rc == PCMK_OCF_OK)) {
char *metadata = unescape_newlines(op->output);
controld_cache_metadata(lrm_state->metadata_cache, rsc, metadata);
free(metadata);
}
}
if (op->rsc_deleted) {
crm_info("Deletion of resource '%s' complete after %s", op->rsc_id, op_key);
if (lrm_state) {
delete_rsc_entry(lrm_state, NULL, op->rsc_id, NULL, pcmk_ok, NULL,
true);
}
}
/* If a shutdown was escalated while operations were pending,
* then the FSA will be stalled right now... allow it to continue
*/
controld_trigger_fsa();
if (lrm_state && rsc) {
update_history_cache(lrm_state, rsc, op);
}
lrmd_free_rsc_info(rsc);
free(op_key);
free(op_id);
}
diff --git a/daemons/controld/controld_execd_state.c b/daemons/controld/controld_execd_state.c
index 8c68bfca08..b90cc5e635 100644
--- a/daemons/controld/controld_execd_state.c
+++ b/daemons/controld/controld_execd_state.c
@@ -1,814 +1,813 @@
/*
* Copyright 2012-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU General Public License version 2
* or later (GPLv2+) WITHOUT ANY WARRANTY.
*/
#include <crm_internal.h>
#include <errno.h>
#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/iso8601.h>
#include <crm/pengine/rules.h>
#include <crm/pengine/rules_internal.h>
#include <crm/lrmd_internal.h>
#include <pacemaker-internal.h>
#include <pacemaker-controld.h>
static GHashTable *lrm_state_table = NULL;
extern GHashTable *proxy_table;
int lrmd_internal_proxy_send(lrmd_t * lrmd, xmlNode *msg);
void lrmd_internal_set_proxy_callback(lrmd_t * lrmd, void *userdata, void (*callback)(lrmd_t *lrmd, void *userdata, xmlNode *msg));
static void
free_rsc_info(gpointer value)
{
lrmd_rsc_info_t *rsc_info = value;
lrmd_free_rsc_info(rsc_info);
}
static void
free_deletion_op(gpointer value)
{
struct pending_deletion_op_s *op = value;
free(op->rsc);
delete_ha_msg_input(op->input);
free(op);
}
static void
free_recurring_op(gpointer value)
{
active_op_t *op = value;
free(op->user_data);
free(op->rsc_id);
free(op->op_type);
free(op->op_key);
if (op->params) {
g_hash_table_destroy(op->params);
}
free(op);
}
static gboolean
fail_pending_op(gpointer key, gpointer value, gpointer user_data)
{
lrmd_event_data_t event = { 0, };
lrm_state_t *lrm_state = user_data;
active_op_t *op = value;
crm_trace("Pre-emptively failing " PCMK__OP_FMT " on %s (call=%s, %s)",
op->rsc_id, op->op_type, op->interval_ms,
lrm_state->node_name, (char*)key, op->user_data);
event.type = lrmd_event_exec_complete;
event.rsc_id = op->rsc_id;
event.op_type = op->op_type;
event.user_data = op->user_data;
event.timeout = 0;
event.interval_ms = op->interval_ms;
lrmd__set_result(&event, PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_NOT_CONNECTED,
"Action was pending when executor connection was dropped");
event.t_run = (unsigned int) op->start_time;
event.t_rcchange = (unsigned int) op->start_time;
event.call_id = op->call_id;
event.remote_nodename = lrm_state->node_name;
event.params = op->params;
process_lrm_event(lrm_state, &event, op, NULL);
lrmd__reset_result(&event);
return TRUE;
}
gboolean
lrm_state_is_local(lrm_state_t *lrm_state)
{
return (lrm_state != NULL)
&& pcmk__str_eq(lrm_state->node_name, controld_globals.our_nodename,
pcmk__str_casei);
}
/*!
* \internal
* \brief Create executor state entry for a node and add it to the state table
*
* \param[in] node_name Node to create entry for
*
* \return Newly allocated executor state object initialized for \p node_name
*/
static lrm_state_t *
lrm_state_create(const char *node_name)
{
lrm_state_t *state = NULL;
if (!node_name) {
crm_err("No node name given for lrm state object");
return NULL;
}
state = calloc(1, sizeof(lrm_state_t));
if (!state) {
return NULL;
}
state->node_name = strdup(node_name);
state->rsc_info_cache = pcmk__strkey_table(NULL, free_rsc_info);
state->deletion_ops = pcmk__strkey_table(free, free_deletion_op);
state->active_ops = pcmk__strkey_table(free, free_recurring_op);
state->resource_history = pcmk__strkey_table(NULL, history_free);
state->metadata_cache = metadata_cache_new();
g_hash_table_insert(lrm_state_table, (char *)state->node_name, state);
return state;
}
-void
-lrm_state_destroy(const char *node_name)
-{
- g_hash_table_remove(lrm_state_table, node_name);
-}
-
static gboolean
remote_proxy_remove_by_node(gpointer key, gpointer value, gpointer user_data)
{
remote_proxy_t *proxy = value;
const char *node_name = user_data;
if (pcmk__str_eq(node_name, proxy->node_name, pcmk__str_casei)) {
return TRUE;
}
return FALSE;
}
static remote_proxy_t *
find_connected_proxy_by_node(const char * node_name)
{
GHashTableIter gIter;
remote_proxy_t *proxy = NULL;
CRM_CHECK(proxy_table != NULL, return NULL);
g_hash_table_iter_init(&gIter, proxy_table);
while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) &proxy)) {
if (proxy->source
&& pcmk__str_eq(node_name, proxy->node_name, pcmk__str_casei)) {
return proxy;
}
}
return NULL;
}
static void
remote_proxy_disconnect_by_node(const char * node_name)
{
remote_proxy_t *proxy = NULL;
CRM_CHECK(proxy_table != NULL, return);
while ((proxy = find_connected_proxy_by_node(node_name)) != NULL) {
/* mainloop_del_ipc_client() eventually calls remote_proxy_disconnected()
* , which removes the entry from proxy_table.
* Do not do this in a g_hash_table_iter_next() loop. */
if (proxy->source) {
mainloop_del_ipc_client(proxy->source);
}
}
return;
}
static void
internal_lrm_state_destroy(gpointer data)
{
lrm_state_t *lrm_state = data;
if (!lrm_state) {
return;
}
/* Rather than directly remove the recorded proxy entries from proxy_table,
* make sure any connected proxies get disconnected. So that
* remote_proxy_disconnected() will be called and as well remove the
* entries from proxy_table.
*/
remote_proxy_disconnect_by_node(lrm_state->node_name);
crm_trace("Destroying proxy table %s with %u members",
lrm_state->node_name, g_hash_table_size(proxy_table));
// Just in case there's still any leftovers in proxy_table
g_hash_table_foreach_remove(proxy_table, remote_proxy_remove_by_node, (char *) lrm_state->node_name);
remote_ra_cleanup(lrm_state);
lrmd_api_delete(lrm_state->conn);
if (lrm_state->rsc_info_cache) {
crm_trace("Destroying rsc info cache with %u members",
g_hash_table_size(lrm_state->rsc_info_cache));
g_hash_table_destroy(lrm_state->rsc_info_cache);
}
if (lrm_state->resource_history) {
crm_trace("Destroying history op cache with %u members",
g_hash_table_size(lrm_state->resource_history));
g_hash_table_destroy(lrm_state->resource_history);
}
if (lrm_state->deletion_ops) {
crm_trace("Destroying deletion op cache with %u members",
g_hash_table_size(lrm_state->deletion_ops));
g_hash_table_destroy(lrm_state->deletion_ops);
}
if (lrm_state->active_ops != NULL) {
crm_trace("Destroying pending op cache with %u members",
g_hash_table_size(lrm_state->active_ops));
g_hash_table_destroy(lrm_state->active_ops);
}
metadata_cache_free(lrm_state->metadata_cache);
free((char *)lrm_state->node_name);
free(lrm_state);
}
void
lrm_state_reset_tables(lrm_state_t * lrm_state, gboolean reset_metadata)
{
if (lrm_state->resource_history) {
crm_trace("Resetting resource history cache with %u members",
g_hash_table_size(lrm_state->resource_history));
g_hash_table_remove_all(lrm_state->resource_history);
}
if (lrm_state->deletion_ops) {
crm_trace("Resetting deletion operations cache with %u members",
g_hash_table_size(lrm_state->deletion_ops));
g_hash_table_remove_all(lrm_state->deletion_ops);
}
if (lrm_state->active_ops != NULL) {
crm_trace("Resetting active operations cache with %u members",
g_hash_table_size(lrm_state->active_ops));
g_hash_table_remove_all(lrm_state->active_ops);
}
if (lrm_state->rsc_info_cache) {
crm_trace("Resetting resource information cache with %u members",
g_hash_table_size(lrm_state->rsc_info_cache));
g_hash_table_remove_all(lrm_state->rsc_info_cache);
}
if (reset_metadata) {
metadata_cache_reset(lrm_state->metadata_cache);
}
}
gboolean
lrm_state_init_local(void)
{
if (lrm_state_table) {
return TRUE;
}
lrm_state_table = pcmk__strikey_table(NULL, internal_lrm_state_destroy);
if (!lrm_state_table) {
return FALSE;
}
proxy_table = pcmk__strikey_table(NULL, remote_proxy_free);
if (!proxy_table) {
g_hash_table_destroy(lrm_state_table);
lrm_state_table = NULL;
return FALSE;
}
return TRUE;
}
void
lrm_state_destroy_all(void)
{
if (lrm_state_table) {
crm_trace("Destroying state table with %u members",
g_hash_table_size(lrm_state_table));
g_hash_table_destroy(lrm_state_table); lrm_state_table = NULL;
}
if(proxy_table) {
crm_trace("Destroying proxy table with %u members",
g_hash_table_size(proxy_table));
g_hash_table_destroy(proxy_table); proxy_table = NULL;
}
}
lrm_state_t *
lrm_state_find(const char *node_name)
{
- if (!node_name) {
+ if ((node_name == NULL) || (lrm_state_table == NULL)) {
return NULL;
}
return g_hash_table_lookup(lrm_state_table, node_name);
}
lrm_state_t *
lrm_state_find_or_create(const char *node_name)
{
lrm_state_t *lrm_state;
+ CRM_CHECK(lrm_state_table != NULL, return NULL);
+
lrm_state = g_hash_table_lookup(lrm_state_table, node_name);
if (!lrm_state) {
lrm_state = lrm_state_create(node_name);
}
return lrm_state;
}
GList *
lrm_state_get_list(void)
{
+ if (lrm_state_table == NULL) {
+ return NULL;
+ }
return g_hash_table_get_values(lrm_state_table);
}
void
lrm_state_disconnect_only(lrm_state_t * lrm_state)
{
int removed = 0;
if (!lrm_state->conn) {
return;
}
crm_trace("Disconnecting %s", lrm_state->node_name);
remote_proxy_disconnect_by_node(lrm_state->node_name);
((lrmd_t *) lrm_state->conn)->cmds->disconnect(lrm_state->conn);
if (!pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
removed = g_hash_table_foreach_remove(lrm_state->active_ops,
fail_pending_op, lrm_state);
crm_trace("Synthesized %d operation failures for %s", removed, lrm_state->node_name);
}
}
void
lrm_state_disconnect(lrm_state_t * lrm_state)
{
if (!lrm_state->conn) {
return;
}
lrm_state_disconnect_only(lrm_state);
lrmd_api_delete(lrm_state->conn);
lrm_state->conn = NULL;
}
int
lrm_state_is_connected(lrm_state_t * lrm_state)
{
if (!lrm_state->conn) {
return FALSE;
}
return ((lrmd_t *) lrm_state->conn)->cmds->is_connected(lrm_state->conn);
}
int
lrm_state_poke_connection(lrm_state_t * lrm_state)
{
if (!lrm_state->conn) {
return -ENOTCONN;
}
return ((lrmd_t *) lrm_state->conn)->cmds->poke_connection(lrm_state->conn);
}
// \return Standard Pacemaker return code
int
controld_connect_local_executor(lrm_state_t *lrm_state)
{
int rc = pcmk_rc_ok;
if (lrm_state->conn == NULL) {
lrmd_t *api = NULL;
rc = lrmd__new(&api, NULL, NULL, 0);
if (rc != pcmk_rc_ok) {
return rc;
}
api->cmds->set_callback(api, lrm_op_callback);
lrm_state->conn = api;
}
rc = ((lrmd_t *) lrm_state->conn)->cmds->connect(lrm_state->conn,
CRM_SYSTEM_CRMD, NULL);
rc = pcmk_legacy2rc(rc);
if (rc == pcmk_rc_ok) {
lrm_state->num_lrm_register_fails = 0;
} else {
lrm_state->num_lrm_register_fails++;
}
return rc;
}
static remote_proxy_t *
crmd_remote_proxy_new(lrmd_t *lrmd, const char *node_name, const char *session_id, const char *channel)
{
struct ipc_client_callbacks proxy_callbacks = {
.dispatch = remote_proxy_dispatch,
.destroy = remote_proxy_disconnected
};
remote_proxy_t *proxy = remote_proxy_new(lrmd, &proxy_callbacks, node_name,
session_id, channel);
return proxy;
}
gboolean
crmd_is_proxy_session(const char *session)
{
return g_hash_table_lookup(proxy_table, session) ? TRUE : FALSE;
}
void
crmd_proxy_send(const char *session, xmlNode *msg)
{
remote_proxy_t *proxy = g_hash_table_lookup(proxy_table, session);
lrm_state_t *lrm_state = NULL;
if (!proxy) {
return;
}
crm_log_xml_trace(msg, "to-proxy");
lrm_state = lrm_state_find(proxy->node_name);
if (lrm_state) {
crm_trace("Sending event to %.8s on %s", proxy->session_id, proxy->node_name);
remote_proxy_relay_event(proxy, msg);
}
}
static void
crmd_proxy_dispatch(const char *session, xmlNode *msg)
{
crm_trace("Processing proxied IPC message from session %s", session);
crm_log_xml_trace(msg, "controller[inbound]");
crm_xml_add(msg, F_CRM_SYS_FROM, session);
if (controld_authorize_ipc_message(msg, NULL, session)) {
route_message(C_IPC_MESSAGE, msg);
}
controld_trigger_fsa();
}
static void
remote_config_check(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
{
if (rc != pcmk_ok) {
crm_err("Query resulted in an error: %s", pcmk_strerror(rc));
if (rc == -EACCES || rc == -pcmk_err_schema_validation) {
crm_err("The cluster is mis-configured - shutting down and staying down");
}
} else {
lrmd_t * lrmd = (lrmd_t *)user_data;
crm_time_t *now = crm_time_new(NULL);
GHashTable *config_hash = pcmk__strkey_table(free, free);
crm_debug("Call %d : Parsing CIB options", call_id);
pe_unpack_nvpairs(output, output, XML_CIB_TAG_PROPSET, NULL,
config_hash, CIB_OPTIONS_FIRST, FALSE, now, NULL);
/* Now send it to the remote peer */
lrmd__validate_remote_settings(lrmd, config_hash);
g_hash_table_destroy(config_hash);
crm_time_free(now);
}
}
static void
crmd_remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
{
lrm_state_t *lrm_state = userdata;
const char *session = crm_element_value(msg, F_LRMD_IPC_SESSION);
remote_proxy_t *proxy = g_hash_table_lookup(proxy_table, session);
const char *op = crm_element_value(msg, F_LRMD_IPC_OP);
if (pcmk__str_eq(op, LRMD_IPC_OP_NEW, pcmk__str_casei)) {
const char *channel = crm_element_value(msg, F_LRMD_IPC_IPC_SERVER);
proxy = crmd_remote_proxy_new(lrmd, lrm_state->node_name, session, channel);
if (!remote_ra_controlling_guest(lrm_state)) {
if (proxy != NULL) {
cib_t *cib_conn = controld_globals.cib_conn;
/* Look up stonith-watchdog-timeout and send to the remote peer for validation */
int rc = cib_conn->cmds->query(cib_conn, XML_CIB_TAG_CRMCONFIG,
NULL, cib_scope_local);
cib_conn->cmds->register_callback_full(cib_conn, rc, 10, FALSE,
lrmd,
"remote_config_check",
remote_config_check,
NULL);
}
} else {
crm_debug("Skipping remote_config_check for guest-nodes");
}
} else if (pcmk__str_eq(op, LRMD_IPC_OP_SHUTDOWN_REQ, pcmk__str_casei)) {
char *now_s = NULL;
crm_notice("%s requested shutdown of its remote connection",
lrm_state->node_name);
if (!remote_ra_is_in_maintenance(lrm_state)) {
now_s = pcmk__ttoa(time(NULL));
update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE);
free(now_s);
remote_proxy_ack_shutdown(lrmd);
crm_warn("Reconnection attempts to %s may result in failures that must be cleared",
lrm_state->node_name);
} else {
remote_proxy_nack_shutdown(lrmd);
crm_notice("Remote resource for %s is not managed so no ordered shutdown happening",
lrm_state->node_name);
}
return;
} else if (pcmk__str_eq(op, LRMD_IPC_OP_REQUEST, pcmk__str_casei) && proxy && proxy->is_local) {
/* This is for the controller, which we are, so don't try
* to send to ourselves over IPC -- do it directly.
*/
int flags = 0;
xmlNode *request = get_message_xml(msg, F_LRMD_IPC_MSG);
CRM_CHECK(request != NULL, return);
CRM_CHECK(lrm_state->node_name, return);
crm_xml_add(request, XML_ACL_TAG_ROLE, "pacemaker-remote");
pcmk__update_acl_user(request, F_LRMD_IPC_USER, lrm_state->node_name);
/* Pacemaker Remote nodes don't know their own names (as known to the
* cluster). When getting a node info request with no name or ID, add
* the name, so we don't return info for ourselves instead of the
* Pacemaker Remote node.
*/
if (pcmk__str_eq(crm_element_value(request, F_CRM_TASK), CRM_OP_NODE_INFO, pcmk__str_casei)) {
int node_id = 0;
crm_element_value_int(request, XML_ATTR_ID, &node_id);
if ((node_id <= 0)
&& (crm_element_value(request, XML_ATTR_UNAME) == NULL)) {
crm_xml_add(request, XML_ATTR_UNAME, lrm_state->node_name);
}
}
crmd_proxy_dispatch(session, request);
crm_element_value_int(msg, F_LRMD_IPC_MSG_FLAGS, &flags);
if (flags & crm_ipc_client_response) {
int msg_id = 0;
xmlNode *op_reply = create_xml_node(NULL, "ack");
crm_xml_add(op_reply, "function", __func__);
crm_xml_add_int(op_reply, "line", __LINE__);
crm_element_value_int(msg, F_LRMD_IPC_MSG_ID, &msg_id);
remote_proxy_relay_response(proxy, op_reply, msg_id);
free_xml(op_reply);
}
} else {
remote_proxy_cb(lrmd, lrm_state->node_name, msg);
}
}
// \return Standard Pacemaker return code
int
controld_connect_remote_executor(lrm_state_t *lrm_state, const char *server,
int port, int timeout_ms)
{
int rc = pcmk_rc_ok;
if (lrm_state->conn == NULL) {
lrmd_t *api = NULL;
rc = lrmd__new(&api, lrm_state->node_name, server, port);
if (rc != pcmk_rc_ok) {
crm_warn("Pacemaker Remote connection to %s:%s failed: %s "
CRM_XS " rc=%d", server, port, pcmk_rc_str(rc), rc);
return rc;
}
lrm_state->conn = api;
api->cmds->set_callback(api, remote_lrm_op_callback);
lrmd_internal_set_proxy_callback(api, lrm_state, crmd_remote_proxy_cb);
}
crm_trace("Initiating remote connection to %s:%d with timeout %dms",
server, port, timeout_ms);
rc = ((lrmd_t *) lrm_state->conn)->cmds->connect_async(lrm_state->conn,
lrm_state->node_name,
timeout_ms);
if (rc == pcmk_ok) {
lrm_state->num_lrm_register_fails = 0;
} else {
lrm_state->num_lrm_register_fails++; // Ignored for remote connections
}
return pcmk_legacy2rc(rc);
}
int
lrm_state_get_metadata(lrm_state_t * lrm_state,
const char *class,
const char *provider,
const char *agent, char **output, enum lrmd_call_options options)
{
lrmd_key_value_t *params = NULL;
if (!lrm_state->conn) {
return -ENOTCONN;
}
/* Add the node name to the environment, as is done with normal resource
* action calls. Meta-data calls shouldn't need it, but some agents are
* written with an ocf_local_nodename call at the beginning regardless of
* action. Without the environment variable, the agent would try to contact
* the controller to get the node name -- but the controller would be
* blocking on the synchronous meta-data call.
*
* At this point, we have to assume that agents are unlikely to make other
* calls that require the controller, such as crm_node --quorum or
* --cluster-id.
*
* @TODO Make meta-data calls asynchronous. (This will be part of a larger
* project to make meta-data calls via the executor rather than directly.)
*/
params = lrmd_key_value_add(params, CRM_META "_" XML_LRM_ATTR_TARGET,
lrm_state->node_name);
return ((lrmd_t *) lrm_state->conn)->cmds->get_metadata_params(lrm_state->conn,
class, provider, agent, output, options, params);
}
int
lrm_state_cancel(lrm_state_t *lrm_state, const char *rsc_id, const char *action,
guint interval_ms)
{
if (!lrm_state->conn) {
return -ENOTCONN;
}
/* Figure out a way to make this async?
* NOTICE: Currently it's synced and directly acknowledged in do_lrm_invoke(). */
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
return remote_ra_cancel(lrm_state, rsc_id, action, interval_ms);
}
return ((lrmd_t *) lrm_state->conn)->cmds->cancel(lrm_state->conn, rsc_id,
action, interval_ms);
}
lrmd_rsc_info_t *
lrm_state_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id, enum lrmd_call_options options)
{
lrmd_rsc_info_t *rsc = NULL;
if (!lrm_state->conn) {
return NULL;
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
return remote_ra_get_rsc_info(lrm_state, rsc_id);
}
rsc = g_hash_table_lookup(lrm_state->rsc_info_cache, rsc_id);
if (rsc == NULL) {
/* only contact the lrmd if we don't already have a cached rsc info */
rsc = ((lrmd_t *) lrm_state->conn)->cmds->get_rsc_info(lrm_state->conn, rsc_id, options);
if (rsc == NULL) {
return NULL;
}
/* cache the result */
g_hash_table_insert(lrm_state->rsc_info_cache, rsc->id, rsc);
}
return lrmd_copy_rsc_info(rsc);
}
/*!
* \internal
* \brief Initiate a resource agent action
*
* \param[in,out] lrm_state Executor state object
* \param[in] rsc_id ID of resource for action
* \param[in] action Action to execute
* \param[in] userdata String to copy and pass to execution callback
* \param[in] interval_ms Action interval (in milliseconds)
* \param[in] timeout_ms Action timeout (in milliseconds)
* \param[in] start_delay_ms Delay (in ms) before initiating action
* \param[in] parameters Hash table of resource parameters
* \param[out] call_id Where to store call ID on success
*
* \return Standard Pacemaker return code
*/
int
controld_execute_resource_agent(lrm_state_t *lrm_state, const char *rsc_id,
const char *action, const char *userdata,
guint interval_ms, int timeout_ms,
int start_delay_ms, GHashTable *parameters,
int *call_id)
{
int rc = pcmk_rc_ok;
lrmd_key_value_t *params = NULL;
if (lrm_state->conn == NULL) {
return ENOTCONN;
}
// Convert parameters from hash table to list
if (parameters != NULL) {
const char *key = NULL;
const char *value = NULL;
GHashTableIter iter;
g_hash_table_iter_init(&iter, parameters);
while (g_hash_table_iter_next(&iter, (gpointer *) &key,
(gpointer *) &value)) {
params = lrmd_key_value_add(params, key, value);
}
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
rc = controld_execute_remote_agent(lrm_state, rsc_id, action,
userdata, interval_ms, timeout_ms,
start_delay_ms, params, call_id);
} else {
rc = ((lrmd_t *) lrm_state->conn)->cmds->exec(lrm_state->conn, rsc_id,
action, userdata,
interval_ms, timeout_ms,
start_delay_ms,
lrmd_opt_notify_changes_only,
params);
if (rc < 0) {
rc = pcmk_legacy2rc(rc);
} else {
*call_id = rc;
rc = pcmk_rc_ok;
}
}
return rc;
}
int
lrm_state_register_rsc(lrm_state_t * lrm_state,
const char *rsc_id,
const char *class,
const char *provider, const char *agent, enum lrmd_call_options options)
{
lrmd_t *conn = (lrmd_t *) lrm_state->conn;
if (conn == NULL) {
return -ENOTCONN;
}
if (is_remote_lrmd_ra(agent, provider, NULL)) {
return lrm_state_find_or_create(rsc_id)? pcmk_ok : -EINVAL;
}
/* @TODO Implement an asynchronous version of this (currently a blocking
* call to the lrmd).
*/
return conn->cmds->register_rsc(lrm_state->conn, rsc_id, class, provider,
agent, options);
}
int
lrm_state_unregister_rsc(lrm_state_t * lrm_state,
const char *rsc_id, enum lrmd_call_options options)
{
if (!lrm_state->conn) {
return -ENOTCONN;
}
if (is_remote_lrmd_ra(NULL, NULL, rsc_id)) {
- lrm_state_destroy(rsc_id);
+ g_hash_table_remove(lrm_state_table, rsc_id);
return pcmk_ok;
}
g_hash_table_remove(lrm_state->rsc_info_cache, rsc_id);
/* @TODO Optimize this ... this function is a blocking round trip from
* client to daemon. The controld_execd_state.c code path that uses this
* function should always treat it as an async operation. The executor API
* should make an async version available.
*/
return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options);
}
diff --git a/daemons/controld/controld_lrm.h b/daemons/controld/controld_lrm.h
index 25f3db3316..c3113e49c3 100644
--- a/daemons/controld/controld_lrm.h
+++ b/daemons/controld/controld_lrm.h
@@ -1,188 +1,183 @@
/*
* Copyright 2004-2023 the Pacemaker project contributors
*
* The version control history for this file may have further details.
*
* This source code is licensed under the GNU Lesser General Public License
* version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
*/
#ifndef CONTROLD_LRM__H
# define CONTROLD_LRM__H
#include <controld_messages.h>
extern gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level);
void lrm_clear_last_failure(const char *rsc_id, const char *node_name,
const char *operation, guint interval_ms);
void lrm_op_callback(lrmd_event_data_t * op);
lrmd_t *crmd_local_lrmd_conn(void);
typedef struct resource_history_s {
char *id;
uint32_t last_callid;
lrmd_rsc_info_t rsc;
lrmd_event_data_t *last;
lrmd_event_data_t *failed;
GList *recurring_op_list;
/* Resources must be stopped using the same
* parameters they were started with. This hashtable
* holds the parameters that should be used for the next stop
* cmd on this resource. */
GHashTable *stop_params;
} rsc_history_t;
void history_free(gpointer data);
enum active_op_e {
active_op_remove = (1 << 0),
active_op_cancelled = (1 << 1),
};
// In-flight action (recurring or pending)
typedef struct active_op_s {
guint interval_ms;
int call_id;
uint32_t flags; // bitmask of active_op_e
time_t start_time;
time_t lock_time;
char *rsc_id;
char *op_type;
char *op_key;
char *user_data;
GHashTable *params;
} active_op_t;
#define controld_set_active_op_flags(active_op, flags_to_set) do { \
(active_op)->flags = pcmk__set_flags_as(__func__, __LINE__, \
LOG_TRACE, "Active operation", (active_op)->op_key, \
(active_op)->flags, (flags_to_set), #flags_to_set); \
} while (0)
#define controld_clear_active_op_flags(active_op, flags_to_clear) do { \
(active_op)->flags = pcmk__clear_flags_as(__func__, __LINE__, \
LOG_TRACE, "Active operation", (active_op)->op_key, \
(active_op)->flags, (flags_to_clear), #flags_to_clear); \
} while (0)
typedef struct lrm_state_s {
const char *node_name;
void *conn; // Reserved for controld_execd_state.c usage
void *remote_ra_data; // Reserved for controld_remote_ra.c usage
GHashTable *resource_history;
GHashTable *active_ops; // Pending and recurring actions
GHashTable *deletion_ops;
GHashTable *rsc_info_cache;
GHashTable *metadata_cache; // key = class[:provider]:agent, value = ra_metadata_s
int num_lrm_register_fails;
} lrm_state_t;
struct pending_deletion_op_s {
char *rsc;
ha_msg_input_t *input;
};
/*!
* \brief Check whether this the local IPC connection to the executor
*/
gboolean
lrm_state_is_local(lrm_state_t *lrm_state);
/*!
* \brief Clear all state information from a single state entry.
* \note It sometimes useful to save metadata cache when it won't go stale.
* \note This does not close the executor connection
*/
void lrm_state_reset_tables(lrm_state_t * lrm_state, gboolean reset_metadata);
GList *lrm_state_get_list(void);
/*!
* \brief Initiate internal state tables
*/
gboolean lrm_state_init_local(void);
/*!
* \brief Destroy all state entries and internal state tables
*/
void lrm_state_destroy_all(void);
-/*!
- * \brief Destroy executor connection by node name
- */
-void lrm_state_destroy(const char *node_name);
-
/*!
* \brief Find lrm_state data by node name
*/
lrm_state_t *lrm_state_find(const char *node_name);
/*!
* \brief Either find or create a new entry
*/
lrm_state_t *lrm_state_find_or_create(const char *node_name);
/*!
* The functions below are wrappers for the executor API the the controller
* uses. These wrapper functions allow us to treat the controller's remote
* executor connection resources the same as regular resources. Internally,
* regular resources go to the executor, and remote connection resources are
* handled locally in the controller.
*/
void lrm_state_disconnect_only(lrm_state_t * lrm_state);
void lrm_state_disconnect(lrm_state_t * lrm_state);
int controld_connect_local_executor(lrm_state_t *lrm_state);
int controld_connect_remote_executor(lrm_state_t *lrm_state, const char *server,
int port, int timeout);
int lrm_state_is_connected(lrm_state_t * lrm_state);
int lrm_state_poke_connection(lrm_state_t * lrm_state);
int lrm_state_get_metadata(lrm_state_t * lrm_state,
const char *class,
const char *provider,
const char *agent, char **output, enum lrmd_call_options options);
int lrm_state_cancel(lrm_state_t *lrm_state, const char *rsc_id,
const char *action, guint interval_ms);
int controld_execute_resource_agent(lrm_state_t *lrm_state, const char *rsc_id,
const char *action, const char *userdata,
guint interval_ms, int timeout_ms,
int start_delay_ms,
GHashTable *parameters, int *call_id);
lrmd_rsc_info_t *lrm_state_get_rsc_info(lrm_state_t * lrm_state,
const char *rsc_id, enum lrmd_call_options options);
int lrm_state_register_rsc(lrm_state_t * lrm_state,
const char *rsc_id,
const char *class,
const char *provider, const char *agent, enum lrmd_call_options options);
int lrm_state_unregister_rsc(lrm_state_t * lrm_state,
const char *rsc_id, enum lrmd_call_options options);
// Functions used to manage remote executor connection resources
void remote_lrm_op_callback(lrmd_event_data_t * op);
gboolean is_remote_lrmd_ra(const char *agent, const char *provider, const char *id);
lrmd_rsc_info_t *remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id);
int remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
const char *action, guint interval_ms);
int controld_execute_remote_agent(const lrm_state_t *lrm_state,
const char *rsc_id, const char *action,
const char *userdata,
guint interval_ms, int timeout_ms,
int start_delay_ms, lrmd_key_value_t *params,
int *call_id);
void remote_ra_cleanup(lrm_state_t * lrm_state);
void remote_ra_fail(const char *node_name);
void remote_ra_process_pseudo(xmlNode *xml);
gboolean remote_ra_is_in_maintenance(lrm_state_t * lrm_state);
void remote_ra_process_maintenance_nodes(xmlNode *xml);
gboolean remote_ra_controlling_guest(lrm_state_t * lrm_state);
void process_lrm_event(lrm_state_t *lrm_state, lrmd_event_data_t *op,
active_op_t *pending, const xmlNode *action_xml);
void controld_ack_event_directly(const char *to_host, const char *to_sys,
const lrmd_rsc_info_t *rsc,
lrmd_event_data_t *op, const char *rsc_id);
void controld_rc2event(lrmd_event_data_t *event, int rc);
void controld_trigger_delete_refresh(const char *from_sys, const char *rsc_id);
#endif
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Mon, Jul 21, 2:58 AM (5 h, 2 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
2082970
Default Alt Text
(161 KB)
Attached To
Mode
rP Pacemaker
Attached
Detach File
Event Timeline
Log In to Comment