diff --git a/crmd/crmd_alerts.c b/crmd/crmd_alerts.c index 11d7239bf5..9e306c5c99 100644 --- a/crmd/crmd_alerts.c +++ b/crmd/crmd_alerts.c @@ -1,378 +1,378 @@ /* * Copyright (C) 2015 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include "crmd_alerts.h" #include "crmd_messages.h" #include #include static char *notify_script = NULL; static char *notify_target = NULL; static int alerts_inflight = 0; static gboolean draining_alerts = FALSE; /* - * syncronize local data with cib + * synchronize local data with cib */ static GHashTable * get_meta_attrs_from_cib(xmlNode *basenode, crm_alert_entry_t *entry, guint *max_timeout) { GHashTable *config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); crm_time_t *now = crm_time_new(NULL); const char *value = NULL; unpack_instance_attributes(basenode, basenode, XML_TAG_META_SETS, NULL, config_hash, NULL, FALSE, now); value = g_hash_table_lookup(config_hash, XML_ALERT_ATTR_TIMEOUT); if (value) { entry->timeout = crm_get_msec(value); if (entry->timeout <= 0) { if (entry->timeout == 0) { crm_trace("Setting timeout to default %dmsec", CRM_ALERT_DEFAULT_TIMEOUT_MS); } else { crm_warn("Invalid timeout value setting to default %dmsec", CRM_ALERT_DEFAULT_TIMEOUT_MS); } entry->timeout = CRM_ALERT_DEFAULT_TIMEOUT_MS; } else { crm_trace("Found timeout %dmsec", entry->timeout); } if (entry->timeout > *max_timeout) { *max_timeout = entry->timeout; } } value = g_hash_table_lookup(config_hash, XML_ALERT_ATTR_TSTAMP_FORMAT); if (value) { /* hard to do any checks here as merely anything can * can be a valid time-format-string */ entry->tstamp_format = (char *) value; crm_trace("Found timestamp format string '%s'", value); } crm_time_free(now); return config_hash; /* keep hash as long as strings are needed */ } void parse_alerts(xmlNode *alerts) { xmlNode *alert; crm_alert_entry_t entry; guint max_timeout = 0; crm_free_alert_list(); crm_alert_max_alert_timeout = CRM_ALERT_DEFAULT_TIMEOUT_MS; if (alerts) { crm_info("We have an alerts section in the cib"); if (notify_script) { crm_warn("Cib contains configuration for Legacy Notifications " "which is overruled by alerts section"); } } else { crm_info("No optional alerts section in cib"); if (notify_script) { entry = (crm_alert_entry_t) { .id = (char *) "legacy_notification", .path = notify_script, .timeout = CRM_ALERT_DEFAULT_TIMEOUT_MS, .recipient = notify_target }; crm_add_dup_alert_list_entry(&entry); crm_info("Legacy Notifications enabled"); } return; } for (alert = first_named_child(alerts, XML_CIB_TAG_ALERT); alert; alert = __xml_next(alert)) { xmlNode *recipient; int recipients = 0, envvars = 0; GHashTable *config_hash = NULL; entry = (crm_alert_entry_t) { .id = (char *) crm_element_value(alert, XML_ATTR_ID), .path = (char *) crm_element_value(alert, XML_ALERT_ATTR_PATH), .timeout = CRM_ALERT_DEFAULT_TIMEOUT_MS, .tstamp_format = (char *) CRM_ALERT_DEFAULT_TSTAMP_FORMAT }; crm_get_envvars_from_cib(alert, &entry, &envvars); config_hash = get_meta_attrs_from_cib(alert, &entry, &max_timeout); crm_debug("Found alert: id=%s, path=%s, timeout=%d, " "tstamp_format=%s, %d additional environment variables", entry.id, entry.path, entry.timeout, entry.tstamp_format, envvars); for (recipient = first_named_child(alert, XML_CIB_TAG_ALERT_RECIPIENT); recipient; recipient = __xml_next(recipient)) { int envvars_added = 0; entry.recipient = (char *) crm_element_value(recipient, XML_ALERT_ATTR_REC_VALUE); recipients++; crm_get_envvars_from_cib(recipient, &entry, &envvars_added); { crm_alert_entry_t recipient_entry = entry; GHashTable *config_hash = get_meta_attrs_from_cib(recipient, &recipient_entry, &max_timeout); crm_add_dup_alert_list_entry(&recipient_entry); crm_debug("Alert has recipient: id=%s, value=%s, " "%d additional environment variables", crm_element_value(recipient, XML_ATTR_ID), recipient_entry.recipient, envvars_added); g_hash_table_destroy(config_hash); } crm_drop_envvars(&entry, envvars_added); } if (recipients == 0) { crm_add_dup_alert_list_entry(&entry); } crm_drop_envvars(&entry, -1); g_hash_table_destroy(config_hash); } if (max_timeout > 0) { crm_alert_max_alert_timeout = max_timeout; } } /* * end of synchronization of local data with cib */ void crmd_enable_alerts(const char *script, const char *target) { free(notify_script); notify_script = ((script) && (strcmp(script,"/dev/null")))?strdup(script):NULL; free(notify_target); notify_target = (target != NULL)?strdup(target):NULL; } static void crmd_alert_complete(svc_action_t *op) { alerts_inflight--; if(op->rc == 0) { crm_info("Alert %d (%s) complete", op->sequence, op->agent); } else { crm_warn("Alert %d (%s) failed: %d", op->sequence, op->agent, op->rc); } } static void send_alerts(const char *kind) { svc_action_t *alert = NULL; static int operations = 0; GListPtr l; crm_time_hr_t *now = crm_time_hr_new(NULL); crm_set_alert_key(CRM_alert_kind, kind); crm_set_alert_key(CRM_alert_version, VERSION); for (l = g_list_first(crm_alert_list); l; l = g_list_next(l)) { crm_alert_entry_t *entry = (crm_alert_entry_t *)(l->data); char *timestamp = crm_time_format_hr(entry->tstamp_format, now); operations++; if (!draining_alerts) { crm_debug("Sending '%s' alert to '%s' via '%s'", kind, entry->recipient, entry->path); crm_set_alert_key(CRM_alert_recipient, entry->recipient); crm_set_alert_key_int(CRM_alert_node_sequence, operations); crm_set_alert_key(CRM_alert_timestamp, timestamp); alert = services_action_create_generic(entry->path, NULL); alert->timeout = entry->timeout; alert->standard = strdup("event"); alert->id = strdup(entry->id); alert->agent = strdup(entry->path); alert->sequence = operations; crm_set_envvar_list(entry); alerts_inflight++; if(services_action_async(alert, &crmd_alert_complete) == FALSE) { services_action_free(alert); alerts_inflight--; } crm_unset_envvar_list(entry); } else { crm_warn("Ignoring '%s' alert to '%s' via '%s' received " "while shutting down", kind, entry->recipient, entry->path); } free(timestamp); } crm_unset_alert_keys(); if (now) { free(now); } } void crmd_alert_node_event(crm_node_t *node) { if(!crm_alert_list) { return; } crm_set_alert_key(CRM_alert_node, node->uname); crm_set_alert_key_int(CRM_alert_nodeid, node->id); crm_set_alert_key(CRM_alert_desc, node->state); send_alerts("node"); } void crmd_alert_fencing_op(stonith_event_t * e) { char *desc = NULL; if (!crm_alert_list) { return; } desc = crm_strdup_printf( "Operation %s of %s by %s for %s@%s: %s (ref=%s)", e->action, e->target, e->executioner ? e->executioner : "", e->client_origin, e->origin, pcmk_strerror(e->result), e->id); crm_set_alert_key(CRM_alert_node, e->target); crm_set_alert_key(CRM_alert_task, e->operation); crm_set_alert_key(CRM_alert_desc, desc); crm_set_alert_key_int(CRM_alert_rc, e->result); send_alerts("fencing"); free(desc); } void crmd_alert_resource_op(const char *node, lrmd_event_data_t * op) { int target_rc = 0; if(!crm_alert_list) { return; } target_rc = rsc_op_expected_rc(op); if(op->interval == 0 && target_rc == op->rc && safe_str_eq(op->op_type, RSC_STATUS)) { /* Leave it up to the script if they want to alert for * 'failed' probes, only swallow ones for which the result was * unexpected. * * Even if we find a resource running, it was probably because * someone erased the status section. */ return; } crm_set_alert_key(CRM_alert_node, node); crm_set_alert_key(CRM_alert_rsc, op->rsc_id); crm_set_alert_key(CRM_alert_task, op->op_type); crm_set_alert_key_int(CRM_alert_interval, op->interval); crm_set_alert_key_int(CRM_alert_target_rc, target_rc); crm_set_alert_key_int(CRM_alert_status, op->op_status); crm_set_alert_key_int(CRM_alert_rc, op->rc); if(op->op_status == PCMK_LRM_OP_DONE) { crm_set_alert_key(CRM_alert_desc, services_ocf_exitcode_str(op->rc)); } else { crm_set_alert_key(CRM_alert_desc, services_lrm_status_str(op->op_status)); } send_alerts("resource"); } static gboolean alert_drain_timeout_callback(gpointer user_data) { gboolean *timeout_popped = (gboolean *) user_data; *timeout_popped = TRUE; return FALSE; } void crmd_drain_alerts(GMainContext *ctx) { guint timer; gboolean timeout_popped = FALSE; draining_alerts = TRUE; timer = g_timeout_add(crm_alert_max_alert_timeout + 5000, alert_drain_timeout_callback, (gpointer) &timeout_popped); while(alerts_inflight && !timeout_popped) { crm_trace("Draining mainloop while still %d alerts are in flight (timeout=%dms)", alerts_inflight, crm_alert_max_alert_timeout + 5000); g_main_context_iteration(ctx, TRUE); } if (!timeout_popped && (timer > 0)) { g_source_remove(timer); } } diff --git a/crmd/subsystems.c b/crmd/subsystems.c index 5bef64751d..499e5da96b 100644 --- a/crmd/subsystems.c +++ b/crmd/subsystems.c @@ -1,186 +1,186 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include /* for access */ #include /* for calls to open */ #include /* for calls to open */ #include /* for calls to open */ #include /* for getpwuid */ #include /* for initgroups */ #include #include #include /* for getrlimit */ #include #include #include /* for getrlimit */ #include #include #include #include #include #include #include #include #include static void crmd_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode) { /* struct crm_subsystem_s *the_subsystem = mainloop_child_userdata(p); */ const char *name = mainloop_child_name(p); if (signo) { crm_notice("Child process %s terminated with signal %d (pid=%d, core=%d)", name, signo, pid, core); } else { do_crm_log(exitcode == 0 ? LOG_INFO : LOG_ERR, "Child process %s exited (pid=%d, rc=%d)", name, pid, exitcode); } } gboolean stop_subsystem(struct crm_subsystem_s *the_subsystem, gboolean force_quit) { int quit_signal = SIGTERM; crm_trace("Stopping sub-system \"%s\"", the_subsystem->name); clear_bit(fsa_input_register, the_subsystem->flag_required); if (the_subsystem->pid <= 0) { crm_trace("Client %s not running", the_subsystem->name); return FALSE; } if (is_set(fsa_input_register, the_subsystem->flag_connected) == FALSE) { /* running but not yet connected */ crm_debug("Stopping %s before it had connected", the_subsystem->name); } /* if(force_quit && the_subsystem->sent_kill == FALSE) { quit_signal = SIGKILL; } else if(force_quit) { crm_debug("Already sent -KILL to %s: [%d]", the_subsystem->name, the_subsystem->pid); } */ errno = 0; if (kill(the_subsystem->pid, quit_signal) == 0) { crm_info("Sent -TERM to %s: [%d]", the_subsystem->name, the_subsystem->pid); the_subsystem->sent_kill = TRUE; } else { crm_perror(LOG_ERR, "Sent -TERM to %s: [%d]", the_subsystem->name, the_subsystem->pid); } return TRUE; } gboolean start_subsystem(struct crm_subsystem_s * the_subsystem) { pid_t pid; struct stat buf; int s_res; unsigned int j; struct rlimit oflimits; const char *devnull = "/dev/null"; crm_info("Starting sub-system \"%s\"", the_subsystem->name); if (the_subsystem->pid > 0) { crm_warn("Client %s already running as pid %d", the_subsystem->name, (int)the_subsystem->pid); /* starting a started X is not an error */ return TRUE; } /* * We want to ensure that the exec will succeed before * we bother forking. */ if (access(the_subsystem->path, F_OK | X_OK) != 0) { crm_perror(LOG_ERR, "Cannot (access) exec %s", the_subsystem->path); return FALSE; } s_res = stat(the_subsystem->command, &buf); if (s_res != 0) { crm_perror(LOG_ERR, "Cannot (stat) exec %s", the_subsystem->command); return FALSE; } /* We need to fork so we can make child procs not real time */ switch (pid = fork()) { case -1: crm_err("Cannot fork."); return FALSE; default: /* Parent */ mainloop_child_add(pid, 0, the_subsystem->name, the_subsystem, crmd_child_exit); crm_trace("Client %s is has pid: %d", the_subsystem->name, pid); the_subsystem->pid = pid; return TRUE; case 0: /* Child */ /* create a new process group to avoid - * being interupted by heartbeat + * being interrupted by heartbeat */ setpgid(0, 0); break; } crm_debug("Executing \"%s (%s)\" (pid %d)", the_subsystem->command, the_subsystem->name, (int)getpid()); /* A precautionary measure */ getrlimit(RLIMIT_NOFILE, &oflimits); for (j = 0; j < oflimits.rlim_cur; ++j) { close(j); } (void)open(devnull, O_RDONLY); /* Stdin: fd 0 */ (void)open(devnull, O_WRONLY); /* Stdout: fd 1 */ (void)open(devnull, O_WRONLY); /* Stderr: fd 2 */ { char *opts[2]; opts[0] = strdup(the_subsystem->command); opts[1] = NULL; /* coverity[toctou] The call to stat() is a fail-fast, not a race */ (void)execvp(the_subsystem->command, opts); } /* Should not happen */ crm_perror(LOG_ERR, "FATAL: Cannot exec %s", the_subsystem->command); return crm_exit(DAEMON_RESPAWN_STOP); /* Suppress respawning */ } diff --git a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt index 226357aa3e..7c23e23e22 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Advanced-Resources.txt @@ -1,1371 +1,1379 @@ = Advanced Resource Types = [[group-resources]] == Groups - A Syntactic Shortcut == indexterm:[Group Resources] indexterm:[Resource,Groups] One of the most common elements of a cluster is a set of resources that need to be located together, start sequentially, and stop in the reverse order. To simplify this configuration, we support the concept of groups. .A group of two primitive resources ====== [source,XML] ------- ------- ====== Although the example above contains only two resources, there is no limit to the number of resources a group can contain. The example is also sufficient to explain the fundamental properties of a group: * Resources are started in the order they appear in (+Public-IP+ first, then +Email+) * Resources are stopped in the reverse order to which they appear in (+Email+ first, then +Public-IP+) If a resource in the group can't run anywhere, then nothing after that is allowed to run, too. * If +Public-IP+ can't run anywhere, neither can +Email+; * but if +Email+ can't run anywhere, this does not affect +Public-IP+ in any way The group above is logically equivalent to writing: .How the cluster sees a group resource ====== [source,XML] ------- ------- ====== Obviously as the group grows bigger, the reduced configuration effort can become significant. Another (typical) example of a group is a DRBD volume, the filesystem mount, an IP address, and an application that uses them. === Group Properties === .Properties of a Group Resource [width="95%",cols="3m,5<",options="header",align="center"] |========================================================= |Field |Description |id |A unique name for the group indexterm:[id,Group Resource Property] indexterm:[Resource,Group Property,id] |========================================================= === Group Options === Groups inherit the +priority+, +target-role+, and +is-managed+ properties from primitive resources. See <> for information about those properties. === Group Instance Attributes === Groups have no instance attributes. However, any that are set for the group object will be inherited by the group's children. === Group Contents === Groups may only contain a collection of cluster resources (see <>). To refer to a child of a group resource, just use the child's +id+ instead of the group's. === Group Constraints === Although it is possible to reference a group's children in constraints, it is usually preferable to reference the group itself. .Some constraints involving groups ====== [source,XML] ------- ------- ====== === Group Stickiness === indexterm:[resource-stickiness,Groups] Stickiness, the measure of how much a resource wants to stay where it is, is additive in groups. Every active resource of the group will contribute its stickiness value to the group's total. So if the default +resource-stickiness+ is 100, and a group has seven members, five of which are active, then the group as a whole will prefer its current location with a score of 500. [[s-resource-clone]] == Clones - Resources That Get Active on Multiple Hosts == indexterm:[Clone Resources] indexterm:[Resource,Clones] Clones were initially conceived as a convenient way to start multiple instances of an IP address resource and have them distributed throughout the cluster for load balancing. They have turned out to quite useful for a number of purposes including integrating with the Distributed Lock Manager (used by many cluster filesystems), the fencing subsystem, and OCFS2. You can clone any resource, provided the resource agent supports it. Three types of cloned resources exist: * Anonymous * Globally unique * Stateful 'Anonymous' clones are the simplest. These behave completely identically everywhere they are running. Because of this, there can be only one copy of an anonymous clone active per machine. 'Globally unique' clones are distinct entities. A copy of the clone running on one machine is not equivalent to another instance on another node, nor would any two copies on the same node be equivalent. 'Stateful' clones are covered later in <>. .A clone of an LSB resource ====== [source,XML] ------- ------- ====== === Clone Properties === .Properties of a Clone Resource [width="95%",cols="3m,5<",options="header",align="center"] |========================================================= |Field |Description |id |A unique name for the clone indexterm:[id,Clone Property] indexterm:[Clone,Property,id] |========================================================= === Clone Options === Options inherited from <> resources: +priority, target-role, is-managed+ .Clone-specific configuration options [width="95%",cols="1m,1,3<",options="header",align="center"] |========================================================= |Field |Default |Description |clone-max |number of nodes in cluster |How many copies of the resource to start indexterm:[clone-max,Clone Option] indexterm:[Clone,Option,clone-max] |clone-node-max |1 |How many copies of the resource can be started on a single node indexterm:[clone-node-max,Clone Option] indexterm:[Clone,Option,clone-node-max] |clone-min |1 |Require at least this number of clone instances to be runnable before allowing resources depending on the clone to be runnable '(since 1.1.14)' indexterm:[clone-min,Clone Option] indexterm:[Clone,Option,clone-min] |notify |true |When stopping or starting a copy of the clone, tell all the other copies beforehand and again when the action was successful. Allowed values: +false+, +true+ indexterm:[notify,Clone Option] indexterm:[Clone,Option,notify] |globally-unique |false |Does each copy of the clone perform a different function? Allowed values: +false+, +true+ indexterm:[globally-unique,Clone Option] indexterm:[Clone,Option,globally-unique] |ordered |false |Should the copies be started in series (instead of in parallel)? Allowed values: +false+, +true+ indexterm:[ordered,Clone Option] indexterm:[Clone,Option,ordered] |interleave |false |If this clone depends on another clone via an ordering constraint, is it allowed to start after the local instance of the other clone starts, rather than wait for all instances of the other clone to start? Allowed values: +false+, +true+ indexterm:[interleave,Clone Option] indexterm:[Clone,Option,interleave] |========================================================= === Clone Instance Attributes === Clones have no instance attributes; however, any that are set here will be inherited by the clone's children. === Clone Contents === Clones must contain exactly one primitive or group resource. [WARNING] You should never reference the name of a clone's child. If you think you need to do this, you probably need to re-evaluate your design. === Clone Constraints === In most cases, a clone will have a single copy on each active cluster node. If this is not the case, you can indicate which nodes the cluster should preferentially assign copies to with resource location constraints. These constraints are written no differently from those for primitive resources except that the clone's +id+ is used. .Some constraints involving clones ====== [source,XML] ------- ------- ====== Ordering constraints behave slightly differently for clones. In the example above, +apache-stats+ will wait until all copies of +apache-clone+ that need to be started have done so before being started itself. Only if _no_ copies can be started will +apache-stats+ be prevented from being active. Additionally, the clone will wait for +apache-stats+ to be stopped before stopping itself. Colocation of a primitive or group resource with a clone means that the resource can run on any machine with an active copy of the clone. The cluster will choose a copy based on where the clone is running and the resource's own location preferences. Colocation between clones is also possible. If one clone +A+ is colocated with another clone +B+, the set of allowed locations for +A+ is limited to nodes on which +B+ is (or will be) active. Placement is then performed normally. [[s-clone-stickiness]] === Clone Stickiness === indexterm:[resource-stickiness,Clones] To achieve a stable allocation pattern, clones are slightly sticky by default. If no value for +resource-stickiness+ is provided, the clone will use a value of 1. Being a small value, it causes minimal disturbance to the score calculations of other resources but is enough to prevent Pacemaker from needlessly moving copies around the cluster. [NOTE] ==== For globally unique clones, this may result in multiple instances of the clone staying on a single node, even after another eligible node becomes active (for example, after being put into standby mode then made active again). If you do not want this behavior, specify a +resource-stickiness+ of 0 for the clone temporarily and let the cluster adjust, then set it back to 1 if you want the default behavior to apply again. ==== === Clone Resource Agent Requirements === Any resource can be used as an anonymous clone, as it requires no additional support from the resource agent. Whether it makes sense to do so depends on your resource and its resource agent. Globally unique clones do require some additional support in the resource agent. In particular, it must only respond with +$\{OCF_SUCCESS}+ if the node has that exact instance active. All other probes for instances of the clone should result in +$\{OCF_NOT_RUNNING}+ (or one of the other OCF error codes if they are failed). Individual instances of a clone are identified by appending a colon and a numerical offset, e.g. +apache:2+. Resource agents can find out how many copies there are by examining the +OCF_RESKEY_CRM_meta_clone_max+ environment variable and which copy it is by examining +OCF_RESKEY_CRM_meta_clone+. The resource agent must not make any assumptions (based on +OCF_RESKEY_CRM_meta_clone+) about which numerical instances are active. In particular, the list of active copies will not always be an unbroken sequence, nor always start at 0. ==== Clone Notifications ==== Supporting notifications requires the +notify+ action to be implemented. If supported, the notify action will be passed a number of extra variables which, when combined with additional context, can be used to calculate the current state of the cluster and what is about to happen to it. .Environment variables supplied with Clone notify actions [width="95%",cols="5,3<",options="header",align="center"] |========================================================= |Variable |Description |OCF_RESKEY_CRM_meta_notify_type |Allowed values: +pre+, +post+ indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,type] indexterm:[type,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_operation |Allowed values: +start+, +stop+ indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,operation] indexterm:[operation,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_start_resource |Resources to be started indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_resource] indexterm:[start_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_stop_resource |Resources to be stopped indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_resource] indexterm:[stop_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_active_resource |Resources that are running indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_resource] indexterm:[active_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_inactive_resource |Resources that are not running indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,inactive_resource] indexterm:[inactive_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_start_uname |Nodes on which resources will be started indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_uname] indexterm:[start_uname,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_stop_uname |Nodes on which resources will be stopped indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_uname] indexterm:[stop_uname,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_active_uname |Nodes on which resources are running indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_uname] indexterm:[active_uname,Notification Environment Variable] |========================================================= The variables come in pairs, such as +OCF_RESKEY_CRM_meta_notify_start_resource+ and +OCF_RESKEY_CRM_meta_notify_start_uname+ and should be treated as an array of whitespace-separated elements. +OCF_RESKEY_CRM_meta_notify_inactive_resource+ is an exception as the matching +uname+ variable does not exist since inactive resources are not running on any node. Thus in order to indicate that +clone:0+ will be started on +sles-1+, +clone:2+ will be started on +sles-3+, and +clone:3+ will be started on +sles-2+, the cluster would set .Notification variables ====== [source,Bash] ------- OCF_RESKEY_CRM_meta_notify_start_resource="clone:0 clone:2 clone:3" OCF_RESKEY_CRM_meta_notify_start_uname="sles-1 sles-3 sles-2" ------- ====== ==== Proper Interpretation of Notification Environment Variables ==== .Pre-notification (stop): * Active resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+ * Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ .Post-notification (stop) / Pre-notification (start): * Active resources ** +$OCF_RESKEY_CRM_meta_notify_active_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Inactive resources ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ .Post-notification (start): * Active resources: ** +$OCF_RESKEY_CRM_meta_notify_active_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Inactive resources: ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ [[s-resource-multistate]] == Multi-state - Resources That Have Multiple Modes == indexterm:[Multi-state Resources] indexterm:[Resource,Multi-state] Multi-state resources are a specialization of clone resources; please ensure you understand <> before continuing! Multi-state resources allow the instances to be in one of two operating modes (called 'roles'). The roles are called 'master' and 'slave', but can mean whatever you wish them to mean. The only limitation is that when an instance is started, it must come up in the slave role. === Multi-state Properties === .Properties of a Multi-State Resource [width="95%",cols="3m,5<",options="header",align="center"] |========================================================= |Field |Description |id |Your name for the multi-state resource indexterm:[id,Multi-State Property] indexterm:[Multi-State,Property,id] |========================================================= === Multi-state Options === Options inherited from <> resources: +priority+, +target-role+, +is-managed+ Options inherited from <> resources: +clone-max+, +clone-node-max+, +notify+, +globally-unique+, +ordered+, +interleave+ .Multi-state-specific resource configuration options [width="95%",cols="1m,1,3<",options="header",align="center"] |========================================================= |Field |Default |Description |master-max |1 |How many copies of the resource can be promoted to the +master+ role indexterm:[master-max,Multi-State Option] indexterm:[Multi-State,Option,master-max] |master-node-max |1 |How many copies of the resource can be promoted to the +master+ role on a single node indexterm:[master-node-max,Multi-State Option] indexterm:[Multi-State,Option,master-node-max] |========================================================= === Multi-state Instance Attributes === Multi-state resources have no instance attributes; however, any that are set here will be inherited by a master's children. === Multi-state Contents === Masters must contain exactly one primitive or group resource. [WARNING] You should never reference the name of a master's child. If you think you need to do this, you probably need to re-evaluate your design. === Monitoring Multi-State Resources === The usual monitor actions are insufficient to monitor a multi-state resource, because pacemaker needs to verify not only that the resource is active, but also that its actual role matches its intended one. Define two monitoring actions: the usual one will cover the slave role, and an additional one with +role="master"+ will cover the master role. .Monitoring both states of a multi-state resource ====== [source,XML] ------- ------- ====== [IMPORTANT] =========== It is crucial that _every_ monitor operation has a different interval! Pacemaker currently differentiates between operations only by resource and interval; so if (for example) a master/slave resource had the same monitor interval for both roles, Pacemaker would ignore the role when checking the status -- which would cause unexpected return codes, and therefore unnecessary complications. =========== === Multi-state Constraints === In most cases, multi-state resources will have a single copy on each active cluster node. If this is not the case, you can indicate which nodes the cluster should preferentially assign copies to with resource location constraints. These constraints are written no differently from those for primitive resources except that the master's +id+ is used. When considering multi-state resources in constraints, for most purposes it is sufficient to treat them as clones. The exception is that the +first-action+ and/or +then-action+ fields for ordering constraints may be set to +promote+ or +demote+ to constrain the master role, and colocation constraints may contain +rsc-role+ and/or +with-rsc-role+ fields. .Additional colocation constraint options for multi-state resources [width="95%",cols="1m,1,3<",options="header",align="center"] |========================================================= |Field |Default |Description |rsc-role |Started |An additional attribute of colocation constraints that specifies the role that +rsc+ must be in. Allowed values: +Started+, +Master+, +Slave+. indexterm:[rsc-role,Ordering Constraints] indexterm:[Constraints,Ordering,rsc-role] |with-rsc-role |Started |An additional attribute of colocation constraints that specifies the role that +with-rsc+ must be in. Allowed values: +Started+, +Master+, +Slave+. indexterm:[with-rsc-role,Ordering Constraints] indexterm:[Constraints,Ordering,with-rsc-role] |========================================================= .Constraints involving multi-state resources ====== [source,XML] ------- ------- ====== In the example above, +myApp+ will wait until one of the database copies has been started and promoted to master before being started itself on the same node. Only if no copies can be promoted will +myApp+ be prevented from being active. Additionally, the cluster will wait for +myApp+ to be stopped before demoting the database. Colocation of a primitive or group resource with a multi-state resource means that it can run on any machine with an active copy of the multi-state resource that has the specified role (+master+ or +slave+). In the example above, the cluster will choose a location based on where database is running as a +master+, and if there are multiple +master+ instances it will also factor in +myApp+'s own location preferences when deciding which location to choose. Colocation with regular clones and other multi-state resources is also possible. In such cases, the set of allowed locations for the +rsc+ clone is (after role filtering) limited to nodes on which the +with-rsc+ multi-state resource is (or will be) in the specified role. Placement is then performed as normal. ==== Using Multi-state Resources in Colocation Sets ==== .Additional colocation set options relevant to multi-state resources [width="95%",cols="1m,1,6<",options="header",align="center"] |========================================================= |Field |Default |Description |role |Started |The role that 'all members' of the set must be in. Allowed values: +Started+, +Master+, +Slave+. indexterm:[role,Ordering Constraints] indexterm:[Constraints,Ordering,role] |========================================================= In the following example +B+'s master must be located on the same node as +A+'s master. Additionally resources +C+ and +D+ must be located on the same node as +A+'s and +B+'s masters. .Colocate C and D with A's and B's master instances ====== [source,XML] ------- ------- ====== ==== Using Multi-state Resources in Ordering Sets ==== .Additional ordered set options relevant to multi-state resources [width="95%",cols="1m,1,3<",options="header",align="center"] |========================================================= |Field |Default |Description |action |value of +first-action+ |An additional attribute of ordering constraint sets that specifies the action that applies to 'all members' of the set. Allowed values: +start+, +stop+, +promote+, +demote+. indexterm:[action,Ordering Constraints] indexterm:[Constraints,Ordering,action] |========================================================= .Start C and D after first promoting A and B ====== [source,XML] ------- ------- ====== In the above example, +B+ cannot be promoted to a master role until +A+ has been promoted. Additionally, resources +C+ and +D+ must wait until +A+ and +B+ have been promoted before they can start. === Multi-state Stickiness === indexterm:[resource-stickiness,Multi-State] As with regular clones, multi-state resources are slightly sticky by default. See <> for details. === Which Resource Instance is Promoted === During the start operation, most resource agents should call the `crm_master` utility. This tool automatically detects both the resource and host and should be used to set a preference for being promoted. Based on this, +master-max+, and +master-node-max+, the instance(s) with the highest preference will be promoted. An alternative is to create a location constraint that indicates which nodes are most preferred as masters. .Explicitly preferring node1 to be promoted to master ====== [source,XML] ------- ------- ====== === Requirements for Multi-state Resource Agents === Since multi-state resources are an extension of cloned resources, all the requirements for resource agents that support clones are also requirements for resource agents that support multi-state resources. Additionally, multi-state resources require two extra actions, +demote+ and +promote+, which are responsible for changing the state of the resource. Like +start+ and +stop+, they should return +$\{OCF_SUCCESS}+ if they completed successfully or a relevant error code if they did not. The states can mean whatever you wish, but when the resource is started, it must come up in the mode called +slave+. From there the cluster will decide which instances to promote to +master+. In addition to the clone requirements for monitor actions, agents must also _accurately_ report which state they are in. The cluster relies on the agent to report its status (including role) accurately and does not indicate to the agent what role it currently believes it to be in. .Role implications of OCF return codes [width="95%",cols="1,1<",options="header",align="center"] |========================================================= |Monitor Return Code |Description |OCF_NOT_RUNNING |Stopped indexterm:[Return Code,OCF_NOT_RUNNING] |OCF_SUCCESS |Running (Slave) indexterm:[Return Code,OCF_SUCCESS] |OCF_RUNNING_MASTER |Running (Master) indexterm:[Return Code,OCF_RUNNING_MASTER] |OCF_FAILED_MASTER |Failed (Master) indexterm:[Return Code,OCF_FAILED_MASTER] |Other |Failed (Slave) |========================================================= ==== Multi-state Notifications ==== Like clones, supporting notifications requires the +notify+ action to be implemented. If supported, the notify action will be passed a number of extra variables which, when combined with additional context, can be used to calculate the current state of the cluster and what is about to happen to it. .Environment variables supplied with multi-state notify actions footnote:[Emphasized variables are specific to +Master+ resources, and all behave in the same manner as described for Clone resources.] [width="95%",cols="5,3<",options="header",align="center"] |========================================================= |Variable |Description |OCF_RESKEY_CRM_meta_notify_type |Allowed values: +pre+, +post+ indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,type] indexterm:[type,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_operation |Allowed values: +start+, +stop+ indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,operation] indexterm:[operation,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_active_resource |Resources that are running indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_resource] indexterm:[active_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_inactive_resource |Resources that are not running indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,inactive_resource] indexterm:[inactive_resource,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_master_resource_ |Resources that are running in +Master+ mode indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_resource] indexterm:[master_resource,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_slave_resource_ |Resources that are running in +Slave+ mode indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_resource] indexterm:[slave_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_start_resource |Resources to be started indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_resource] indexterm:[start_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_stop_resource |Resources to be stopped indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_resource] indexterm:[stop_resource,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_promote_resource_ |Resources to be promoted indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_resource] indexterm:[promote_resource,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_demote_resource_ |Resources to be demoted indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_resource] indexterm:[demote_resource,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_start_uname |Nodes on which resources will be started indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,start_uname] indexterm:[start_uname,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_stop_uname |Nodes on which resources will be stopped indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,stop_uname] indexterm:[stop_uname,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_promote_uname_ |Nodes on which resources will be promoted indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,promote_uname] indexterm:[promote_uname,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_demote_uname_ |Nodes on which resources will be demoted indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,demote_uname] indexterm:[demote_uname,Notification Environment Variable] |OCF_RESKEY_CRM_meta_notify_active_uname |Nodes on which resources are running indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,active_uname] indexterm:[active_uname,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_master_uname_ |Nodes on which resources are running in +Master+ mode indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,master_uname] indexterm:[master_uname,Notification Environment Variable] |_OCF_RESKEY_CRM_meta_notify_slave_uname_ |Nodes on which resources are running in +Slave+ mode indexterm:[Environment Variable,OCF_RESKEY_CRM_meta_notify_,slave_uname] indexterm:[slave_uname,Notification Environment Variable] |========================================================= ==== Proper Interpretation of Multi-state Notification Environment Variables ==== .Pre-notification (demote): * +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+ * +Master+ resources: +$OCF_RESKEY_CRM_meta_notify_master_resource+ * +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+ * Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ .Post-notification (demote) / Pre-notification (stop): * +Active+ resources: +$OCF_RESKEY_CRM_meta_notify_active_resource+ * +Master+ resources: ** +$OCF_RESKEY_CRM_meta_notify_master_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * +Slave+ resources: +$OCF_RESKEY_CRM_meta_notify_slave_resource+ * Inactive resources: +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ .Post-notification (stop) / Pre-notification (start) * +Active+ resources: ** +$OCF_RESKEY_CRM_meta_notify_active_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * +Master+ resources: ** +$OCF_RESKEY_CRM_meta_notify_master_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * +Slave+ resources: ** +$OCF_RESKEY_CRM_meta_notify_slave_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Inactive resources: ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ .Post-notification (start) / Pre-notification (promote) * +Active+ resources: ** +$OCF_RESKEY_CRM_meta_notify_active_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * +Master+ resources: ** +$OCF_RESKEY_CRM_meta_notify_master_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * +Slave+ resources: ** +$OCF_RESKEY_CRM_meta_notify_slave_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Inactive resources: ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ .Post-notification (promote) * +Active+ resources: ** +$OCF_RESKEY_CRM_meta_notify_active_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * +Master+ resources: ** +$OCF_RESKEY_CRM_meta_notify_master_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_demote_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * +Slave+ resources: ** +$OCF_RESKEY_CRM_meta_notify_slave_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_start_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Inactive resources: ** +$OCF_RESKEY_CRM_meta_notify_inactive_resource+ ** plus +$OCF_RESKEY_CRM_meta_notify_stop_resource+ ** minus +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources to be promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Resources to be demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources to be stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ * Resources that were started: +$OCF_RESKEY_CRM_meta_notify_start_resource+ * Resources that were promoted: +$OCF_RESKEY_CRM_meta_notify_promote_resource+ * Resources that were demoted: +$OCF_RESKEY_CRM_meta_notify_demote_resource+ * Resources that were stopped: +$OCF_RESKEY_CRM_meta_notify_stop_resource+ [[s-resource-bundle]] == Bundles - Isolated Environments == indexterm:[bundle] indexterm:[Resource,bundle] indexterm:[Docker,bundle] Pacemaker (version 1.1.17 and later) supports a special syntax for combining an isolated environment with the infrastructure support that it needs: the 'bundle'. The only isolation technology currently supported by Pacemaker bundles is https://www.docker.com/[Docker] containers. footnote:[Docker is a trademark of Docker, Inc. No endorsement by or association with Docker, Inc. is implied.] .A bundle for a containerized web server ==== [source,XML] ---- ---- ==== === Bundle Properties === .Properties of a Bundle [width="95%",cols="3m,5<",options="header",align="center"] |========================================================= |Field |Description |id |A unique name for the bundle (required) indexterm:[id,bundle] indexterm:[bundle,Property,id] |description |Arbitrary text (not used by Pacemaker) indexterm:[description,bundle] indexterm:[bundle,Property,description] |========================================================= === Docker Properties === A bundle must contain exactly one ++ element. Before configuring a bundle in Pacemaker, the user must install Docker and supply a fully configured Docker image on every node allowed to run the bundle. Pacemaker will create an implicit +ocf:heartbeat:docker+ resource to manage a bundle's Docker container. .Properties of a Bundle's Docker Element [width="95%",cols="2m,1,4<",options="header",align="center"] |========================================================= |Field |Default |Description |image | |Docker image tag (required) indexterm:[image,Docker] indexterm:[Docker,Property,image] |replicas |Value of +masters+ if that is positive, else 1 |A positive integer specifying the number of container instances to launch indexterm:[replicas,Docker] indexterm:[Docker,Property,replicas] |replicas-per-host |1 |A positive integer specifying the number of container instances allowed to run on a single node indexterm:[replicas-per-host,Docker] indexterm:[Docker,Property,replicas-per-host] |masters |0 |A non-negative integer that, if positive, indicates that the containerized service should be treated as a multistate service, with this many replicas allowed to run the service in the master role indexterm:[masters,Docker] indexterm:[Docker,Property,masters] |network | |If specified, this will be passed to +docker run+ as the https://docs.docker.com/engine/reference/run/#network-settings[network setting] for the Docker container. indexterm:[network,Docker] indexterm:[Docker,Property,network] +|run-command +|`/usr/sbin/pacemaker_remoted` if bundle contains a +primitive+, otherwise none +|This command will be run inside the container when launching it ("PID 1"). If + the bundle contains a +primitive+, this command 'must' start pacemaker_remoted + (but could, for example, be a script that does other stuff, too). + indexterm:[network,Docker] + indexterm:[Docker,Property,network] + |options | |Extra command-line options to pass to `docker run` indexterm:[options,Docker] indexterm:[Docker,Property,options] |========================================================= === Bundle Network Properties === A bundle may optionally contain one ++ element. indexterm:[bundle,network] .Properties of a Bundle's Network Element [width="95%",cols="2m,1,4<",options="header",align="center"] |========================================================= |Field |Default |Description |ip-range-start | |If specified, Pacemaker will create an implicit +ocf:heartbeat:IPaddr2+ resource for each container instance, starting with this IP address, using up to +replicas+ sequential addresses. These addresses can be used from the host's network to reach the service inside the container, though it is not visible within the container itself. Only IPv4 addresses are currently supported. indexterm:[ip-range-start,network] indexterm:[network,Property,ip-range-start] |host-netmask |32 |If +ip-range-start+ is specified, the IP addresses are created with this CIDR netmask (as a number of bits). indexterm:[host-netmask,network] indexterm:[network,Property,host-netmask] |host-interface | |If +ip-range-start+ is specified, the IP addresses are created on this host interface (by default, it will be determined from the IP address). indexterm:[host-interface,network] indexterm:[network,Property,host-interface] |control-port | |If the bundle contains a +primitive+, the cluster will use this integer TCP port for communication with Pacemaker Remote inside the container. This takes precedence over the value of any PCMK_remote_port environment variable set in the container image. This can allow a +primitive+ to be specified without using +ip-start-range+, in which case +replicas-per-host+ must be 1. indexterm:[control-port,network] indexterm:[network,Property,control-port] |========================================================= [NOTE] ==== If +ip-range-start+ is used, Pacemaker will automatically ensure that +/etc/hosts+ inside the containers has entries for each replica and its assigned IP. Replicas are named by the bundle id plus a dash and an integer counter starting with zero. For example, if a bundle named +httpd-bundle+ has +replicas=2+, its containers will be named +httpd-bundle-0+ and +httpd-bundle-1+. ==== Additionally, a ++ element may optionally contain one or more ++ elements. indexterm:[bundle,network,port-mapping] .Properties of a Bundle's Port-Mapping Element [width="95%",cols="2m,1,4<",options="header",align="center"] |========================================================= |Field |Default |Description |id | |A unique name for the port mapping (required) indexterm:[id,port-mapping] indexterm:[port-mapping,Property,id] |port | |If this is specified, connections to this TCP port number on the host network (on the container's assigned IP address, if +ip-range-start+ is specified) will be forwarded to the container network. Exactly one of +port+ or +range+ must be specified in a +port-mapping+. indexterm:[port,port-mapping] indexterm:[port-mapping,Property,port] |internal-port |value of +port+ |If +port+ and this are specified, connections to +port+ on the host's network will be forwarded to this port on the container network. indexterm:[internal-port,port-mapping] indexterm:[port-mapping,Property,internal-port] |range | |If this is specified, connections to these TCP port numbers (expressed as 'first_port'-'last_port') on the host network (on the container's assigned IP address, if +ip-range-start+ is specified) will be forwarded to the same ports in the container network. Exactly one of +port+ or +range+ must be specified in a +port-mapping+. indexterm:[range,port-mapping] indexterm:[port-mapping,Property,range] |========================================================= [NOTE] ==== If the bundle contains a +primitive+, Pacemaker will automatically map the +control-port+, so it is not necessary to specify that port in a +port-mapping+. ==== === Bundle Storage Properties === A bundle may optionally contain one ++ element. A ++ element has no properties of its own, but may contain one or more ++ elements. indexterm:[bundle,storage,storage-mapping] .Properties of a Bundle's Storage-Mapping Element [width="95%",cols="2m,1,4<",options="header",align="center"] |========================================================= |Field |Default |Description |id | |A unique name for the storage mapping (required) indexterm:[id,storage-mapping] indexterm:[storage-mapping,Property,id] |source-dir | |The absolute path on the host's filesystem that will be mapped into the container. Exactly one of +source-dir+ and +source-dir-root+ must be specified in a +storage-mapping+. indexterm:[source-dir,storage-mapping] indexterm:[storage-mapping,Property,source-dir] |source-dir-root | |The start of a path on the host's filesystem that will be mapped into the container, using a different subdirectory on the host for each container instance. Exactly one of +source-dir+ and +source-dir-root+ must be specified in a +storage-mapping+. indexterm:[source-dir-root,storage-mapping] indexterm:[storage-mapping,Property,source-dir-root] |target-dir | |The path name within the container where the host storage will be mapped (required) indexterm:[target-dir,storage-mapping] indexterm:[storage-mapping,Property,target-dir] |options | |File system mount options to use when mapping the storage indexterm:[options,storage-mapping] indexterm:[storage-mapping,Property,options] |========================================================= [NOTE] ==== If the bundle contains a +primitive+, Pacemaker will automatically map the equivalent of +source-dir=/etc/pacemaker/authkey target-dir=/etc/pacemaker/authkey+ and +source-dir-root=/var/log/pacemaker/bundles target-dir=/var/log+ into the container, so it is not necessary to specify those paths in a +storage-mapping+. Newer versions of +ocf:heartbeat:docker+ will automatically create the source directories if they do not exist, but the user may want to ensure they exist beforehand. ==== === Bundle Primitive === A bundle may optionally contain one ++ resource (see <>). The primitive may have operations, instance attributes and meta-attributes defined, as usual. If a bundle contains a primitive resource, the container image must include the Pacemaker Remote daemon, and either +ip-range-start+ or +control-port+ must be configured in the bundle. Pacemaker will create an implicit +ocf:pacemaker:remote+ resource for the connection, launch Pacemaker Remote within the container, and monitor and manage the primitive resource via Pacemaker Remote. If the bundle has more than one container instance (replica), the primitive resource will function as an implicit clone (see <>) -- a multistate clone if the bundle has +masters+ greater than zero (see <>). [IMPORTANT] ==== If Pacemaker Remote nodes are used in a cluster with bundle resources that contain a +primitive+, it is strongly recommended to configure location constraints prohibiting the bundles from running on those nodes. Running a bundle with a +primitive+ on a Pacemaker Remote node might work if +ip-range-start+ is used, but that configuration has not yet been tested. ==== === Limitations of Bundles === Currently, bundles may not be cloned, or included in groups or colocation constraints. This includes the bundle's primitive and any resources implicitly created by Pacemaker for the bundle. Bundles do not have instance attributes, meta-attributes, utilization attributes, or operations of their own, though a primitive included in a bundle may. diff --git a/doc/Pacemaker_Explained/en-US/Ch-Options.txt b/doc/Pacemaker_Explained/en-US/Ch-Options.txt index c99f3d1c68..3dfd512745 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Options.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Options.txt @@ -1,447 +1,454 @@ = Cluster-Wide Configuration = == CIB Properties == Certain settings are defined by CIB properties (that is, attributes of the +cib+ tag) rather than with the rest of the cluster configuration in the +configuration+ section. The reason is simply a matter of parsing. These options are used by the configuration database which is, by design, mostly ignorant of the content it holds. So the decision was made to place them in an easy-to-find location. .CIB Properties [width="95%",cols="2m,5<",options="header",align="center"] |========================================================= |Field |Description | admin_epoch | indexterm:[Configuration Version,Cluster] indexterm:[Cluster,Option,Configuration Version] indexterm:[admin_epoch,Cluster Option] indexterm:[Cluster,Option,admin_epoch] When a node joins the cluster, the cluster performs a check to see which node has the best configuration. It asks the node with the highest (+admin_epoch+, +epoch+, +num_updates+) tuple to replace the configuration on all the nodes -- which makes setting them, and setting them correctly, very important. +admin_epoch+ is never modified by the cluster; you can use this to make the configurations on any inactive nodes obsolete. _Never set this value to zero_. In such cases, the cluster cannot tell the difference between your configuration and the "empty" one used when nothing is found on disk. | epoch | indexterm:[epoch,Cluster Option] indexterm:[Cluster,Option,epoch] The cluster increments this every time the configuration is updated (usually by the administrator). | num_updates | indexterm:[num_updates,Cluster Option] indexterm:[Cluster,Option,num_updates] The cluster increments this every time the configuration or status is updated (usually by the cluster) and resets it to 0 when epoch changes. | validate-with | indexterm:[validate-with,Cluster Option] indexterm:[Cluster,Option,validate-with] Determines the type of XML validation that will be done on the configuration. If set to +none+, the cluster will not verify that updates conform to the DTD (nor reject ones that don't). This option can be useful when operating a mixed-version cluster during an upgrade. |cib-last-written | indexterm:[cib-last-written,Cluster Property] indexterm:[Cluster,Property,cib-last-written] Indicates when the configuration was last written to disk. Maintained by the cluster; for informational purposes only. |have-quorum | indexterm:[have-quorum,Cluster Property] indexterm:[Cluster,Property,have-quorum] Indicates if the cluster has quorum. If false, this may mean that the cluster cannot start resources or fence other nodes (see +no-quorum-policy+ below). Maintained by the cluster. |dc-uuid | indexterm:[dc-uuid,Cluster Property] indexterm:[Cluster,Property,dc-uuid] Indicates which cluster node is the current leader. Used by the cluster when placing resources and determining the order of some events. Maintained by the cluster. |========================================================= === Working with CIB Properties === Although these fields can be written to by the user, in most cases the cluster will overwrite any values specified by the user with the "correct" ones. To change the ones that can be specified by the user, for example +admin_epoch+, one should use: ---- # cibadmin --modify --xml-text '' ---- A complete set of CIB properties will look something like this: .Attributes set for a cib object ====== [source,XML] ------- ------- ====== [[s-cluster-options]] == Cluster Options == Cluster options, as you might expect, control how the cluster behaves when confronted with certain situations. They are grouped into sets within the +crm_config+ section, and, in advanced configurations, there may be more than one set. (This will be described later in the section on <> where we will show how to have the cluster use different sets of options during working hours than during weekends.) For now, we will describe the simple case where each option is present at most once. You can obtain an up-to-date list of cluster options, including their default values, by running the `man pengine` and `man crmd` commands. .Cluster Options [width="95%",cols="5m,2,11>). | enable-startup-probes | TRUE | indexterm:[enable-startup-probes,Cluster Option] indexterm:[Cluster,Option,enable-startup-probes] Should the cluster check for active resources during startup? | maintenance-mode | FALSE | indexterm:[maintenance-mode,Cluster Option] indexterm:[Cluster,Option,maintenance-mode] Should the cluster refrain from monitoring, starting and stopping resources? | stonith-enabled | TRUE | indexterm:[stonith-enabled,Cluster Option] indexterm:[Cluster,Option,stonith-enabled] Should failed nodes and nodes with resources that can't be stopped be shot? If you value your data, set up a STONITH device and enable this. If true, or unset, the cluster will refuse to start resources unless one or more STONITH resources have been configured. If false, unresponsive nodes are immediately assumed to be running no resources, and resource takeover to online nodes starts without any further protection (which means _data loss_ if the unresponsive node still accesses shared storage, for example). See also the +requires+ meta-attribute in <>. | stonith-action | reboot | indexterm:[stonith-action,Cluster Option] indexterm:[Cluster,Option,stonith-action] Action to send to STONITH device. Allowed values are +reboot+ and +off+. The value +poweroff+ is also allowed, but is only used for legacy devices. | stonith-timeout | 60s | indexterm:[stonith-timeout,Cluster Option] indexterm:[Cluster,Option,stonith-timeout] How long to wait for STONITH actions (reboot, on, off) to complete | stonith-max-attempts | 10 | indexterm:[stonith-max-attempts,Cluster Option] indexterm:[Cluster,Option,stonith-max-attempts] How many times stonith can fail before it will no longer be attempted on a target. Positive non-zero values are allowed. '(since 1.1.17)' | concurrent-fencing | FALSE | indexterm:[concurrent-fencing,Cluster Option] indexterm:[Cluster,Option,concurrent-fencing] Is the cluster allowed to initiate multiple fence actions concurrently? | cluster-delay | 60s | indexterm:[cluster-delay,Cluster Option] indexterm:[Cluster,Option,cluster-delay] Estimated maximum round-trip delay over the network (excluding action execution). If the TE requires an action to be executed on another node, it will consider the action failed if it does not get a response from the other node in this time (after considering the action's own timeout). The "correct" value will depend on the speed and load of your network and cluster nodes. | dc-deadtime | 20s | indexterm:[dc-deadtime,Cluster Option] indexterm:[Cluster,Option,dc-deadtime] How long to wait for a response from other nodes during startup. The "correct" value will depend on the speed/load of your network and the type of switches used. | cluster-recheck-interval | 15min | indexterm:[cluster-recheck-interval,Cluster Option] indexterm:[Cluster,Option,cluster-recheck-interval] Polling interval for time-based changes to options, resource parameters and constraints. The Cluster is primarily event-driven, but your configuration can have elements that take effect based on the time of day. To ensure these changes take effect, we can optionally poll the cluster's status for changes. A value of 0 disables polling. Positive values are an interval (in seconds unless other SI units are specified, e.g. 5min). | pe-error-series-max | -1 | indexterm:[pe-error-series-max,Cluster Option] indexterm:[Cluster,Option,pe-error-series-max] The number of PE inputs resulting in ERRORs to save. Used when reporting problems. A value of -1 means unlimited (report all). | pe-warn-series-max | -1 | indexterm:[pe-warn-series-max,Cluster Option] indexterm:[Cluster,Option,pe-warn-series-max] The number of PE inputs resulting in WARNINGs to save. Used when reporting problems. A value of -1 means unlimited (report all). | pe-input-series-max | -1 | indexterm:[pe-input-series-max,Cluster Option] indexterm:[Cluster,Option,pe-input-series-max] The number of "normal" PE inputs to save. Used when reporting problems. A value of -1 means unlimited (report all). +| placement-strategy | default | +indexterm:[placement-strategy,Cluster Option] +indexterm:[Cluster,Option,placement-strategy] + How the cluster should allocate resources to nodes (see <>). + Allowed values are +default+, +utilization+, +balanced+, and +minimal+. + '(since 1.1.0)' + | node-health-strategy | none | indexterm:[node-health-strategy,Cluster Option] indexterm:[Cluster,Option,node-health-strategy] How the cluster should react to node health attributes (see <>). Allowed values are +none+, +migrate-on-red+, +only-green+, +progressive+, and +custom+. | node-health-base | 0 | indexterm:[node-health-base,Cluster Option] indexterm:[Cluster,Option,node-health-base] The base health score assigned to a node. Only used when +node-health-strategy+ is +progressive+. '(since 1.1.16)' | node-health-green | 0 | indexterm:[node-health-green,Cluster Option] indexterm:[Cluster,Option,node-health-green] The score to use for a node health attribute whose value is +green+. Only used when +node-health-strategy+ is +progressive+ or +custom+. | node-health-yellow | 0 | indexterm:[node-health-yellow,Cluster Option] indexterm:[Cluster,Option,node-health-yellow] The score to use for a node health attribute whose value is +yellow+. Only used when +node-health-strategy+ is +progressive+ or +custom+. | node-health-red | 0 | indexterm:[node-health-red,Cluster Option] indexterm:[Cluster,Option,node-health-red] The score to use for a node health attribute whose value is +red+. Only used when +node-health-strategy+ is +progressive+ or +custom+. | remove-after-stop | FALSE | indexterm:[remove-after-stop,Cluster Option] indexterm:[Cluster,Option,remove-after-stop] _Advanced Use Only:_ Should the cluster remove resources from the LRM after they are stopped? Values other than the default are, at best, poorly tested and potentially dangerous. | startup-fencing | TRUE | indexterm:[startup-fencing,Cluster Option] indexterm:[Cluster,Option,startup-fencing] _Advanced Use Only:_ Should the cluster shoot unseen nodes? Not using the default is very unsafe! | election-timeout | 2min | indexterm:[election-timeout,Cluster Option] indexterm:[Cluster,Option,election-timeout] _Advanced Use Only:_ If you need to adjust this value, it probably indicates the presence of a bug. | shutdown-escalation | 20min | indexterm:[shutdown-escalation,Cluster Option] indexterm:[Cluster,Option,shutdown-escalation] _Advanced Use Only:_ If you need to adjust this value, it probably indicates the presence of a bug. | crmd-integration-timeout | 3min | indexterm:[crmd-integration-timeout,Cluster Option] indexterm:[Cluster,Option,crmd-integration-timeout] _Advanced Use Only:_ If you need to adjust this value, it probably indicates the presence of a bug. | crmd-finalization-timeout | 30min | indexterm:[crmd-finalization-timeout,Cluster Option] indexterm:[Cluster,Option,crmd-finalization-timeout] _Advanced Use Only:_ If you need to adjust this value, it probably indicates the presence of a bug. | crmd-transition-delay | 0s | indexterm:[crmd-transition-delay,Cluster Option] indexterm:[Cluster,Option,crmd-transition-delay] _Advanced Use Only:_ Delay cluster recovery for the configured interval to allow for additional/related events to occur. Useful if your configuration is sensitive to the order in which ping updates arrive. Enabling this option will slow down cluster recovery under all conditions. |default-resource-stickiness | 0 | indexterm:[default-resource-stickiness,Cluster Option] indexterm:[Cluster,Option,default-resource-stickiness] _Deprecated:_ See <> instead | is-managed-default | TRUE | indexterm:[is-managed-default,Cluster Option] indexterm:[Cluster,Option,is-managed-default] _Deprecated:_ See <> instead | default-action-timeout | 20s | indexterm:[default-action-timeout,Cluster Option] indexterm:[Cluster,Option,default-action-timeout] _Deprecated:_ See <> instead |========================================================= === Querying and Setting Cluster Options === indexterm:[Querying,Cluster Option] indexterm:[Setting,Cluster Option] indexterm:[Cluster,Querying Options] indexterm:[Cluster,Setting Options] Cluster options can be queried and modified using the `crm_attribute` tool. To get the current value of +cluster-delay+, you can run: ---- # crm_attribute --query --name cluster-delay ---- which is more simply written as ---- # crm_attribute -G -n cluster-delay ---- If a value is found, you'll see a result like this: ---- # crm_attribute -G -n cluster-delay scope=crm_config name=cluster-delay value=60s ---- If no value is found, the tool will display an error: ---- # crm_attribute -G -n clusta-deway scope=crm_config name=clusta-deway value=(null) Error performing operation: No such device or address ---- To use a different value (for example, 30 seconds), simply run: ---- # crm_attribute --name cluster-delay --update 30s ---- To go back to the cluster's default value, you can delete the value, for example: ---- # crm_attribute --name cluster-delay --delete Deleted crm_config option: id=cib-bootstrap-options-cluster-delay name=cluster-delay ---- === When Options are Listed More Than Once === If you ever see something like the following, it means that the option you're modifying is present more than once. .Deleting an option that is listed twice ======= ------ # crm_attribute --name batch-limit --delete Multiple attributes match name=batch-limit in crm_config: Value: 50 (set=cib-bootstrap-options, id=cib-bootstrap-options-batch-limit) Value: 100 (set=custom, id=custom-batch-limit) Please choose from one of the matches above and supply the 'id' with --id ------- ======= In such cases, follow the on-screen instructions to perform the requested action. To determine which value is currently being used by the cluster, refer to <>. diff --git a/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt b/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt index a7238dc07b..9fecf4c681 100644 --- a/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt +++ b/doc/Pacemaker_Explained/en-US/Ch-Utilization.txt @@ -1,227 +1,229 @@ = Utilization and Placement Strategy = +[[s-utilization]] + Pacemaker decides where to place a resource according to the resource allocation scores on every node. The resource will be allocated to the node where the resource has the highest score. If the resource allocation scores on all the nodes are equal, by the default placement strategy, Pacemaker will choose a node with the least number of allocated resources for balancing the load. If the number of resources on each node is equal, the first eligible node listed in the CIB will be chosen to run the resource. Often, in real-world situations, different resources use significantly different proportions of a node's capacities (memory, I/O, etc.). We cannot balance the load ideally just according to the number of resources allocated to a node. Besides, if resources are placed such that their combined requirements exceed the provided capacity, they may fail to start completely or run with degraded performance. To take these factors into account, Pacemaker allows you to configure: . The capacity a certain node provides. . The capacity a certain resource requires. . An overall strategy for placement of resources. == Utilization attributes == To configure the capacity that a node provides or a resource requires, you can use 'utilization attributes' in +node+ and +resource+ objects. You can name utilization attributes according to your preferences and define as many name/value pairs as your configuration needs. However, the attributes' values must be integers. .Specifying CPU and RAM capacities of two nodes ==== [source,XML] ---- ---- ==== .Specifying CPU and RAM consumed by several resources ==== [source,XML] ---- ---- ==== A node is considered eligible for a resource if it has sufficient free capacity to satisfy the resource's requirements. The nature of the required or provided capacities is completely irrelevant to Pacemaker -- it just makes sure that all capacity requirements of a resource are satisfied before placing a resource to a node. == Placement Strategy == After you have configured the capacities your nodes provide and the capacities your resources require, you need to set the +placement-strategy+ in the global cluster options, otherwise the capacity configurations have 'no effect'. Four values are available for the +placement-strategy+: +default+:: Utilization values are not taken into account at all. Resources are allocated according to allocation scores. If scores are equal, resources are evenly distributed across nodes. +utilization+:: Utilization values are taken into account 'only' when deciding whether a node is considered eligible (i.e. whether it has sufficient free capacity to satisfy the resource's requirements). Load-balancing is still done based on the number of resources allocated to a node. +balanced+:: Utilization values are taken into account when deciding whether a node is eligible to serve a resource 'and' when load-balancing, so an attempt is made to spread the resources in a way that optimizes resource performance. +minimal+:: Utilization values are taken into account 'only' when deciding whether a node is eligible to serve a resource. For load-balancing, an attempt is made to concentrate the resources on as few nodes as possible, thereby enabling possible power savings on the remaining nodes. Set +placement-strategy+ with `crm_attribute`: ---- # crm_attribute --name placement-strategy --update balanced ---- Now Pacemaker will ensure the load from your resources will be distributed evenly throughout the cluster, without the need for convoluted sets of colocation constraints. == Allocation Details == === Which node is preferred to get consumed first when allocating resources? === - The node with the highest node weight gets consumed first. Node weight is a score maintained by the cluster to represent node health. - If multiple nodes have the same node weight: * If +placement-strategy+ is +default+ or +utilization+, the node that has the least number of allocated resources gets consumed first. ** If their numbers of allocated resources are equal, the first eligible node listed in the CIB gets consumed first. * If +placement-strategy+ is +balanced+, the node that has the most free capacity gets consumed first. ** If the free capacities of the nodes are equal, the node that has the least number of allocated resources gets consumed first. *** If their numbers of allocated resources are equal, the first eligible node listed in the CIB gets consumed first. * If +placement-strategy+ is +minimal+, the first eligible node listed in the CIB gets consumed first. === Which node has more free capacity? === If only one type of utilization attribute has been defined, free capacity is a simple numeric comparison. If multiple types of utilization attributes have been defined, then the node that is numerically highest in the the most attribute types has the most free capacity. For example: - If +nodeA+ has more free +cpus+, and +nodeB+ has more free +memory+, then their free capacities are equal. - If +nodeA+ has more free +cpus+, while +nodeB+ has more free +memory+ and +storage+, then +nodeB+ has more free capacity. === Which resource is preferred to be assigned first? === - The resource that has the highest +priority+ (see <>) gets allocated first. - If their priorities are equal, check whether they are already running. The resource that has the highest score on the node where it's running gets allocated first, to prevent resource shuffling. - If the scores above are equal or the resources are not running, the resource has the highest score on the preferred node gets allocated first. - If the scores above are equal, the first runnable resource listed in the CIB gets allocated first. == Limitations and Workarounds == The type of problem Pacemaker is dealing with here is known as the http://en.wikipedia.org/wiki/Knapsack_problem[knapsack problem] and falls into the http://en.wikipedia.org/wiki/NP-complete[NP-complete] category of computer science problems -- a fancy way of saying "it takes a really long time to solve". Clearly in a HA cluster, it's not acceptable to spend minutes, let alone hours or days, finding an optimal solution while services remain unavailable. So instead of trying to solve the problem completely, Pacemaker uses a 'best effort' algorithm for determining which node should host a particular service. This means it arrives at a solution much faster than traditional linear programming algorithms, but by doing so at the price of leaving some services stopped. In the contrived example at the start of this chapter: - +rsc-small+ would be allocated to +node1+ - +rsc-medium+ would be allocated to +node2+ - +rsc-large+ would remain inactive Which is not ideal. There are various approaches to dealing with the limitations of pacemaker's placement strategy: Ensure you have sufficient physical capacity.:: It might sound obvious, but if the physical capacity of your nodes is (close to) maxed out by the cluster under normal conditions, then failover isn't going to go well. Even without the utilization feature, you'll start hitting timeouts and getting secondary failures. Build some buffer into the capabilities advertised by the nodes.:: Advertise slightly more resources than we physically have, on the (usually valid) assumption that a resource will not use 100% of the configured amount of CPU, memory and so forth 'all' the time. This practice is sometimes called 'overcommit'. Specify resource priorities.:: If the cluster is going to sacrifice services, it should be the ones you care about (comparatively) the least. Ensure that resource priorities are properly set so that your most important resources are scheduled first. diff --git a/include/crm/common/xml.h b/include/crm/common/xml.h index 601df3dfdd..ad9edbc8e0 100644 --- a/include/crm/common/xml.h +++ b/include/crm/common/xml.h @@ -1,398 +1,398 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_COMMON_XML__H # define CRM_COMMON_XML__H /** * \file * \brief Wrappers for and extensions to libxml2 * \ingroup core */ # include # include # include # include # include # include # include # include # include -/* Compression costs a LOT, don't do it unless we're hitting message limits +/* Define compression parameters for IPC messages * - * For now, use 256k as the lower size, which means we can have 4 big data fields - * before we hit heartbeat's message limit - * - * The previous limit was 10k, compressing 184 of 1071 messages accounted for 23% - * of the total CPU used by the cib + * Compression costs a LOT, so we don't want to do it unless we're hitting + * message limits. Currently, we use 128KB as the threshold, because higher + * values don't play well with the heartbeat stack. With an earlier limit of + * 10KB, compressing 184 of 1071 messages accounted for 23% of the total CPU + * used by the cib. */ # define CRM_BZ2_BLOCKS 4 # define CRM_BZ2_WORK 20 # define CRM_BZ2_THRESHOLD 128 * 1024 # define XML_PARANOIA_CHECKS 0 gboolean add_message_xml(xmlNode * msg, const char *field, xmlNode * xml); xmlNode *get_message_xml(xmlNode * msg, const char *field); GHashTable *xml2list(xmlNode * parent); void hash2nvpair(gpointer key, gpointer value, gpointer user_data); void hash2field(gpointer key, gpointer value, gpointer user_data); void hash2metafield(gpointer key, gpointer value, gpointer user_data); void hash2smartfield(gpointer key, gpointer value, gpointer user_data); xmlDoc *getDocPtr(xmlNode * node); /* * Replacement function for xmlCopyPropList which at the very least, * doesn't work the way *I* would expect it to. * * Copy all the attributes/properties from src into target. * * Not recursive, does not return anything. * */ void copy_in_properties(xmlNode * target, xmlNode * src); void expand_plus_plus(xmlNode * target, const char *name, const char *value); void fix_plus_plus_recursive(xmlNode * target); /* * Create a node named "name" as a child of "parent" * If parent is NULL, creates an unconnected node. * * Returns the created node * */ xmlNode *create_xml_node(xmlNode * parent, const char *name); /* * Make a copy of name and value and use the copied memory to create * an attribute for node. * * If node, name or value are NULL, nothing is done. * * If name or value are an empty string, nothing is done. * * Returns FALSE on failure and TRUE on success. * */ const char *crm_xml_add(xmlNode * node, const char *name, const char *value); const char *crm_xml_replace(xmlNode * node, const char *name, const char *value); const char *crm_xml_add_int(xmlNode * node, const char *name, int value); /*! * \brief Add a boolean attribute to an XML object * * Add an attribute with the value XML_BOOLEAN_TRUE or XML_BOOLEAN_FALSE * as appropriate to an XML object. * * \param[in/out] node XML object to add attribute to * \param[in] name Name of attribute to add * \param[in] value Boolean whose value will be tested * * \return Pointer to newly created XML attribute's content, or NULL on error */ static inline const char * crm_xml_add_boolean(xmlNode *node, const char *name, gboolean value) { return crm_xml_add(node, name, (value? "true" : "false")); } /* * Unlink the node and set its doc pointer to NULL so free_xml() * will act appropriately */ void unlink_xml_node(xmlNode * node); /* * */ void purge_diff_markers(xmlNode * a_node); /* * Returns a deep copy of src_node * */ xmlNode *copy_xml(xmlNode * src_node); /* * Add a copy of xml_node to new_parent */ xmlNode *add_node_copy(xmlNode * new_parent, xmlNode * xml_node); int add_node_nocopy(xmlNode * parent, const char *name, xmlNode * child); /* * XML I/O Functions * * Whitespace between tags is discarded. */ xmlNode *filename2xml(const char *filename); xmlNode *stdin2xml(void); xmlNode *string2xml(const char *input); int write_xml_fd(xmlNode * xml_node, const char *filename, int fd, gboolean compress); int write_xml_file(xmlNode * xml_node, const char *filename, gboolean compress); char *dump_xml_formatted(xmlNode * msg); /* Also dump the text node with xml_log_option_text enabled */ char *dump_xml_formatted_with_text(xmlNode * msg); char *dump_xml_unformatted(xmlNode * msg); /* * Diff related Functions */ xmlNode *diff_xml_object(xmlNode * left, xmlNode * right, gboolean suppress); xmlNode *subtract_xml_object(xmlNode * parent, xmlNode * left, xmlNode * right, gboolean full, gboolean * changed, const char *marker); gboolean can_prune_leaf(xmlNode * xml_node); void print_xml_diff(FILE * where, xmlNode * diff); gboolean apply_xml_diff(xmlNode * old, xmlNode * diff, xmlNode ** new); /* * Searching & Modifying */ xmlNode *find_xml_node(xmlNode * cib, const char *node_path, gboolean must_find); xmlNode *find_entity(xmlNode * parent, const char *node_name, const char *id); void xml_remove_prop(xmlNode * obj, const char *name); gboolean replace_xml_child(xmlNode * parent, xmlNode * child, xmlNode * update, gboolean delete_only); gboolean update_xml_child(xmlNode * child, xmlNode * to_update); int find_xml_children(xmlNode ** children, xmlNode * root, const char *tag, const char *field, const char *value, gboolean search_matches); int crm_element_value_int(xmlNode * data, const char *name, int *dest); char *crm_element_value_copy(xmlNode * data, const char *name); int crm_element_value_const_int(const xmlNode * data, const char *name, int *dest); const char *crm_element_value_const(const xmlNode * data, const char *name); xmlNode *get_xpath_object(const char *xpath, xmlNode * xml_obj, int error_level); xmlNode *get_xpath_object_relative(const char *xpath, xmlNode * xml_obj, int error_level); static inline const char * crm_element_name(xmlNode *xml) { return xml? (const char *)(xml->name) : NULL; } const char *crm_element_value(xmlNode * data, const char *name); /*! * \brief Copy an element from one XML object to another * * \param[in] obj1 Source XML * \param[in,out] obj2 Destination XML * \param[in] element Name of element to copy * * \return Pointer to copied value (from source) */ static inline const char * crm_copy_xml_element(xmlNode *obj1, xmlNode *obj2, const char *element) { const char *value = crm_element_value(obj1, element); crm_xml_add(obj2, element, value); return value; } void xml_validate(const xmlNode * root); gboolean xml_has_children(const xmlNode * root); char *calculate_on_disk_digest(xmlNode * local_cib); char *calculate_operation_digest(xmlNode * local_cib, const char *version); char *calculate_xml_versioned_digest(xmlNode * input, gboolean sort, gboolean do_filter, const char *version); /* schema-related functions (from schemas.c) */ gboolean validate_xml(xmlNode * xml_blob, const char *validation, gboolean to_logs); gboolean validate_xml_verbose(xmlNode * xml_blob); /*! * \brief Try update CIB XML to the highest pacemaker's standard if feasible * * "Update" means either actively employ XSLT-based transformation(s) * (if intermediate product to transform valid per its declared schema version, * transformation available, proceeded successfully with a result valid per * expectated newer schema version), or just try to bump the marked validating * schema until all gradually rising schema versions attested or the first * such attempt subsequently fails to validate. Which of the two styles will * be used depends on \p transform parameter (positive/negative, respectively). * * \param[in/out] xml_blob XML tree representing CIB, may be swapped with * an "updated" one * \param[out] best The highest configuration version (per its index * in the global schemas table) it was possible to * reach during the update steps while ensuring * the validity of the result; if no validation * success was observed against possibly multiple * schemas, the value is less or equal the result * of get_schema_version applied on the * input \p xml_blob value (unless that function * maps it to -1, then 0 would be used instead) * \param[in] max When \p transform is positive, this allows to * set upper boundary schema (per its index in the * global schemas table) beyond which its forbidden * to update by the means of XSLT transformation * \param[in] transform Whether to employ XSLT-based transformation so * as allow overcoming possible incompatibilities * between major schema versions (see above) * \param[in] to_logs If true, output notable progress info to * internal log streams; if false, to stderr * * \return pcmk_ok if no non-recoverable error encountered (up to * caller to evaluate if the update satisfies the requirements * per returned \p best value), negative value carrying the reason * otherwise */ int update_validation(xmlNode **xml_blob, int *best, int max, gboolean transform, gboolean to_logs); int get_schema_version(const char *name); const char *get_schema_name(int version); const char *xml_latest_schema(void); gboolean cli_config_update(xmlNode ** xml, int *best_version, gboolean to_logs); void crm_xml_init(void); void crm_xml_cleanup(void); static inline xmlNode * __xml_first_child(xmlNode * parent) { xmlNode *child = NULL; if (parent) { child = parent->children; while (child && child->type == XML_TEXT_NODE) { child = child->next; } } return child; } static inline xmlNode * __xml_next(xmlNode * child) { if (child) { child = child->next; while (child && child->type == XML_TEXT_NODE) { child = child->next; } } return child; } static inline xmlNode * __xml_first_child_element(xmlNode * parent) { xmlNode *child = NULL; if (parent) { child = parent->children; } while (child) { if(child->type == XML_ELEMENT_NODE) { return child; } child = child->next; } return NULL; } static inline xmlNode * __xml_next_element(xmlNode * child) { while (child) { child = child->next; if(child && child->type == XML_ELEMENT_NODE) { return child; } } return NULL; } void free_xml(xmlNode * child); xmlNode *first_named_child(xmlNode * parent, const char *name); xmlNode *sorted_xml(xmlNode * input, xmlNode * parent, gboolean recursive); xmlXPathObjectPtr xpath_search(xmlNode * xml_top, const char *path); void crm_foreach_xpath_result(xmlNode *xml, const char *xpath, void (*helper)(xmlNode*, void*), void *user_data); xmlNode *expand_idref(xmlNode * input, xmlNode * top); void freeXpathObject(xmlXPathObjectPtr xpathObj); xmlNode *getXpathResult(xmlXPathObjectPtr xpathObj, int index); void dedupXpathResults(xmlXPathObjectPtr xpathObj); static inline int numXpathResults(xmlXPathObjectPtr xpathObj) { if(xpathObj == NULL || xpathObj->nodesetval == NULL) { return 0; } return xpathObj->nodesetval->nodeNr; } bool xml_acl_enabled(xmlNode *xml); void xml_acl_disable(xmlNode *xml); bool xml_acl_denied(xmlNode *xml); /* Part or all of a change was rejected */ bool xml_acl_filtered_copy(const char *user, xmlNode* acl_source, xmlNode *xml, xmlNode ** result); bool xml_tracking_changes(xmlNode * xml); bool xml_document_dirty(xmlNode *xml); void xml_track_changes(xmlNode * xml, const char *user, xmlNode *acl_source, bool enforce_acls); void xml_calculate_changes(xmlNode * old, xmlNode * new); /* For comparing two documents after the fact */ void xml_accept_changes(xmlNode * xml); void xml_log_changes(uint8_t level, const char *function, xmlNode *xml); void xml_log_patchset(uint8_t level, const char *function, xmlNode *xml); bool xml_patch_versions(xmlNode *patchset, int add[3], int del[3]); xmlNode *xml_create_patchset( int format, xmlNode *source, xmlNode *target, bool *config, bool manage_version); int xml_apply_patchset(xmlNode *xml, xmlNode *patchset, bool check_version); void patchset_process_digest(xmlNode *patch, xmlNode *source, xmlNode *target, bool with_digest); void save_xml_to_file(xmlNode * xml, const char *desc, const char *filename); char *xml_get_path(xmlNode *xml); char * crm_xml_escape(const char *text); void crm_xml_sanitize_id(char *id); void crm_xml_set_id(xmlNode *xml, const char *format, ...) __attribute__ ((__format__ (__printf__, 2, 3))); #endif diff --git a/include/crm_config.h.in b/include/crm_config.h.in index 7eb01d424a..7532ac17c5 100755 --- a/include/crm_config.h.in +++ b/include/crm_config.h.in @@ -1,105 +1,102 @@ /* crm_config.h.in */ /* * Copyright (C) 2006 - 2007 * Andrew Beekhof * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef CRM_CONFIG__H__IN #define CRM_CONFIG__H__IN /****** Versions ******/ /* Current pacemaker version */ #undef PACEMAKER_VERSION /* Build version */ #undef BUILD_VERSION -/* Current version of the Pacemaker Relax-NG Schema */ -#undef CRM_DTD_VERSION - /****** Other ******/ /* Group to run Pacemaker daemons as */ #undef CRM_DAEMON_GROUP /* User to run Pacemaker daemons as */ #undef CRM_DAEMON_USER /****** Directories ******/ /* Where Pacemaker can store log files */ #undef CRM_LOG_DIR /* Location for Pacemaker daemons */ #undef CRM_DAEMON_DIR /* Where to keep blackbox dumps */ #undef CRM_BLACKBOX_DIR /* Where to keep configuration files */ #undef CRM_CONFIG_DIR /* Where to keep PEngine outputs */ #undef PE_STATE_DIR /* Location to store core files produced by Pacemaker daemons */ #undef CRM_CORE_DIR /* Where to keep state files and sockets */ #undef CRM_STATE_DIR /* Location for the Pacemaker Relax-NG Schema */ #undef CRM_DTD_DIRECTORY /****** Features ******/ /* Set of enabled features */ #undef CRM_FEATURES /* Support the Heartbeat messaging and membership layer */ #undef SUPPORT_HEARTBEAT /* Support the Corosync messaging and membership layer */ #undef SUPPORT_COROSYNC /* Support the Pacemaker plugin for Corosync */ #undef SUPPORT_PLUGIN /* Support the consumption of membership and quorum from corosync */ #undef SUPPORT_CS_QUORUM /* Support systemd based system services */ #undef SUPPORT_SYSTEMD /* Support upstart based system services */ #undef SUPPORT_UPSTART /****** Compatibility ******/ /* Does corosync use libqb for its ipc */ #undef CS_USES_LIBQB /* Compatibility alias for SUPPORT_COROSYNC */ #undef SUPPORT_AIS /* Compatibility alias for SUPPORT_COROSYNC */ #undef AIS_COROSYNC #endif /* CRM_CONFIG__H__IN */ diff --git a/lib/common/mainloop.c b/lib/common/mainloop.c index 385ca7da36..0ebdafcccd 100644 --- a/lib/common/mainloop.c +++ b/lib/common/mainloop.c @@ -1,1251 +1,1251 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #ifndef _GNU_SOURCE # define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include #include struct mainloop_child_s { pid_t pid; char *desc; unsigned timerid; unsigned watchid; gboolean timeout; void *privatedata; enum mainloop_child_flags flags; /* Called when a process dies */ void (*callback) (mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode); }; struct trigger_s { GSource source; gboolean running; gboolean trigger; void *user_data; guint id; }; static gboolean crm_trigger_prepare(GSource * source, gint * timeout) { crm_trigger_t *trig = (crm_trigger_t *) source; /* cluster-glue's FD and IPC related sources make use of * g_source_add_poll() but do not set a timeout in their prepare * functions * * This means mainloop's poll() will block until an event for one * of these sources occurs - any /other/ type of source, such as * this one or g_idle_*, that doesn't use g_source_add_poll() is * S-O-L and won't be processed until there is something fd-based * happens. * * Luckily the timeout we can set here affects all sources and * puts an upper limit on how long poll() can take. * * So unconditionally set a small-ish timeout, not too small that * we're in constant motion, which will act as an upper bound on * how long the signal handling might be delayed for. */ *timeout = 500; /* Timeout in ms */ return trig->trigger; } static gboolean crm_trigger_check(GSource * source) { crm_trigger_t *trig = (crm_trigger_t *) source; return trig->trigger; } static gboolean crm_trigger_dispatch(GSource * source, GSourceFunc callback, gpointer userdata) { int rc = TRUE; crm_trigger_t *trig = (crm_trigger_t *) source; if (trig->running) { /* Wait until the existing job is complete before starting the next one */ return TRUE; } trig->trigger = FALSE; if (callback) { rc = callback(trig->user_data); if (rc < 0) { crm_trace("Trigger handler %p not yet complete", trig); trig->running = TRUE; rc = TRUE; } } return rc; } static void crm_trigger_finalize(GSource * source) { crm_trace("Trigger %p destroyed", source); } #if 0 struct _GSourceCopy { gpointer callback_data; GSourceCallbackFuncs *callback_funcs; const GSourceFuncs *source_funcs; guint ref_count; GMainContext *context; gint priority; guint flags; guint source_id; GSList *poll_fds; GSource *prev; GSource *next; char *name; void *priv; }; static int g_source_refcount(GSource * source) { /* Duplicating the contents of private header files is a necessary evil */ if (source) { struct _GSourceCopy *evil = (struct _GSourceCopy*)source; return evil->ref_count; } return 0; } #else static int g_source_refcount(GSource * source) { return 0; } #endif static GSourceFuncs crm_trigger_funcs = { crm_trigger_prepare, crm_trigger_check, crm_trigger_dispatch, crm_trigger_finalize, }; static crm_trigger_t * mainloop_setup_trigger(GSource * source, int priority, int (*dispatch) (gpointer user_data), gpointer userdata) { crm_trigger_t *trigger = NULL; trigger = (crm_trigger_t *) source; trigger->id = 0; trigger->trigger = FALSE; trigger->user_data = userdata; if (dispatch) { g_source_set_callback(source, dispatch, trigger, NULL); } g_source_set_priority(source, priority); g_source_set_can_recurse(source, FALSE); crm_trace("Setup %p with ref-count=%u", source, g_source_refcount(source)); trigger->id = g_source_attach(source, NULL); crm_trace("Attached %p with ref-count=%u", source, g_source_refcount(source)); return trigger; } void mainloop_trigger_complete(crm_trigger_t * trig) { crm_trace("Trigger handler %p complete", trig); trig->running = FALSE; } /* If dispatch returns: * -1: Job running but not complete * 0: Remove the trigger from mainloop * 1: Leave the trigger in mainloop */ crm_trigger_t * mainloop_add_trigger(int priority, int (*dispatch) (gpointer user_data), gpointer userdata) { GSource *source = NULL; CRM_ASSERT(sizeof(crm_trigger_t) > sizeof(GSource)); source = g_source_new(&crm_trigger_funcs, sizeof(crm_trigger_t)); CRM_ASSERT(source != NULL); return mainloop_setup_trigger(source, priority, dispatch, userdata); } void mainloop_set_trigger(crm_trigger_t * source) { if(source) { source->trigger = TRUE; } } gboolean mainloop_destroy_trigger(crm_trigger_t * source) { GSource *gs = NULL; if(source == NULL) { return TRUE; } gs = (GSource *)source; if(g_source_refcount(gs) > 2) { crm_info("Trigger %p is still referenced %u times", gs, g_source_refcount(gs)); } g_source_destroy(gs); /* Remove from mainloop, ref_count-- */ g_source_unref(gs); /* The caller no longer carries a reference to source * * At this point the source should be free'd, * unless we're currently processing said * source, in which case mainloop holds an * additional reference and it will be free'd * once our processing completes */ return TRUE; } typedef struct signal_s { crm_trigger_t trigger; /* must be first */ void (*handler) (int sig); int signal; } crm_signal_t; static crm_signal_t *crm_signals[NSIG]; static gboolean crm_signal_dispatch(GSource * source, GSourceFunc callback, gpointer userdata) { crm_signal_t *sig = (crm_signal_t *) source; if(sig->signal != SIGCHLD) { crm_notice("Caught '%s' signal "CRM_XS" %d (%s handler)", strsignal(sig->signal), sig->signal, (sig->handler? "invoking" : "no")); } sig->trigger.trigger = FALSE; if (sig->handler) { sig->handler(sig->signal); } return TRUE; } static void mainloop_signal_handler(int sig) { if (sig > 0 && sig < NSIG && crm_signals[sig] != NULL) { mainloop_set_trigger((crm_trigger_t *) crm_signals[sig]); } } static GSourceFuncs crm_signal_funcs = { crm_trigger_prepare, crm_trigger_check, crm_signal_dispatch, crm_trigger_finalize, }; gboolean crm_signal(int sig, void (*dispatch) (int sig)) { sigset_t mask; struct sigaction sa; struct sigaction old; if (sigemptyset(&mask) < 0) { crm_perror(LOG_ERR, "Call to sigemptyset failed"); return FALSE; } memset(&sa, 0, sizeof(struct sigaction)); sa.sa_handler = dispatch; sa.sa_flags = SA_RESTART; sa.sa_mask = mask; if (sigaction(sig, &sa, &old) < 0) { crm_perror(LOG_ERR, "Could not install signal handler for signal %d", sig); return FALSE; } return TRUE; } gboolean mainloop_add_signal(int sig, void (*dispatch) (int sig)) { GSource *source = NULL; int priority = G_PRIORITY_HIGH - 1; if (sig == SIGTERM) { /* TERM is higher priority than other signals, * signals are higher priority than other ipc. * Yes, minus: smaller is "higher" */ priority--; } if (sig >= NSIG || sig < 0) { crm_err("Signal %d is out of range", sig); return FALSE; } else if (crm_signals[sig] != NULL && crm_signals[sig]->handler == dispatch) { crm_trace("Signal handler for %d is already installed", sig); return TRUE; } else if (crm_signals[sig] != NULL) { crm_err("Different signal handler for %d is already installed", sig); return FALSE; } CRM_ASSERT(sizeof(crm_signal_t) > sizeof(GSource)); source = g_source_new(&crm_signal_funcs, sizeof(crm_signal_t)); crm_signals[sig] = (crm_signal_t *) mainloop_setup_trigger(source, priority, NULL, NULL); CRM_ASSERT(crm_signals[sig] != NULL); crm_signals[sig]->handler = dispatch; crm_signals[sig]->signal = sig; if (crm_signal(sig, mainloop_signal_handler) == FALSE) { crm_signal_t *tmp = crm_signals[sig]; crm_signals[sig] = NULL; mainloop_destroy_trigger((crm_trigger_t *) tmp); return FALSE; } #if 0 /* If we want signals to interrupt mainloop's poll(), instead of waiting for * the timeout, then we should call siginterrupt() below * * For now, just enforce a low timeout */ if (siginterrupt(sig, 1) < 0) { crm_perror(LOG_INFO, "Could not enable system call interruptions for signal %d", sig); } #endif return TRUE; } gboolean mainloop_destroy_signal(int sig) { crm_signal_t *tmp = NULL; if (sig >= NSIG || sig < 0) { crm_err("Signal %d is out of range", sig); return FALSE; } else if (crm_signal(sig, NULL) == FALSE) { crm_perror(LOG_ERR, "Could not uninstall signal handler for signal %d", sig); return FALSE; } else if (crm_signals[sig] == NULL) { return TRUE; } crm_trace("Destroying signal %d", sig); tmp = crm_signals[sig]; crm_signals[sig] = NULL; mainloop_destroy_trigger((crm_trigger_t *) tmp); return TRUE; } static qb_array_t *gio_map = NULL; void mainloop_cleanup(void) { if(gio_map) { qb_array_free(gio_map); } } /* * libqb... */ struct gio_to_qb_poll { int32_t is_used; guint source; int32_t events; void *data; qb_ipcs_dispatch_fn_t fn; enum qb_loop_priority p; }; static gboolean gio_read_socket(GIOChannel * gio, GIOCondition condition, gpointer data) { struct gio_to_qb_poll *adaptor = (struct gio_to_qb_poll *)data; gint fd = g_io_channel_unix_get_fd(gio); crm_trace("%p.%d %d", data, fd, condition); /* if this assert get's hit, then there is a race condition between * when we destroy a fd and when mainloop actually gives it up */ CRM_ASSERT(adaptor->is_used > 0); return (adaptor->fn(fd, condition, adaptor->data) == 0); } static void gio_poll_destroy(gpointer data) { struct gio_to_qb_poll *adaptor = (struct gio_to_qb_poll *)data; adaptor->is_used--; CRM_ASSERT(adaptor->is_used >= 0); if (adaptor->is_used == 0) { crm_trace("Marking adaptor %p unused", adaptor); adaptor->source = 0; } } static int32_t gio_poll_dispatch_update(enum qb_loop_priority p, int32_t fd, int32_t evts, void *data, qb_ipcs_dispatch_fn_t fn, int32_t add) { struct gio_to_qb_poll *adaptor; GIOChannel *channel; int32_t res = 0; res = qb_array_index(gio_map, fd, (void **)&adaptor); if (res < 0) { crm_err("Array lookup failed for fd=%d: %d", fd, res); return res; } crm_trace("Adding fd=%d to mainloop as adaptor %p", fd, adaptor); if (add && adaptor->source) { crm_err("Adaptor for descriptor %d is still in-use", fd); return -EEXIST; } if (!add && !adaptor->is_used) { crm_err("Adaptor for descriptor %d is not in-use", fd); return -ENOENT; } /* channel is created with ref_count = 1 */ channel = g_io_channel_unix_new(fd); if (!channel) { crm_err("No memory left to add fd=%d", fd); return -ENOMEM; } if (adaptor->source) { g_source_remove(adaptor->source); adaptor->source = 0; } /* Because unlike the poll() API, glib doesn't tell us about HUPs by default */ evts |= (G_IO_HUP | G_IO_NVAL | G_IO_ERR); adaptor->fn = fn; adaptor->events = evts; adaptor->data = data; adaptor->p = p; adaptor->is_used++; adaptor->source = g_io_add_watch_full(channel, G_PRIORITY_DEFAULT, evts, gio_read_socket, adaptor, gio_poll_destroy); /* Now that mainloop now holds a reference to channel, * thanks to g_io_add_watch_full(), drop ours from g_io_channel_unix_new(). * * This means that channel will be free'd by: * g_main_context_dispatch() * -> g_source_destroy_internal() * -> g_source_callback_unref() * shortly after gio_poll_destroy() completes */ g_io_channel_unref(channel); crm_trace("Added to mainloop with gsource id=%d", adaptor->source); if (adaptor->source > 0) { return 0; } return -EINVAL; } static int32_t gio_poll_dispatch_add(enum qb_loop_priority p, int32_t fd, int32_t evts, void *data, qb_ipcs_dispatch_fn_t fn) { return gio_poll_dispatch_update(p, fd, evts, data, fn, QB_TRUE); } static int32_t gio_poll_dispatch_mod(enum qb_loop_priority p, int32_t fd, int32_t evts, void *data, qb_ipcs_dispatch_fn_t fn) { return gio_poll_dispatch_update(p, fd, evts, data, fn, QB_FALSE); } static int32_t gio_poll_dispatch_del(int32_t fd) { struct gio_to_qb_poll *adaptor; crm_trace("Looking for fd=%d", fd); if (qb_array_index(gio_map, fd, (void **)&adaptor) == 0) { if (adaptor->source) { g_source_remove(adaptor->source); adaptor->source = 0; } } return 0; } struct qb_ipcs_poll_handlers gio_poll_funcs = { .job_add = NULL, .dispatch_add = gio_poll_dispatch_add, .dispatch_mod = gio_poll_dispatch_mod, .dispatch_del = gio_poll_dispatch_del, }; static enum qb_ipc_type pick_ipc_type(enum qb_ipc_type requested) { const char *env = getenv("PCMK_ipc_type"); if (env && strcmp("shared-mem", env) == 0) { return QB_IPC_SHM; } else if (env && strcmp("socket", env) == 0) { return QB_IPC_SOCKET; } else if (env && strcmp("posix", env) == 0) { return QB_IPC_POSIX_MQ; } else if (env && strcmp("sysv", env) == 0) { return QB_IPC_SYSV_MQ; } else if (requested == QB_IPC_NATIVE) { /* We prefer shared memory because the server never blocks on * send. If part of a message fits into the socket, libqb * needs to block until the remainder can be sent also. * Otherwise the client will wait forever for the remaining * bytes. */ return QB_IPC_SHM; } return requested; } qb_ipcs_service_t * mainloop_add_ipc_server(const char *name, enum qb_ipc_type type, struct qb_ipcs_service_handlers * callbacks) { int rc = 0; qb_ipcs_service_t *server = NULL; if (gio_map == NULL) { gio_map = qb_array_create_2(64, sizeof(struct gio_to_qb_poll), 1); } crm_client_init(); server = qb_ipcs_create(name, 0, pick_ipc_type(type), callbacks); #ifdef HAVE_IPCS_GET_BUFFER_SIZE /* All clients should use at least ipc_buffer_max as their buffer size */ qb_ipcs_enforce_buffer_size(server, crm_ipc_default_buffer_size()); #endif qb_ipcs_poll_handlers_set(server, &gio_poll_funcs); rc = qb_ipcs_run(server); if (rc < 0) { crm_err("Could not start %s IPC server: %s (%d)", name, pcmk_strerror(rc), rc); return NULL; } return server; } void mainloop_del_ipc_server(qb_ipcs_service_t * server) { if (server) { qb_ipcs_destroy(server); } } struct mainloop_io_s { char *name; void *userdata; int fd; guint source; crm_ipc_t *ipc; GIOChannel *channel; int (*dispatch_fn_ipc) (const char *buffer, ssize_t length, gpointer userdata); int (*dispatch_fn_io) (gpointer userdata); void (*destroy_fn) (gpointer userdata); }; static gboolean mainloop_gio_callback(GIOChannel * gio, GIOCondition condition, gpointer data) { gboolean keep = TRUE; mainloop_io_t *client = data; CRM_ASSERT(client->fd == g_io_channel_unix_get_fd(gio)); if (condition & G_IO_IN) { if (client->ipc) { long rc = 0; int max = 10; do { rc = crm_ipc_read(client->ipc); if (rc <= 0) { crm_trace("Message acquisition from %s[%p] failed: %s (%ld)", client->name, client, pcmk_strerror(rc), rc); } else if (client->dispatch_fn_ipc) { const char *buffer = crm_ipc_buffer(client->ipc); crm_trace("New message from %s[%p] = %ld (I/O condition=%d)", client->name, client, rc, condition); if (client->dispatch_fn_ipc(buffer, rc, client->userdata) < 0) { crm_trace("Connection to %s no longer required", client->name); keep = FALSE; } } } while (keep && rc > 0 && --max > 0); } else { crm_trace("New message from %s[%p] %u", client->name, client, condition); if (client->dispatch_fn_io) { if (client->dispatch_fn_io(client->userdata) < 0) { crm_trace("Connection to %s no longer required", client->name); keep = FALSE; } } } } if (client->ipc && crm_ipc_connected(client->ipc) == FALSE) { crm_err("Connection to %s[%p] closed (I/O condition=%d)", client->name, client, condition); keep = FALSE; } else if (condition & (G_IO_HUP | G_IO_NVAL | G_IO_ERR)) { crm_trace("The connection %s[%p] has been closed (I/O condition=%d)", client->name, client, condition); keep = FALSE; } else if ((condition & G_IO_IN) == 0) { /* #define GLIB_SYSDEF_POLLIN =1 #define GLIB_SYSDEF_POLLPRI =2 #define GLIB_SYSDEF_POLLOUT =4 #define GLIB_SYSDEF_POLLERR =8 #define GLIB_SYSDEF_POLLHUP =16 #define GLIB_SYSDEF_POLLNVAL =32 typedef enum { G_IO_IN GLIB_SYSDEF_POLLIN, G_IO_OUT GLIB_SYSDEF_POLLOUT, G_IO_PRI GLIB_SYSDEF_POLLPRI, G_IO_ERR GLIB_SYSDEF_POLLERR, G_IO_HUP GLIB_SYSDEF_POLLHUP, G_IO_NVAL GLIB_SYSDEF_POLLNVAL } GIOCondition; A bitwise combination representing a condition to watch for on an event source. G_IO_IN There is data to read. G_IO_OUT Data can be written (without blocking). G_IO_PRI There is urgent data to read. G_IO_ERR Error condition. G_IO_HUP Hung up (the connection has been broken, usually for pipes and sockets). G_IO_NVAL Invalid request. The file descriptor is not open. */ crm_err("Strange condition: %d", condition); } /* keep == FALSE results in mainloop_gio_destroy() being called * just before the source is removed from mainloop */ return keep; } static void mainloop_gio_destroy(gpointer c) { mainloop_io_t *client = c; char *c_name = strdup(client->name); /* client->source is valid but about to be destroyed (ref_count == 0) in gmain.c * client->channel will still have ref_count > 0... should be == 1 */ crm_trace("Destroying client %s[%p]", c_name, c); if (client->ipc) { crm_ipc_close(client->ipc); } if (client->destroy_fn) { void (*destroy_fn) (gpointer userdata) = client->destroy_fn; client->destroy_fn = NULL; destroy_fn(client->userdata); } if (client->ipc) { crm_ipc_t *ipc = client->ipc; client->ipc = NULL; crm_ipc_destroy(ipc); } crm_trace("Destroyed client %s[%p]", c_name, c); free(client->name); client->name = NULL; free(client); free(c_name); } mainloop_io_t * mainloop_add_ipc_client(const char *name, int priority, size_t max_size, void *userdata, struct ipc_client_callbacks *callbacks) { mainloop_io_t *client = NULL; crm_ipc_t *conn = crm_ipc_new(name, max_size); if (conn && crm_ipc_connect(conn)) { int32_t fd = crm_ipc_get_fd(conn); client = mainloop_add_fd(name, priority, fd, userdata, NULL); } if (client == NULL) { crm_perror(LOG_TRACE, "Connection to %s failed", name); if (conn) { crm_ipc_close(conn); crm_ipc_destroy(conn); } return NULL; } client->ipc = conn; client->destroy_fn = callbacks->destroy; client->dispatch_fn_ipc = callbacks->dispatch; return client; } void mainloop_del_ipc_client(mainloop_io_t * client) { mainloop_del_fd(client); } crm_ipc_t * mainloop_get_ipc_client(mainloop_io_t * client) { if (client) { return client->ipc; } return NULL; } mainloop_io_t * mainloop_add_fd(const char *name, int priority, int fd, void *userdata, struct mainloop_fd_callbacks * callbacks) { mainloop_io_t *client = NULL; if (fd >= 0) { client = calloc(1, sizeof(mainloop_io_t)); if (client == NULL) { return NULL; } client->name = strdup(name); client->userdata = userdata; if (callbacks) { client->destroy_fn = callbacks->destroy; client->dispatch_fn_io = callbacks->dispatch; } client->fd = fd; client->channel = g_io_channel_unix_new(fd); client->source = g_io_add_watch_full(client->channel, priority, (G_IO_IN | G_IO_HUP | G_IO_NVAL | G_IO_ERR), mainloop_gio_callback, client, mainloop_gio_destroy); /* Now that mainloop now holds a reference to channel, * thanks to g_io_add_watch_full(), drop ours from g_io_channel_unix_new(). * * This means that channel will be free'd by: * g_main_context_dispatch() or g_source_remove() * -> g_source_destroy_internal() * -> g_source_callback_unref() * shortly after mainloop_gio_destroy() completes */ g_io_channel_unref(client->channel); crm_trace("Added connection %d for %s[%p].%d", client->source, client->name, client, fd); } else { errno = EINVAL; } return client; } void mainloop_del_fd(mainloop_io_t * client) { if (client != NULL) { crm_trace("Removing client %s[%p]", client->name, client); if (client->source) { /* Results in mainloop_gio_destroy() being called just * before the source is removed from mainloop */ g_source_remove(client->source); } } } static GListPtr child_list = NULL; pid_t mainloop_child_pid(mainloop_child_t * child) { return child->pid; } const char * mainloop_child_name(mainloop_child_t * child) { return child->desc; } int mainloop_child_timeout(mainloop_child_t * child) { return child->timeout; } void * mainloop_child_userdata(mainloop_child_t * child) { return child->privatedata; } void mainloop_clear_child_userdata(mainloop_child_t * child) { child->privatedata = NULL; } /* good function name */ static void child_free(mainloop_child_t *child) { if (child->timerid != 0) { crm_trace("Removing timer %d", child->timerid); g_source_remove(child->timerid); child->timerid = 0; } free(child->desc); free(child); } /* terrible function name */ static int child_kill_helper(mainloop_child_t *child) { int rc; if (child->flags & mainloop_leave_pid_group) { crm_debug("Kill pid %d only. leave group intact.", child->pid); rc = kill(child->pid, SIGKILL); } else { crm_debug("Kill pid %d's group", child->pid); rc = kill(-child->pid, SIGKILL); } if (rc < 0) { if (errno != ESRCH) { crm_perror(LOG_ERR, "kill(%d, KILL) failed", child->pid); } return -errno; } return 0; } static gboolean child_timeout_callback(gpointer p) { mainloop_child_t *child = p; int rc = 0; child->timerid = 0; if (child->timeout) { crm_crit("%s process (PID %d) will not die!", child->desc, (int)child->pid); return FALSE; } rc = child_kill_helper(child); if (rc == ESRCH) { /* Nothing left to do. pid doesn't exist */ return FALSE; } child->timeout = TRUE; crm_warn("%s process (PID %d) timed out", child->desc, (int)child->pid); child->timerid = g_timeout_add(5000, child_timeout_callback, child); return FALSE; } static gboolean child_waitpid(mainloop_child_t *child, int flags) { int rc = 0; int core = 0; int signo = 0; int status = 0; int exitcode = 0; rc = waitpid(child->pid, &status, flags); if(rc == 0) { crm_perror(LOG_DEBUG, "wait(%d) = %d", child->pid, rc); return FALSE; } else if(rc != child->pid) { signo = SIGCHLD; exitcode = 1; status = 1; crm_perror(LOG_ERR, "Call to waitpid(%d) failed", child->pid); } else { crm_trace("Managed process %d exited: %p", child->pid, child); if (WIFEXITED(status)) { exitcode = WEXITSTATUS(status); crm_trace("Managed process %d (%s) exited with rc=%d", child->pid, child->desc, exitcode); } else if (WIFSIGNALED(status)) { signo = WTERMSIG(status); crm_trace("Managed process %d (%s) exited with signal=%d", child->pid, child->desc, signo); } #ifdef WCOREDUMP if (WCOREDUMP(status)) { core = 1; crm_err("Managed process %d (%s) dumped core", child->pid, child->desc); } #endif } if (child->callback) { child->callback(child, child->pid, core, signo, exitcode); } return TRUE; } static void child_death_dispatch(int signal) { GListPtr iter = child_list; gboolean exited; while(iter) { GListPtr saved = NULL; mainloop_child_t *child = iter->data; exited = child_waitpid(child, WNOHANG); saved = iter; iter = iter->next; if (exited == FALSE) { continue; } crm_trace("Removing process entry %p for %d", child, child->pid); child_list = g_list_remove_link(child_list, saved); g_list_free(saved); child_free(child); } } static gboolean child_signal_init(gpointer p) { crm_trace("Installed SIGCHLD handler"); /* Do NOT use g_child_watch_add() and friends, they rely on pthreads */ mainloop_add_signal(SIGCHLD, child_death_dispatch); /* In case they terminated before the signal handler was installed */ child_death_dispatch(SIGCHLD); return FALSE; } int mainloop_child_kill(pid_t pid) { GListPtr iter; mainloop_child_t *child = NULL; mainloop_child_t *match = NULL; /* It is impossible to block SIGKILL, this allows us to * call waitpid without WNOHANG flag.*/ int waitflags = 0, rc = 0; for (iter = child_list; iter != NULL && match == NULL; iter = iter->next) { child = iter->data; if (pid == child->pid) { match = child; } } if (match == NULL) { return FALSE; } rc = child_kill_helper(match); if(rc == -ESRCH) { - /* Its gone, but hasn't shown up in waitpid() yet + /* It's gone, but hasn't shown up in waitpid() yet * * Wait until we get SIGCHLD and let child_death_dispatch() * clean it up as normal (so we get the correct return * code/status) * * The blocking alternative would be to call: * child_waitpid(match, 0); */ crm_trace("Waiting for child %d to be reaped by child_death_dispatch()", match->pid); return TRUE; } else if(rc != 0) { /* If KILL for some other reason set the WNOHANG flag since we * can't be certain what happened. */ waitflags = WNOHANG; } if (child_waitpid(match, waitflags) == FALSE) { /* not much we can do if this occurs */ return FALSE; } child_list = g_list_remove(child_list, match); child_free(match); return TRUE; } /* Create/Log a new tracked process * To track a process group, use -pid */ void mainloop_child_add_with_flags(pid_t pid, int timeout, const char *desc, void *privatedata, enum mainloop_child_flags flags, void (*callback) (mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)) { static bool need_init = TRUE; mainloop_child_t *child = g_new(mainloop_child_t, 1); child->pid = pid; child->timerid = 0; child->timeout = FALSE; child->privatedata = privatedata; child->callback = callback; child->flags = flags; if(desc) { child->desc = strdup(desc); } if (timeout) { child->timerid = g_timeout_add(timeout, child_timeout_callback, child); } child_list = g_list_append(child_list, child); if(need_init) { need_init = FALSE; /* SIGCHLD processing has to be invoked from mainloop. * We do not want it to be possible to both add a child pid * to mainloop, and have the pid's exit callback invoked within * the same callstack. */ g_timeout_add(1, child_signal_init, NULL); } } void mainloop_child_add(pid_t pid, int timeout, const char *desc, void *privatedata, void (*callback) (mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)) { mainloop_child_add_with_flags(pid, timeout, desc, privatedata, 0, callback); } struct mainloop_timer_s { guint id; guint period_ms; bool repeat; char *name; GSourceFunc cb; void *userdata; }; struct mainloop_timer_s mainloop; static gboolean mainloop_timer_cb(gpointer user_data) { int id = 0; bool repeat = FALSE; struct mainloop_timer_s *t = user_data; CRM_ASSERT(t != NULL); id = t->id; t->id = 0; /* Ensure it's unset during callbacks so that * mainloop_timer_running() works as expected */ if(t->cb) { crm_trace("Invoking callbacks for timer %s", t->name); repeat = t->repeat; if(t->cb(t->userdata) == FALSE) { crm_trace("Timer %s complete", t->name); repeat = FALSE; } } if(repeat) { /* Restore if repeating */ t->id = id; } return repeat; } bool mainloop_timer_running(mainloop_timer_t *t) { if(t && t->id != 0) { return TRUE; } return FALSE; } void mainloop_timer_start(mainloop_timer_t *t) { mainloop_timer_stop(t); if(t && t->period_ms > 0) { crm_trace("Starting timer %s", t->name); t->id = g_timeout_add(t->period_ms, mainloop_timer_cb, t); } } void mainloop_timer_stop(mainloop_timer_t *t) { if(t && t->id != 0) { crm_trace("Stopping timer %s", t->name); g_source_remove(t->id); t->id = 0; } } guint mainloop_timer_set_period(mainloop_timer_t *t, guint period_ms) { guint last = 0; if(t) { last = t->period_ms; t->period_ms = period_ms; } if(t && t->id != 0 && last != t->period_ms) { mainloop_timer_start(t); } return last; } mainloop_timer_t * mainloop_timer_add(const char *name, guint period_ms, bool repeat, GSourceFunc cb, void *userdata) { mainloop_timer_t *t = calloc(1, sizeof(mainloop_timer_t)); if(t) { if(name) { t->name = crm_strdup_printf("%s-%u-%d", name, period_ms, repeat); } else { t->name = crm_strdup_printf("%p-%u-%d", t, period_ms, repeat); } t->id = 0; t->period_ms = period_ms; t->repeat = repeat; t->cb = cb; t->userdata = userdata; crm_trace("Created timer %s with %p %p", t->name, userdata, t->userdata); } return t; } void mainloop_timer_del(mainloop_timer_t *t) { if(t) { crm_trace("Destroying timer %s", t->name); mainloop_timer_stop(t); free(t->name); free(t); } } diff --git a/lib/pengine/container.c b/lib/pengine/container.c index b328861e86..b93d98f6e9 100644 --- a/lib/pengine/container.c +++ b/lib/pengine/container.c @@ -1,904 +1,978 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #define VARIANT_CONTAINER 1 #include "./variant.h" void tuple_free(container_grouping_t *tuple); static char * next_ip(const char *last_ip) { unsigned int oct1 = 0; unsigned int oct2 = 0; unsigned int oct3 = 0; unsigned int oct4 = 0; int rc = sscanf(last_ip, "%u.%u.%u.%u", &oct1, &oct2, &oct3, &oct4); if (rc != 4) { /*@ TODO check for IPv6 */ return NULL; } else if (oct3 > 253) { return NULL; } else if (oct4 > 253) { ++oct3; oct4 = 1; } else { ++oct4; } return crm_strdup_printf("%u.%u.%u.%u", oct1, oct2, oct3, oct4); } static int allocate_ip(container_variant_data_t *data, container_grouping_t *tuple, char *buffer, int max) { if(data->ip_range_start == NULL) { return 0; } else if(data->ip_last) { tuple->ipaddr = next_ip(data->ip_last); } else { tuple->ipaddr = strdup(data->ip_range_start); } data->ip_last = tuple->ipaddr; #if 0 return snprintf(buffer, max, " --add-host=%s-%d:%s --link %s-docker-%d:%s-link-%d", data->prefix, tuple->offset, tuple->ipaddr, data->prefix, tuple->offset, data->prefix, tuple->offset); #else return snprintf(buffer, max, " --add-host=%s-%d:%s", data->prefix, tuple->offset, tuple->ipaddr); #endif } static xmlNode * create_resource(const char *name, const char *provider, const char *kind) { xmlNode *rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE); crm_xml_add(rsc, XML_ATTR_ID, name); crm_xml_add(rsc, XML_AGENT_ATTR_CLASS, "ocf"); crm_xml_add(rsc, XML_AGENT_ATTR_PROVIDER, provider); crm_xml_add(rsc, XML_ATTR_TYPE, kind); return rsc; } static void create_nvp(xmlNode *parent, const char *name, const char *value) { xmlNode *xml_nvp = create_xml_node(parent, XML_CIB_TAG_NVPAIR); crm_xml_set_id(xml_nvp, "%s-%s", ID(parent), name); crm_xml_add(xml_nvp, XML_NVPAIR_ATTR_NAME, name); crm_xml_add(xml_nvp, XML_NVPAIR_ATTR_VALUE, value); } static void create_op(xmlNode *parent, const char *prefix, const char *task, const char *interval) { xmlNode *xml_op = create_xml_node(parent, "op"); crm_xml_set_id(xml_op, "%s-%s-%s", prefix, task, interval); crm_xml_add(xml_op, XML_LRM_ATTR_INTERVAL, interval); crm_xml_add(xml_op, "name", task); } /*! * \internal * \brief Check whether cluster can manage resource inside container * * \param[in] data Container variant data * * \return TRUE if networking configuration is acceptable, FALSE otherwise * * \note The resource is manageable if an IP range or control port has been * specified. If a control port is used without an IP range, replicas per * host must be 1. */ static bool valid_network(container_variant_data_t *data) { if(data->ip_range_start) { return TRUE; } if(data->control_port) { if(data->replicas_per_host > 1) { pe_err("Specifying the 'control-port' for %s requires 'replicas-per-host=1'", data->prefix); data->replicas_per_host = 1; /* @TODO to be sure: clear_bit(rsc->flags, pe_rsc_unique); */ } return TRUE; } return FALSE; } static bool create_ip_resource( resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple, pe_working_set_t * data_set) { if(data->ip_range_start) { char *id = NULL; xmlNode *xml_ip = NULL; xmlNode *xml_obj = NULL; id = crm_strdup_printf("%s-ip-%s", data->prefix, tuple->ipaddr); crm_xml_sanitize_id(id); xml_ip = create_resource(id, "heartbeat", "IPaddr2"); free(id); xml_obj = create_xml_node(xml_ip, XML_TAG_ATTR_SETS); crm_xml_set_id(xml_obj, "%s-attributes-%d", data->prefix, tuple->offset); create_nvp(xml_obj, "ip", tuple->ipaddr); if(data->host_network) { create_nvp(xml_obj, "nic", data->host_network); } if(data->host_netmask) { create_nvp(xml_obj, "cidr_netmask", data->host_netmask); } else { create_nvp(xml_obj, "cidr_netmask", "32"); } xml_obj = create_xml_node(xml_ip, "operations"); create_op(xml_obj, ID(xml_ip), "monitor", "60s"); // TODO: Other ops? Timeouts and intervals from underlying resource? if (common_unpack(xml_ip, &tuple->ip, NULL, data_set) == false) { return FALSE; } parent->children = g_list_append(parent->children, tuple->ip); } return TRUE; } static bool create_docker_resource( resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple, pe_working_set_t * data_set) { int offset = 0, max = 4096; char *buffer = calloc(1, max+1); int doffset = 0, dmax = 1024; char *dbuffer = calloc(1, dmax+1); char *id = NULL; xmlNode *xml_docker = NULL; xmlNode *xml_obj = NULL; id = crm_strdup_printf("%s-docker-%d", data->prefix, tuple->offset); crm_xml_sanitize_id(id); xml_docker = create_resource(id, "heartbeat", "docker"); free(id); xml_obj = create_xml_node(xml_docker, XML_TAG_ATTR_SETS); crm_xml_set_id(xml_obj, "%s-attributes-%d", data->prefix, tuple->offset); create_nvp(xml_obj, "image", data->image); create_nvp(xml_obj, "allow_pull", "true"); create_nvp(xml_obj, "force_kill", "false"); create_nvp(xml_obj, "reuse", "false"); offset += snprintf(buffer+offset, max-offset, "-h %s-%d --restart=no ", data->prefix, tuple->offset); if(data->docker_network) { // offset += snprintf(buffer+offset, max-offset, " --link-local-ip=%s", tuple->ipaddr); offset += snprintf(buffer+offset, max-offset, " --net=%s", data->docker_network); } if(data->control_port) { offset += snprintf(buffer+offset, max-offset, " -e PCMK_remote_port=%s", data->control_port); } else { offset += snprintf(buffer+offset, max-offset, " -e PCMK_remote_port=%d", DEFAULT_REMOTE_PORT); } for(GListPtr pIter = data->mounts; pIter != NULL; pIter = pIter->next) { container_mount_t *mount = pIter->data; if(mount->flags) { char *source = crm_strdup_printf( "%s/%s-%d", mount->source, data->prefix, tuple->offset); if(doffset > 0) { doffset += snprintf(dbuffer+doffset, dmax-doffset, ","); } doffset += snprintf(dbuffer+doffset, dmax-doffset, "%s", source); offset += snprintf(buffer+offset, max-offset, " -v %s:%s", source, mount->target); free(source); } else { offset += snprintf(buffer+offset, max-offset, " -v %s:%s", mount->source, mount->target); } if(mount->options) { offset += snprintf(buffer+offset, max-offset, ":%s", mount->options); } } for(GListPtr pIter = data->ports; pIter != NULL; pIter = pIter->next) { container_port_t *port = pIter->data; if(tuple->ipaddr) { offset += snprintf(buffer+offset, max-offset, " -p %s:%s:%s", tuple->ipaddr, port->source, port->target); } else { offset += snprintf(buffer+offset, max-offset, " -p %s:%s", port->source, port->target); } } if(data->docker_run_options) { offset += snprintf(buffer+offset, max-offset, " %s", data->docker_run_options); } if(data->docker_host_options) { offset += snprintf(buffer+offset, max-offset, " %s", data->docker_host_options); } create_nvp(xml_obj, "run_opts", buffer); free(buffer); create_nvp(xml_obj, "mount_points", dbuffer); free(dbuffer); if(tuple->child) { if(data->docker_run_command) { create_nvp(xml_obj, "run_cmd", data->docker_run_command); } else { create_nvp(xml_obj, "run_cmd", SBIN_DIR"/pacemaker_remoted"); } /* TODO: Allow users to specify their own? * * We just want to know if the container is alive, we'll - * monitor the child independantly + * monitor the child independently */ create_nvp(xml_obj, "monitor_cmd", "/bin/true"); /* } else if(child && data->untrusted) { * Support this use-case? * * The ability to have resources started/stopped by us, but * unable to set attributes, etc. * * Arguably better to control API access this with ACLs like * "normal" remote nodes * * create_nvp(xml_obj, "run_cmd", "/usr/libexec/pacemaker/lrmd"); * create_nvp(xml_obj, "monitor_cmd", "/usr/libexec/pacemaker/lrmd_internal_ctl -c poke"); */ } else { + if(data->docker_run_command) { + create_nvp(xml_obj, "run_cmd", data->docker_run_command); + } + /* TODO: Allow users to specify their own? * * We don't know what's in the container, so we just want * to know if it is alive */ create_nvp(xml_obj, "monitor_cmd", "/bin/true"); } xml_obj = create_xml_node(xml_docker, "operations"); create_op(xml_obj, ID(xml_docker), "monitor", "60s"); // TODO: Other ops? Timeouts and intervals from underlying resource? if (common_unpack(xml_docker, &tuple->docker, NULL, data_set) == FALSE) { return FALSE; } parent->children = g_list_append(parent->children, tuple->docker); tuple->docker->parent = parent; return TRUE; } static bool create_remote_resource( resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple, pe_working_set_t * data_set) { if (tuple->child && valid_network(data)) { node_t *node = NULL; xmlNode *xml_obj = NULL; xmlNode *xml_remote = NULL; char *nodeid = crm_strdup_printf("%s-%d", data->prefix, tuple->offset); char *id = NULL; if (remote_id_conflict(nodeid, data_set)) { // The biggest hammer we have id = crm_strdup_printf("pcmk-internal-%s-remote-%d", tuple->child->id, tuple->offset); CRM_ASSERT(remote_id_conflict(id, data_set) == FALSE); } else { id = strdup(nodeid); } xml_remote = create_resource(id, "pacemaker", "remote"); free(id); xml_obj = create_xml_node(xml_remote, "operations"); create_op(xml_obj, ID(xml_remote), "monitor", "60s"); xml_obj = create_xml_node(xml_remote, XML_TAG_ATTR_SETS); crm_xml_set_id(xml_obj, "%s-attributes-%d", data->prefix, tuple->offset); if(tuple->ipaddr) { create_nvp(xml_obj, "addr", tuple->ipaddr); } else { create_nvp(xml_obj, "addr", "localhost"); } if(data->control_port) { create_nvp(xml_obj, "port", data->control_port); } else { create_nvp(xml_obj, "port", crm_itoa(DEFAULT_REMOTE_PORT)); } xml_obj = create_xml_node(xml_remote, XML_TAG_META_SETS); crm_xml_set_id(xml_obj, "%s-meta-%d", data->prefix, tuple->offset); create_nvp(xml_obj, XML_OP_ATTR_ALLOW_MIGRATE, "false"); // Sets up node->details->remote_rsc->container == tuple->docker create_nvp(xml_obj, XML_RSC_ATTR_CONTAINER, tuple->docker->id); // TODO: Do this generically, eg with rsc->flags // create_nvp(xml_obj, XML_RSC_ATTR_INTERNAL_RSC, "true"); // Suppress printing // tuple->docker->fillers = g_list_append(tuple->docker->fillers, child); // -INFINITY prevents anyone else from running here node = pe_create_node(strdup(nodeid), nodeid, "remote", "-INFINITY", data_set); tuple->node = node_copy(node); tuple->node->weight = 500; nodeid = NULL; id = NULL; if (common_unpack(xml_remote, &tuple->remote, NULL, data_set) == FALSE) { return FALSE; } tuple->node->details->remote_rsc = tuple->remote; parent->children = g_list_append(parent->children, tuple->remote); } return TRUE; } static bool create_container( resource_t *parent, container_variant_data_t *data, container_grouping_t *tuple, pe_working_set_t * data_set) { if(create_docker_resource(parent, data, tuple, data_set) == FALSE) { return TRUE; } if(create_ip_resource(parent, data, tuple, data_set) == FALSE) { return TRUE; } if(create_remote_resource(parent, data, tuple, data_set) == FALSE) { return TRUE; } if(tuple->child && tuple->ipaddr) { add_hash_param(tuple->child->meta, "external-ip", tuple->ipaddr); } return FALSE; } static void mount_free(container_mount_t *mount) { free(mount->source); free(mount->target); free(mount->options); free(mount); } static void port_free(container_port_t *port) { free(port->source); free(port->target); free(port); } gboolean container_unpack(resource_t * rsc, pe_working_set_t * data_set) { const char *value = NULL; xmlNode *xml_obj = NULL; xmlNode *xml_resource = NULL; container_variant_data_t *container_data = NULL; + CRM_ASSERT(rsc != NULL); pe_rsc_trace(rsc, "Processing resource %s...", rsc->id); container_data = calloc(1, sizeof(container_variant_data_t)); rsc->variant_opaque = container_data; container_data->prefix = strdup(rsc->id); xml_obj = first_named_child(rsc->xml, "docker"); if(xml_obj == NULL) { return FALSE; } value = crm_element_value(xml_obj, "masters"); container_data->masters = crm_parse_int(value, "0"); if (container_data->masters < 0) { pe_err("'masters' for %s must be nonnegative integer, using 0", rsc->id); container_data->masters = 0; } value = crm_element_value(xml_obj, "replicas"); if ((value == NULL) && (container_data->masters > 0)) { container_data->replicas = container_data->masters; } else { container_data->replicas = crm_parse_int(value, "1"); } if (container_data->replicas < 1) { pe_err("'replicas' for %s must be positive integer, using 1", rsc->id); container_data->replicas = 1; } /* * Communication between containers on the same host via the * floating IPs only works if docker is started with: * --userland-proxy=false --ip-masq=false */ value = crm_element_value(xml_obj, "replicas-per-host"); container_data->replicas_per_host = crm_parse_int(value, "1"); if (container_data->replicas_per_host < 1) { pe_err("'replicas-per-host' for %s must be positive integer, using 1", rsc->id); container_data->replicas_per_host = 1; } if (container_data->replicas_per_host == 1) { clear_bit(rsc->flags, pe_rsc_unique); } container_data->docker_run_command = crm_element_value_copy(xml_obj, "run-command"); container_data->docker_run_options = crm_element_value_copy(xml_obj, "options"); container_data->image = crm_element_value_copy(xml_obj, "image"); container_data->docker_network = crm_element_value_copy(xml_obj, "network"); xml_obj = first_named_child(rsc->xml, "network"); if(xml_obj) { container_data->ip_range_start = crm_element_value_copy(xml_obj, "ip-range-start"); container_data->host_netmask = crm_element_value_copy(xml_obj, "host-netmask"); container_data->host_network = crm_element_value_copy(xml_obj, "host-interface"); container_data->control_port = crm_element_value_copy(xml_obj, "control-port"); for (xmlNode *xml_child = __xml_first_child_element(xml_obj); xml_child != NULL; xml_child = __xml_next_element(xml_child)) { container_port_t *port = calloc(1, sizeof(container_port_t)); port->source = crm_element_value_copy(xml_child, "port"); if(port->source == NULL) { port->source = crm_element_value_copy(xml_child, "range"); } else { port->target = crm_element_value_copy(xml_child, "internal-port"); } if(port->source != NULL && strlen(port->source) > 0) { if(port->target == NULL) { port->target = strdup(port->source); } container_data->ports = g_list_append(container_data->ports, port); } else { pe_err("Invalid port directive %s", ID(xml_child)); port_free(port); } } } xml_obj = first_named_child(rsc->xml, "storage"); for (xmlNode *xml_child = __xml_first_child_element(xml_obj); xml_child != NULL; xml_child = __xml_next_element(xml_child)) { container_mount_t *mount = calloc(1, sizeof(container_mount_t)); mount->source = crm_element_value_copy(xml_child, "source-dir"); if(mount->source == NULL) { mount->source = crm_element_value_copy(xml_child, "source-dir-root"); mount->flags = 1; } mount->target = crm_element_value_copy(xml_child, "target-dir"); mount->options = crm_element_value_copy(xml_child, "options"); if(mount->source && mount->target) { container_data->mounts = g_list_append(container_data->mounts, mount); } else { pe_err("Invalid mount directive %s", ID(xml_child)); mount_free(mount); } } xml_obj = first_named_child(rsc->xml, "primitive"); if (xml_obj && valid_network(container_data)) { char *value = NULL; xmlNode *xml_set = NULL; if(container_data->masters > 0) { xml_resource = create_xml_node(NULL, XML_CIB_TAG_MASTER); } else { xml_resource = create_xml_node(NULL, XML_CIB_TAG_INCARNATION); } crm_xml_set_id(xml_resource, "%s-%s", container_data->prefix, xml_resource->name); xml_set = create_xml_node(xml_resource, XML_TAG_META_SETS); crm_xml_set_id(xml_set, "%s-%s-meta", container_data->prefix, xml_resource->name); create_nvp(xml_set, XML_RSC_ATTR_ORDERED, "true"); value = crm_itoa(container_data->replicas); create_nvp(xml_set, XML_RSC_ATTR_INCARNATION_MAX, value); free(value); value = crm_itoa(container_data->replicas_per_host); create_nvp(xml_set, XML_RSC_ATTR_INCARNATION_NODEMAX, value); free(value); if(container_data->replicas_per_host > 1) { create_nvp(xml_set, XML_RSC_ATTR_UNIQUE, "true"); } else { create_nvp(xml_set, XML_RSC_ATTR_UNIQUE, "false"); } if(container_data->masters) { value = crm_itoa(container_data->masters); create_nvp(xml_set, XML_RSC_ATTR_MASTER_MAX, value); free(value); } //crm_xml_add(xml_obj, XML_ATTR_ID, container_data->prefix); add_node_copy(xml_resource, xml_obj); } else if(xml_obj) { pe_err("Cannot control %s inside %s without either ip-range-start or control-port", rsc->id, ID(xml_obj)); return FALSE; } if(xml_resource) { int lpc = 0; GListPtr childIter = NULL; resource_t *new_rsc = NULL; container_mount_t *mount = NULL; - container_port_t *port = calloc(1, sizeof(container_port_t)); + container_port_t *port = NULL; int offset = 0, max = 1024; - char *buffer = calloc(1, max+1); + char *buffer = NULL; + + if (common_unpack(xml_resource, &new_rsc, rsc, data_set) == FALSE) { + pe_err("Failed unpacking resource %s", ID(rsc->xml)); + if (new_rsc != NULL && new_rsc->fns != NULL) { + new_rsc->fns->free(new_rsc); + } + return FALSE; + } + + container_data->child = new_rsc; + container_data->child->orig_xml = xml_obj; // Also the trigger for common_free() + // to free xml_resource as container_data->child->xml mount = calloc(1, sizeof(container_mount_t)); mount->source = strdup(DEFAULT_REMOTE_KEY_LOCATION); mount->target = strdup(DEFAULT_REMOTE_KEY_LOCATION); mount->options = NULL; mount->flags = 0; container_data->mounts = g_list_append(container_data->mounts, mount); mount = calloc(1, sizeof(container_mount_t)); mount->source = strdup(CRM_LOG_DIR "/bundles"); mount->target = strdup("/var/log"); mount->options = NULL; mount->flags = 1; container_data->mounts = g_list_append(container_data->mounts, mount); + port = calloc(1, sizeof(container_port_t)); if(container_data->control_port) { port->source = strdup(container_data->control_port); } else { port->source = crm_itoa(DEFAULT_REMOTE_PORT); } - port->target = strdup(port->source); container_data->ports = g_list_append(container_data->ports, port); - if (common_unpack(xml_resource, &new_rsc, rsc, data_set) == FALSE) { - pe_err("Failed unpacking resource %s", crm_element_value(rsc->xml, XML_ATTR_ID)); - if (new_rsc != NULL && new_rsc->fns != NULL) { - new_rsc->fns->free(new_rsc); - } - return FALSE; - } - - container_data->child = new_rsc; - container_data->child->orig_xml = xml_obj; // Also the trigger for common_free() - // to free xml_resource as container_data->child->xml - + buffer = calloc(1, max+1); for(childIter = container_data->child->children; childIter != NULL; childIter = childIter->next) { container_grouping_t *tuple = calloc(1, sizeof(container_grouping_t)); tuple->child = childIter->data; tuple->offset = lpc++; offset += allocate_ip(container_data, tuple, buffer+offset, max-offset); container_data->tuples = g_list_append(container_data->tuples, tuple); } container_data->docker_host_options = buffer; } else { // Just a naked container, no pacemaker-remote int offset = 0, max = 1024; char *buffer = calloc(1, max+1); for(int lpc = 0; lpc < container_data->replicas; lpc++) { container_grouping_t *tuple = calloc(1, sizeof(container_grouping_t)); tuple->offset = lpc; offset += allocate_ip(container_data, tuple, buffer+offset, max-offset); container_data->tuples = g_list_append(container_data->tuples, tuple); } container_data->docker_host_options = buffer; } for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; // TODO: Remove from list if create_container() returns TRUE create_container(rsc, container_data, tuple, data_set); } if(container_data->child) { rsc->children = g_list_append(rsc->children, container_data->child); } return TRUE; } +static int +tuple_rsc_active(resource_t *rsc, gboolean all) +{ + if (rsc) { + gboolean child_active = rsc->fns->active(rsc, all); + + if (child_active && !all) { + return TRUE; + } else if (!child_active && all) { + return FALSE; + } + } + return -1; +} + gboolean container_active(resource_t * rsc, gboolean all) { - return TRUE; + container_variant_data_t *container_data = NULL; + GListPtr iter = NULL; + + get_container_variant_data(container_data, rsc); + for (iter = container_data->tuples; iter != NULL; iter = iter->next) { + container_grouping_t *tuple = (container_grouping_t *)(iter->data); + int rsc_active; + + rsc_active = tuple_rsc_active(tuple->ip, all); + if (rsc_active >= 0) { + return (gboolean) rsc_active; + } + + rsc_active = tuple_rsc_active(tuple->child, all); + if (rsc_active >= 0) { + return (gboolean) rsc_active; + } + + rsc_active = tuple_rsc_active(tuple->docker, all); + if (rsc_active >= 0) { + return (gboolean) rsc_active; + } + + rsc_active = tuple_rsc_active(tuple->remote, all); + if (rsc_active >= 0) { + return (gboolean) rsc_active; + } + } + + /* If "all" is TRUE, we've already checked that no resources were inactive, + * so return TRUE; if "all" is FALSE, we didn't find any active resources, + * so return FALSE. + */ + return all; } resource_t * find_container_child(const char *stem, resource_t * rsc, node_t *node) { container_variant_data_t *container_data = NULL; resource_t *parent = uber_parent(rsc); CRM_ASSERT(parent->parent); parent = parent->parent; get_container_variant_data(container_data, parent); if (is_not_set(rsc->flags, pe_rsc_unique)) { for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->node->details == node->details) { rsc = tuple->child; break; } } } if (rsc && safe_str_neq(stem, rsc->id)) { free(rsc->clone_name); rsc->clone_name = strdup(stem); } return rsc; } +static void +print_rsc_in_list(resource_t *rsc, const char *pre_text, long options, + void *print_data) +{ + if (rsc != NULL) { + if (options & pe_print_html) { + status_print("
  • "); + } + rsc->fns->print(rsc, pre_text, options, print_data); + if (options & pe_print_html) { + status_print("
  • \n"); + } + } +} + static void container_print_xml(resource_t * rsc, const char *pre_text, long options, void *print_data) { container_variant_data_t *container_data = NULL; char *child_text = NULL; CRM_CHECK(rsc != NULL, return); if (pre_text == NULL) { pre_text = ""; } - child_text = crm_concat(pre_text, " ", ' '); + child_text = crm_concat(pre_text, " ", ' '); - status_print("%sid); + status_print("type=\"docker\" "); + status_print("image=\"%s\" ", container_data->image); + status_print("unique=\"%s\" ", is_set(rsc->flags, pe_rsc_unique)? "true" : "false"); status_print("managed=\"%s\" ", is_set(rsc->flags, pe_rsc_managed) ? "true" : "false"); status_print("failed=\"%s\" ", is_set(rsc->flags, pe_rsc_failed) ? "true" : "false"); status_print(">\n"); - get_container_variant_data(container_data, rsc); - - status_print("%sDocker container: %s [%s]%s%s", - pre_text, rsc->id, container_data->image, - is_set(rsc->flags, pe_rsc_unique) ? " (unique)" : "", - is_set(rsc->flags, pe_rsc_managed) ? "" : " (unmanaged)"); - for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); - if(tuple->ip) { - tuple->ip->fns->print(tuple->ip, child_text, options, print_data); - } - if(tuple->child) { - tuple->child->fns->print(tuple->child, child_text, options, print_data); - } - if(tuple->docker) { - tuple->docker->fns->print(tuple->docker, child_text, options, print_data); - } - if(tuple->remote) { - tuple->remote->fns->print(tuple->remote, child_text, options, print_data); - } - } - status_print("%s\n", pre_text); + status_print("%s \n", pre_text, tuple->offset); + print_rsc_in_list(tuple->ip, child_text, options, print_data); + print_rsc_in_list(tuple->child, child_text, options, print_data); + print_rsc_in_list(tuple->docker, child_text, options, print_data); + print_rsc_in_list(tuple->remote, child_text, options, print_data); + status_print("%s \n", pre_text); + } + status_print("%s\n", pre_text); free(child_text); } static void tuple_print(container_grouping_t * tuple, const char *pre_text, long options, void *print_data) { node_t *node = NULL; resource_t *rsc = tuple->child; int offset = 0; char buffer[LINE_MAX]; if(rsc == NULL) { rsc = tuple->docker; } if(tuple->remote) { offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", rsc_printable_id(tuple->remote)); } else { offset += snprintf(buffer + offset, LINE_MAX - offset, "%s", rsc_printable_id(tuple->docker)); } if(tuple->ipaddr) { offset += snprintf(buffer + offset, LINE_MAX - offset, " (%s)", tuple->ipaddr); } if(tuple->remote && tuple->remote->running_on != NULL) { node = tuple->remote->running_on->data; } else if (tuple->remote == NULL && rsc->running_on != NULL) { node = rsc->running_on->data; } common_print(rsc, pre_text, buffer, node, options, print_data); } void container_print(resource_t * rsc, const char *pre_text, long options, void *print_data) { container_variant_data_t *container_data = NULL; char *child_text = NULL; CRM_CHECK(rsc != NULL, return); if (options & pe_print_xml) { container_print_xml(rsc, pre_text, options, print_data); return; } get_container_variant_data(container_data, rsc); if (pre_text == NULL) { pre_text = " "; } - child_text = crm_strdup_printf(" %s", pre_text); status_print("%sDocker container%s: %s [%s]%s%s\n", pre_text, container_data->replicas>1?" set":"", rsc->id, container_data->image, is_set(rsc->flags, pe_rsc_unique) ? " (unique)" : "", is_set(rsc->flags, pe_rsc_managed) ? "" : " (unmanaged)"); + if (options & pe_print_html) { + status_print("
    \n
      \n"); + } + for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); + if (options & pe_print_html) { + status_print("
    • "); + } + if(is_set(options, pe_print_clone_details)) { + child_text = crm_strdup_printf(" %s", pre_text); if(g_list_length(container_data->tuples) > 1) { status_print(" %sReplica[%d]\n", pre_text, tuple->offset); } - - if(tuple->ip) { - tuple->ip->fns->print(tuple->ip, child_text, options, print_data); - } - if(tuple->docker) { - tuple->docker->fns->print(tuple->docker, child_text, options, print_data); - } - if(tuple->remote) { - tuple->remote->fns->print(tuple->remote, child_text, options, print_data); + if (options & pe_print_html) { + status_print("
      \n
        \n"); } - if(tuple->child) { - tuple->child->fns->print(tuple->child, child_text, options, print_data); + print_rsc_in_list(tuple->ip, child_text, options, print_data); + print_rsc_in_list(tuple->docker, child_text, options, print_data); + print_rsc_in_list(tuple->remote, child_text, options, print_data); + print_rsc_in_list(tuple->child, child_text, options, print_data); + if (options & pe_print_html) { + status_print("
      \n"); } } else { - char *child_text = crm_strdup_printf("%s ", pre_text); + child_text = crm_strdup_printf("%s ", pre_text); tuple_print(tuple, child_text, options, print_data); } + free(child_text); + + if (options & pe_print_html) { + status_print("
    • \n"); + } + } + if (options & pe_print_html) { + status_print("
    \n"); } } void tuple_free(container_grouping_t *tuple) { if(tuple == NULL) { return; } // TODO: Free tuple->node ? if(tuple->ip) { tuple->ip->fns->free(tuple->ip); tuple->ip = NULL; } if(tuple->child) { tuple->child->fns->free(tuple->child); tuple->child = NULL; } if(tuple->docker) { tuple->docker->fns->free(tuple->docker); tuple->docker = NULL; } if(tuple->remote) { tuple->remote->fns->free(tuple->remote); tuple->remote = NULL; } free(tuple->ipaddr); free(tuple); } void container_free(resource_t * rsc) { container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return); get_container_variant_data(container_data, rsc); pe_rsc_trace(rsc, "Freeing %s", rsc->id); free(container_data->prefix); free(container_data->image); free(container_data->control_port); free(container_data->host_network); free(container_data->host_netmask); free(container_data->ip_range_start); free(container_data->docker_network); free(container_data->docker_run_options); free(container_data->docker_run_command); free(container_data->docker_host_options); g_list_free_full(container_data->tuples, (GDestroyNotify)tuple_free); g_list_free_full(container_data->mounts, (GDestroyNotify)mount_free); g_list_free_full(container_data->ports, (GDestroyNotify)port_free); common_free(rsc); } enum rsc_role_e container_resource_state(const resource_t * rsc, gboolean current) { enum rsc_role_e container_role = RSC_ROLE_UNKNOWN; return container_role; } diff --git a/lib/pengine/unpack.c b/lib/pengine/unpack.c index ef51cd5780..6aa51ddd65 100644 --- a/lib/pengine/unpack.c +++ b/lib/pengine/unpack.c @@ -1,3485 +1,3480 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #include #include #include #include CRM_TRACE_INIT_DATA(pe_status); #define set_config_flag(data_set, option, flag) do { \ const char *tmp = pe_pref(data_set->config_hash, option); \ if(tmp) { \ if(crm_is_true(tmp)) { \ set_bit(data_set->flags, flag); \ } else { \ clear_bit(data_set->flags, flag); \ } \ } \ } while(0) gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last_failure, enum action_fail_response *failed, pe_working_set_t * data_set); static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node); static gboolean is_dangling_container_remote_node(node_t *node) { /* we are looking for a remote-node that was supposed to be mapped to a * container resource, but all traces of that container have disappeared * from both the config and the status section. */ if (is_remote_node(node) && node->details->remote_rsc && node->details->remote_rsc->container == NULL && is_set(node->details->remote_rsc->flags, pe_rsc_orphan_container_filler)) { return TRUE; } return FALSE; } void pe_fence_node(pe_working_set_t * data_set, node_t * node, const char *reason) { CRM_CHECK(node, return); /* A guest node is fenced by marking its container as failed */ if (is_container_remote_node(node)) { resource_t *rsc = node->details->remote_rsc->container; if (is_set(rsc->flags, pe_rsc_failed) == FALSE) { crm_warn("Guest node %s will be fenced (by recovering %s) %s", node->details->uname, rsc->id, reason); /* We don't mark the node as unclean, because that would prevent the * node from running resources. We want to allow it to run resources * in this transition if the recovery succeeds. */ node->details->remote_requires_reset = TRUE; set_bit(rsc->flags, pe_rsc_failed); } } else if (is_dangling_container_remote_node(node)) { crm_info("Cleaning up dangling connection resource for guest node %s %s" " (fencing is already done, guest resource no longer exists)", node->details->uname, reason); set_bit(node->details->remote_rsc->flags, pe_rsc_failed); } else if (is_baremetal_remote_node(node)) { resource_t *rsc = node->details->remote_rsc; if (rsc && (!is_set(rsc->flags, pe_rsc_managed))) { crm_notice("Not fencing node %s because connection is unmanaged, " "otherwise would %s", node->details->uname, reason); } else { if (pe_can_fence(data_set, node)) { crm_warn("Node %s will be fenced %s", node->details->uname, reason); } else { crm_warn("Node %s is unclean %s", node->details->uname, reason); } node->details->remote_requires_reset = TRUE; } node->details->unclean = TRUE; } else if (node->details->unclean == FALSE) { if (pe_can_fence(data_set, node)) { crm_warn("Node %s will be fenced %s", node->details->uname, reason); } else { crm_warn("Node %s is unclean %s", node->details->uname, reason); } node->details->unclean = TRUE; } else { crm_trace("Huh? %s %s", node->details->uname, reason); } } gboolean unpack_config(xmlNode * config, pe_working_set_t * data_set) { const char *value = NULL; GHashTable *config_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); xmlXPathObjectPtr xpathObj = NULL; if(is_not_set(data_set->flags, pe_flag_enable_unfencing)) { xpathObj = xpath_search(data_set->input, "//nvpair[@name='provides' and @value='unfencing']"); if(xpathObj && numXpathResults(xpathObj) > 0) { set_bit(data_set->flags, pe_flag_enable_unfencing); } freeXpathObject(xpathObj); } if(is_not_set(data_set->flags, pe_flag_enable_unfencing)) { xpathObj = xpath_search(data_set->input, "//nvpair[@name='requires' and @value='unfencing']"); if(xpathObj && numXpathResults(xpathObj) > 0) { set_bit(data_set->flags, pe_flag_enable_unfencing); } freeXpathObject(xpathObj); } #ifdef REDHAT_COMPAT_6 if(is_not_set(data_set->flags, pe_flag_enable_unfencing)) { xpathObj = xpath_search(data_set->input, "//primitive[@type='fence_scsi']"); if(xpathObj && numXpathResults(xpathObj) > 0) { set_bit(data_set->flags, pe_flag_enable_unfencing); } freeXpathObject(xpathObj); } #endif data_set->config_hash = config_hash; unpack_instance_attributes(data_set->input, config, XML_CIB_TAG_PROPSET, NULL, config_hash, CIB_OPTIONS_FIRST, FALSE, data_set->now); verify_pe_options(data_set->config_hash); set_config_flag(data_set, "enable-startup-probes", pe_flag_startup_probes); if(is_not_set(data_set->flags, pe_flag_startup_probes)) { crm_info("Startup probes: disabled (dangerous)"); } value = pe_pref(data_set->config_hash, XML_ATTR_HAVE_WATCHDOG); if (value && crm_is_true(value)) { crm_notice("Watchdog will be used via SBD if fencing is required"); set_bit(data_set->flags, pe_flag_have_stonith_resource); } value = pe_pref(data_set->config_hash, "stonith-timeout"); data_set->stonith_timeout = crm_get_msec(value); crm_debug("STONITH timeout: %d", data_set->stonith_timeout); set_config_flag(data_set, "stonith-enabled", pe_flag_stonith_enabled); crm_debug("STONITH of failed nodes is %s", is_set(data_set->flags, pe_flag_stonith_enabled) ? "enabled" : "disabled"); data_set->stonith_action = pe_pref(data_set->config_hash, "stonith-action"); crm_trace("STONITH will %s nodes", data_set->stonith_action); set_config_flag(data_set, "concurrent-fencing", pe_flag_concurrent_fencing); crm_debug("Concurrent fencing is %s", is_set(data_set->flags, pe_flag_concurrent_fencing) ? "enabled" : "disabled"); set_config_flag(data_set, "stop-all-resources", pe_flag_stop_everything); crm_debug("Stop all active resources: %s", is_set(data_set->flags, pe_flag_stop_everything) ? "true" : "false"); set_config_flag(data_set, "symmetric-cluster", pe_flag_symmetric_cluster); if (is_set(data_set->flags, pe_flag_symmetric_cluster)) { crm_debug("Cluster is symmetric" " - resources can run anywhere by default"); } value = pe_pref(data_set->config_hash, "default-resource-stickiness"); data_set->default_resource_stickiness = char2score(value); crm_debug("Default stickiness: %d", data_set->default_resource_stickiness); value = pe_pref(data_set->config_hash, "no-quorum-policy"); if (safe_str_eq(value, "ignore")) { data_set->no_quorum_policy = no_quorum_ignore; } else if (safe_str_eq(value, "freeze")) { data_set->no_quorum_policy = no_quorum_freeze; } else if (safe_str_eq(value, "suicide")) { gboolean do_panic = FALSE; crm_element_value_int(data_set->input, XML_ATTR_QUORUM_PANIC, &do_panic); if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) { crm_config_err ("Setting no-quorum-policy=suicide makes no sense if stonith-enabled=false"); } if (do_panic && is_set(data_set->flags, pe_flag_stonith_enabled)) { data_set->no_quorum_policy = no_quorum_suicide; } else if (is_set(data_set->flags, pe_flag_have_quorum) == FALSE && do_panic == FALSE) { crm_notice("Resetting no-quorum-policy to 'stop': The cluster has never had quorum"); data_set->no_quorum_policy = no_quorum_stop; } } else { data_set->no_quorum_policy = no_quorum_stop; } switch (data_set->no_quorum_policy) { case no_quorum_freeze: crm_debug("On loss of CCM Quorum: Freeze resources"); break; case no_quorum_stop: crm_debug("On loss of CCM Quorum: Stop ALL resources"); break; case no_quorum_suicide: crm_notice("On loss of CCM Quorum: Fence all remaining nodes"); break; case no_quorum_ignore: crm_notice("On loss of CCM Quorum: Ignore"); break; } set_config_flag(data_set, "stop-orphan-resources", pe_flag_stop_rsc_orphans); crm_trace("Orphan resources are %s", is_set(data_set->flags, pe_flag_stop_rsc_orphans) ? "stopped" : "ignored"); set_config_flag(data_set, "stop-orphan-actions", pe_flag_stop_action_orphans); crm_trace("Orphan resource actions are %s", is_set(data_set->flags, pe_flag_stop_action_orphans) ? "stopped" : "ignored"); set_config_flag(data_set, "remove-after-stop", pe_flag_remove_after_stop); crm_trace("Stopped resources are removed from the status section: %s", is_set(data_set->flags, pe_flag_remove_after_stop) ? "true" : "false"); set_config_flag(data_set, "maintenance-mode", pe_flag_maintenance_mode); crm_trace("Maintenance mode: %s", is_set(data_set->flags, pe_flag_maintenance_mode) ? "true" : "false"); if (is_set(data_set->flags, pe_flag_maintenance_mode)) { clear_bit(data_set->flags, pe_flag_is_managed_default); } else { set_config_flag(data_set, "is-managed-default", pe_flag_is_managed_default); } crm_trace("By default resources are %smanaged", is_set(data_set->flags, pe_flag_is_managed_default) ? "" : "not "); set_config_flag(data_set, "start-failure-is-fatal", pe_flag_start_failure_fatal); crm_trace("Start failures are %s", is_set(data_set->flags, pe_flag_start_failure_fatal) ? "always fatal" : "handled by failcount"); node_score_red = char2score(pe_pref(data_set->config_hash, "node-health-red")); node_score_green = char2score(pe_pref(data_set->config_hash, "node-health-green")); node_score_yellow = char2score(pe_pref(data_set->config_hash, "node-health-yellow")); crm_debug("Node scores: 'red' = %s, 'yellow' = %s, 'green' = %s", pe_pref(data_set->config_hash, "node-health-red"), pe_pref(data_set->config_hash, "node-health-yellow"), pe_pref(data_set->config_hash, "node-health-green")); data_set->placement_strategy = pe_pref(data_set->config_hash, "placement-strategy"); crm_trace("Placement strategy: %s", data_set->placement_strategy); return TRUE; } static void destroy_digest_cache(gpointer ptr) { op_digest_cache_t *data = ptr; free_xml(data->params_all); free_xml(data->params_secure); free_xml(data->params_restart); free(data->digest_all_calc); free(data->digest_restart_calc); free(data->digest_secure_calc); free(data); } node_t * pe_create_node(const char *id, const char *uname, const char *type, const char *score, pe_working_set_t * data_set) { node_t *new_node = NULL; if (pe_find_node(data_set->nodes, uname) != NULL) { crm_config_warn("Detected multiple node entries with uname=%s" " - this is rarely intended", uname); } new_node = calloc(1, sizeof(node_t)); if (new_node == NULL) { return NULL; } new_node->weight = char2score(score); new_node->fixed = FALSE; new_node->details = calloc(1, sizeof(struct node_shared_s)); if (new_node->details == NULL) { free(new_node); return NULL; } crm_trace("Creating node for entry %s/%s", uname, id); new_node->details->id = id; new_node->details->uname = uname; new_node->details->online = FALSE; new_node->details->shutdown = FALSE; new_node->details->rsc_discovery_enabled = TRUE; new_node->details->running_rsc = NULL; new_node->details->type = node_ping; if (safe_str_eq(type, "remote")) { new_node->details->type = node_remote; set_bit(data_set->flags, pe_flag_have_remote_nodes); } else if (type == NULL || safe_str_eq(type, "member") || safe_str_eq(type, NORMALNODE)) { new_node->details->type = node_member; } new_node->details->attrs = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); if (is_remote_node(new_node)) { g_hash_table_insert(new_node->details->attrs, strdup("#kind"), strdup("remote")); } else { g_hash_table_insert(new_node->details->attrs, strdup("#kind"), strdup("cluster")); } new_node->details->utilization = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); new_node->details->digest_cache = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, destroy_digest_cache); data_set->nodes = g_list_insert_sorted(data_set->nodes, new_node, sort_node_uname); return new_node; } bool remote_id_conflict(const char *remote_name, pe_working_set_t *data) { bool match = FALSE; #if 1 pe_find_resource(data->resources, remote_name); #else if (data->name_check == NULL) { data->name_check = g_hash_table_new(crm_str_hash, g_str_equal); for (xml_rsc = __xml_first_child(parent); xml_rsc != NULL; xml_rsc = __xml_next_element(xml_rsc)) { const char *id = ID(xml_rsc); /* avoiding heap allocation here because we know the duration of this hashtable allows us to */ g_hash_table_insert(data->name_check, (char *) id, (char *) id); } } if (g_hash_table_lookup(data->name_check, remote_name)) { match = TRUE; } #endif if (match) { crm_err("Invalid remote-node name, a resource called '%s' already exists.", remote_name); return NULL; } return match; } static const char * expand_remote_rsc_meta(xmlNode *xml_obj, xmlNode *parent, pe_working_set_t *data) { xmlNode *xml_rsc = NULL; xmlNode *xml_tmp = NULL; xmlNode *attr_set = NULL; xmlNode *attr = NULL; const char *container_id = ID(xml_obj); const char *remote_name = NULL; const char *remote_server = NULL; const char *remote_port = NULL; const char *connect_timeout = "60s"; const char *remote_allow_migrate=NULL; for (attr_set = __xml_first_child(xml_obj); attr_set != NULL; attr_set = __xml_next_element(attr_set)) { if (safe_str_neq((const char *)attr_set->name, XML_TAG_META_SETS)) { continue; } for (attr = __xml_first_child(attr_set); attr != NULL; attr = __xml_next_element(attr)) { const char *value = crm_element_value(attr, XML_NVPAIR_ATTR_VALUE); const char *name = crm_element_value(attr, XML_NVPAIR_ATTR_NAME); if (safe_str_eq(name, XML_RSC_ATTR_REMOTE_NODE)) { remote_name = value; } else if (safe_str_eq(name, "remote-addr")) { remote_server = value; } else if (safe_str_eq(name, "remote-port")) { remote_port = value; } else if (safe_str_eq(name, "remote-connect-timeout")) { connect_timeout = value; } else if (safe_str_eq(name, "remote-allow-migrate")) { remote_allow_migrate=value; } } } if (remote_name == NULL) { return NULL; } if (remote_id_conflict(remote_name, data)) { return NULL; } xml_rsc = create_xml_node(parent, XML_CIB_TAG_RESOURCE); crm_xml_add(xml_rsc, XML_ATTR_ID, remote_name); crm_xml_add(xml_rsc, XML_AGENT_ATTR_CLASS, PCMK_RESOURCE_CLASS_OCF); crm_xml_add(xml_rsc, XML_AGENT_ATTR_PROVIDER, "pacemaker"); crm_xml_add(xml_rsc, XML_ATTR_TYPE, "remote"); xml_tmp = create_xml_node(xml_rsc, XML_TAG_META_SETS); crm_xml_set_id(xml_tmp, "%s_%s", remote_name, XML_TAG_META_SETS); attr = create_xml_node(xml_tmp, XML_CIB_TAG_NVPAIR); crm_xml_set_id(attr, "%s_%s", remote_name, "meta-attributes-container"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, XML_RSC_ATTR_CONTAINER); crm_xml_add(attr, XML_NVPAIR_ATTR_VALUE, container_id); attr = create_xml_node(xml_tmp, XML_CIB_TAG_NVPAIR); crm_xml_set_id(attr, "%s_%s", remote_name, "meta-attributes-internal"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, XML_RSC_ATTR_INTERNAL_RSC); crm_xml_add(attr, XML_NVPAIR_ATTR_VALUE, "true"); if (remote_allow_migrate) { attr = create_xml_node(xml_tmp, XML_CIB_TAG_NVPAIR); crm_xml_set_id(attr, "%s_%s", remote_name, "meta-attributes-container"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, XML_OP_ATTR_ALLOW_MIGRATE); crm_xml_add(attr, XML_NVPAIR_ATTR_VALUE, remote_allow_migrate); } xml_tmp = create_xml_node(xml_rsc, "operations"); attr = create_xml_node(xml_tmp, XML_ATTR_OP); crm_xml_set_id(attr, "%s_%s", remote_name, "monitor-interval-30s"); crm_xml_add(attr, XML_ATTR_TIMEOUT, "30s"); crm_xml_add(attr, XML_LRM_ATTR_INTERVAL, "30s"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, "monitor"); if (connect_timeout) { attr = create_xml_node(xml_tmp, XML_ATTR_OP); crm_xml_set_id(attr, "%s_%s", remote_name, "start-interval-0"); crm_xml_add(attr, XML_ATTR_TIMEOUT, connect_timeout); crm_xml_add(attr, XML_LRM_ATTR_INTERVAL, "0"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, "start"); } if (remote_port || remote_server) { xml_tmp = create_xml_node(xml_rsc, XML_TAG_ATTR_SETS); crm_xml_set_id(xml_tmp, "%s_%s", remote_name, XML_TAG_ATTR_SETS); if (remote_server) { attr = create_xml_node(xml_tmp, XML_CIB_TAG_NVPAIR); crm_xml_set_id(attr, "%s_%s", remote_name, "instance-attributes-addr"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, "addr"); crm_xml_add(attr, XML_NVPAIR_ATTR_VALUE, remote_server); } if (remote_port) { attr = create_xml_node(xml_tmp, XML_CIB_TAG_NVPAIR); crm_xml_set_id(attr, "%s_%s", remote_name, "instance-attributes-port"); crm_xml_add(attr, XML_NVPAIR_ATTR_NAME, "port"); crm_xml_add(attr, XML_NVPAIR_ATTR_VALUE, remote_port); } } return remote_name; } static void handle_startup_fencing(pe_working_set_t *data_set, node_t *new_node) { static const char *blind_faith = NULL; static gboolean unseen_are_unclean = TRUE; static gboolean need_warning = TRUE; if ((new_node->details->type == node_remote) && (new_node->details->remote_rsc == NULL)) { /* Ignore fencing for remote nodes that don't have a connection resource * associated with them. This happens when remote node entries get left * in the nodes section after the connection resource is removed. */ return; } blind_faith = pe_pref(data_set->config_hash, "startup-fencing"); if (crm_is_true(blind_faith) == FALSE) { unseen_are_unclean = FALSE; if (need_warning) { crm_warn("Blind faith: not fencing unseen nodes"); /* Warn once per run, not per node and transition */ need_warning = FALSE; } } if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE || unseen_are_unclean == FALSE) { /* blind faith... */ new_node->details->unclean = FALSE; } else { /* all nodes are unclean until we've seen their * status entry */ new_node->details->unclean = TRUE; } /* We need to be able to determine if a node's status section * exists or not separate from whether the node is unclean. */ new_node->details->unseen = TRUE; } gboolean unpack_nodes(xmlNode * xml_nodes, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; node_t *new_node = NULL; const char *id = NULL; const char *uname = NULL; const char *type = NULL; const char *score = NULL; for (xml_obj = __xml_first_child(xml_nodes); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_NODE, TRUE)) { new_node = NULL; id = crm_element_value(xml_obj, XML_ATTR_ID); uname = crm_element_value(xml_obj, XML_ATTR_UNAME); type = crm_element_value(xml_obj, XML_ATTR_TYPE); score = crm_element_value(xml_obj, XML_RULE_ATTR_SCORE); crm_trace("Processing node %s/%s", uname, id); if (id == NULL) { crm_config_err("Must specify id tag in "); continue; } new_node = pe_create_node(id, uname, type, score, data_set); if (new_node == NULL) { return FALSE; } /* if(data_set->have_quorum == FALSE */ /* && data_set->no_quorum_policy == no_quorum_stop) { */ /* /\* start shutting resources down *\/ */ /* new_node->weight = -INFINITY; */ /* } */ handle_startup_fencing(data_set, new_node); add_node_attrs(xml_obj, new_node, FALSE, data_set); unpack_instance_attributes(data_set->input, xml_obj, XML_TAG_UTILIZATION, NULL, new_node->details->utilization, NULL, FALSE, data_set->now); crm_trace("Done with node %s", crm_element_value(xml_obj, XML_ATTR_UNAME)); } } if (data_set->localhost && pe_find_node(data_set->nodes, data_set->localhost) == NULL) { crm_info("Creating a fake local node"); pe_create_node(data_set->localhost, data_set->localhost, NULL, 0, data_set); } return TRUE; } static void setup_container(resource_t * rsc, pe_working_set_t * data_set) { const char *container_id = NULL; if (rsc->children) { GListPtr gIter = rsc->children; for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; setup_container(child_rsc, data_set); } return; } container_id = g_hash_table_lookup(rsc->meta, XML_RSC_ATTR_CONTAINER); if (container_id && safe_str_neq(container_id, rsc->id)) { resource_t *container = pe_find_resource(data_set->resources, container_id); if (container) { rsc->container = container; container->fillers = g_list_append(container->fillers, rsc); pe_rsc_trace(rsc, "Resource %s's container is %s", rsc->id, container_id); } else { pe_err("Resource %s: Unknown resource container (%s)", rsc->id, container_id); } } } gboolean unpack_remote_nodes(xmlNode * xml_resources, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; - GHashTable *rsc_name_check = NULL; /* generate remote nodes from resource config before unpacking resources */ for (xml_obj = __xml_first_child(xml_resources); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { const char *new_node_id = NULL; /* first check if this is a bare metal remote node. Bare metal remote nodes * are defined as a resource primitive only. */ if (xml_contains_remote_node(xml_obj)) { new_node_id = ID(xml_obj); /* The "pe_find_node" check is here to make sure we don't iterate over * an expanded node that has already been added to the node list. */ if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) { crm_trace("Found baremetal remote node %s in container resource %s", new_node_id, ID(xml_obj)); pe_create_node(new_node_id, new_node_id, "remote", NULL, data_set); } continue; } /* Now check for guest remote nodes. * guest remote nodes are defined within a resource primitive. * Example1: a vm resource might be configured as a remote node. * Example2: a vm resource might be configured within a group to be a remote node. * Note: right now we only support guest remote nodes in as a standalone primitive * or a primitive within a group. No cloned primitives can be a guest remote node * right now */ if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RESOURCE, TRUE)) { /* expands a metadata defined remote resource into the xml config * as an actual rsc primitive to be unpacked later. */ new_node_id = expand_remote_rsc_meta(xml_obj, xml_resources, data_set); if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) { crm_trace("Found guest remote node %s in container resource %s", new_node_id, ID(xml_obj)); pe_create_node(new_node_id, new_node_id, "remote", NULL, data_set); } continue; } else if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_GROUP, TRUE)) { xmlNode *xml_obj2 = NULL; /* search through a group to see if any of the primitive contain a remote node. */ for (xml_obj2 = __xml_first_child(xml_obj); xml_obj2 != NULL; xml_obj2 = __xml_next_element(xml_obj2)) { new_node_id = expand_remote_rsc_meta(xml_obj2, xml_resources, data_set); if (new_node_id && pe_find_node(data_set->nodes, new_node_id) == NULL) { crm_trace("Found guest remote node %s in container resource %s which is in group %s", new_node_id, ID(xml_obj2), ID(xml_obj)); pe_create_node(new_node_id, new_node_id, "remote", NULL, data_set); } } } } - if (rsc_name_check) { - g_hash_table_destroy(rsc_name_check); - } - return TRUE; } /* Call this after all the nodes and resources have been * unpacked, but before the status section is read. * * A remote node's online status is reflected by the state * of the remote node's connection resource. We need to link * the remote node to this connection resource so we can have * easy access to the connection resource during the PE calculations. */ static void link_rsc2remotenode(pe_working_set_t *data_set, resource_t *new_rsc) { node_t *remote_node = NULL; if (new_rsc->is_remote_node == FALSE) { return; } if (is_set(data_set->flags, pe_flag_quick_location)) { /* remote_nodes and remote_resources are not linked in quick location calculations */ return; } print_resource(LOG_DEBUG_3, "Linking remote-node connection resource, ", new_rsc, FALSE); remote_node = pe_find_node(data_set->nodes, new_rsc->id); CRM_CHECK(remote_node != NULL, return;); remote_node->details->remote_rsc = new_rsc; /* If this is a baremetal remote-node (no container resource * associated with it) then we need to handle startup fencing the same way * as cluster nodes. */ if (new_rsc->container == NULL) { handle_startup_fencing(data_set, remote_node); } else { /* At this point we know if the remote node is a container or baremetal * remote node, update the #kind attribute if a container is involved */ g_hash_table_replace(remote_node->details->attrs, strdup("#kind"), strdup("container")); } } static void destroy_tag(gpointer data) { tag_t *tag = data; if (tag) { free(tag->id); g_list_free_full(tag->refs, free); free(tag); } } gboolean unpack_resources(xmlNode * xml_resources, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; GListPtr gIter = NULL; data_set->template_rsc_sets = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, destroy_tag); for (xml_obj = __xml_first_child(xml_resources); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { resource_t *new_rsc = NULL; if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_RSC_TEMPLATE, TRUE)) { const char *template_id = ID(xml_obj); if (template_id && g_hash_table_lookup_extended(data_set->template_rsc_sets, template_id, NULL, NULL) == FALSE) { /* Record the template's ID for the knowledge of its existence anyway. */ g_hash_table_insert(data_set->template_rsc_sets, strdup(template_id), NULL); } continue; } crm_trace("Beginning unpack... <%s id=%s... >", crm_element_name(xml_obj), ID(xml_obj)); if (common_unpack(xml_obj, &new_rsc, NULL, data_set)) { data_set->resources = g_list_append(data_set->resources, new_rsc); if (xml_contains_remote_node(xml_obj)) { new_rsc->is_remote_node = TRUE; } print_resource(LOG_DEBUG_3, "Added ", new_rsc, FALSE); } else { crm_config_err("Failed unpacking %s %s", crm_element_name(xml_obj), crm_element_value(xml_obj, XML_ATTR_ID)); if (new_rsc != NULL && new_rsc->fns != NULL) { new_rsc->fns->free(new_rsc); } } } for (gIter = data_set->resources; gIter != NULL; gIter = gIter->next) { resource_t *rsc = (resource_t *) gIter->data; setup_container(rsc, data_set); link_rsc2remotenode(data_set, rsc); } data_set->resources = g_list_sort(data_set->resources, sort_rsc_priority); if (is_set(data_set->flags, pe_flag_quick_location)) { /* Ignore */ } else if (is_set(data_set->flags, pe_flag_stonith_enabled) && is_set(data_set->flags, pe_flag_have_stonith_resource) == FALSE) { crm_config_err("Resource start-up disabled since no STONITH resources have been defined"); crm_config_err("Either configure some or disable STONITH with the stonith-enabled option"); crm_config_err("NOTE: Clusters with shared data need STONITH to ensure data integrity"); } return TRUE; } gboolean unpack_tags(xmlNode * xml_tags, pe_working_set_t * data_set) { xmlNode *xml_tag = NULL; data_set->tags = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, destroy_tag); for (xml_tag = __xml_first_child(xml_tags); xml_tag != NULL; xml_tag = __xml_next_element(xml_tag)) { xmlNode *xml_obj_ref = NULL; const char *tag_id = ID(xml_tag); if (crm_str_eq((const char *)xml_tag->name, XML_CIB_TAG_TAG, TRUE) == FALSE) { continue; } if (tag_id == NULL) { crm_config_err("Failed unpacking %s: %s should be specified", crm_element_name(xml_tag), XML_ATTR_ID); continue; } for (xml_obj_ref = __xml_first_child(xml_tag); xml_obj_ref != NULL; xml_obj_ref = __xml_next_element(xml_obj_ref)) { const char *obj_ref = ID(xml_obj_ref); if (crm_str_eq((const char *)xml_obj_ref->name, XML_CIB_TAG_OBJ_REF, TRUE) == FALSE) { continue; } if (obj_ref == NULL) { crm_config_err("Failed unpacking %s for tag %s: %s should be specified", crm_element_name(xml_obj_ref), tag_id, XML_ATTR_ID); continue; } if (add_tag_ref(data_set->tags, tag_id, obj_ref) == FALSE) { return FALSE; } } } return TRUE; } /* The ticket state section: * "/cib/status/tickets/ticket_state" */ static gboolean unpack_ticket_state(xmlNode * xml_ticket, pe_working_set_t * data_set) { const char *ticket_id = NULL; const char *granted = NULL; const char *last_granted = NULL; const char *standby = NULL; xmlAttrPtr xIter = NULL; ticket_t *ticket = NULL; ticket_id = ID(xml_ticket); if (ticket_id == NULL || strlen(ticket_id) == 0) { return FALSE; } crm_trace("Processing ticket state for %s", ticket_id); ticket = g_hash_table_lookup(data_set->tickets, ticket_id); if (ticket == NULL) { ticket = ticket_new(ticket_id, data_set); if (ticket == NULL) { return FALSE; } } for (xIter = xml_ticket->properties; xIter; xIter = xIter->next) { const char *prop_name = (const char *)xIter->name; const char *prop_value = crm_element_value(xml_ticket, prop_name); if (crm_str_eq(prop_name, XML_ATTR_ID, TRUE)) { continue; } g_hash_table_replace(ticket->state, strdup(prop_name), strdup(prop_value)); } granted = g_hash_table_lookup(ticket->state, "granted"); if (granted && crm_is_true(granted)) { ticket->granted = TRUE; crm_info("We have ticket '%s'", ticket->id); } else { ticket->granted = FALSE; crm_info("We do not have ticket '%s'", ticket->id); } last_granted = g_hash_table_lookup(ticket->state, "last-granted"); if (last_granted) { ticket->last_granted = crm_parse_int(last_granted, 0); } standby = g_hash_table_lookup(ticket->state, "standby"); if (standby && crm_is_true(standby)) { ticket->standby = TRUE; if (ticket->granted) { crm_info("Granted ticket '%s' is in standby-mode", ticket->id); } } else { ticket->standby = FALSE; } crm_trace("Done with ticket state for %s", ticket_id); return TRUE; } static gboolean unpack_tickets_state(xmlNode * xml_tickets, pe_working_set_t * data_set) { xmlNode *xml_obj = NULL; for (xml_obj = __xml_first_child(xml_tickets); xml_obj != NULL; xml_obj = __xml_next_element(xml_obj)) { if (crm_str_eq((const char *)xml_obj->name, XML_CIB_TAG_TICKET_STATE, TRUE) == FALSE) { continue; } unpack_ticket_state(xml_obj, data_set); } return TRUE; } /* Compatibility with the deprecated ticket state section: * "/cib/status/tickets/instance_attributes" */ static void get_ticket_state_legacy(gpointer key, gpointer value, gpointer user_data) { const char *long_key = key; char *state_key = NULL; const char *granted_prefix = "granted-ticket-"; const char *last_granted_prefix = "last-granted-"; static int granted_prefix_strlen = 0; static int last_granted_prefix_strlen = 0; const char *ticket_id = NULL; const char *is_granted = NULL; const char *last_granted = NULL; const char *sep = NULL; ticket_t *ticket = NULL; pe_working_set_t *data_set = user_data; if (granted_prefix_strlen == 0) { granted_prefix_strlen = strlen(granted_prefix); } if (last_granted_prefix_strlen == 0) { last_granted_prefix_strlen = strlen(last_granted_prefix); } if (strstr(long_key, granted_prefix) == long_key) { ticket_id = long_key + granted_prefix_strlen; if (strlen(ticket_id)) { state_key = strdup("granted"); is_granted = value; } } else if (strstr(long_key, last_granted_prefix) == long_key) { ticket_id = long_key + last_granted_prefix_strlen; if (strlen(ticket_id)) { state_key = strdup("last-granted"); last_granted = value; } } else if ((sep = strrchr(long_key, '-'))) { ticket_id = sep + 1; state_key = strndup(long_key, strlen(long_key) - strlen(sep)); } if (ticket_id == NULL || strlen(ticket_id) == 0) { free(state_key); return; } if (state_key == NULL || strlen(state_key) == 0) { free(state_key); return; } ticket = g_hash_table_lookup(data_set->tickets, ticket_id); if (ticket == NULL) { ticket = ticket_new(ticket_id, data_set); if (ticket == NULL) { free(state_key); return; } } g_hash_table_replace(ticket->state, state_key, strdup(value)); if (is_granted) { if (crm_is_true(is_granted)) { ticket->granted = TRUE; crm_info("We have ticket '%s'", ticket->id); } else { ticket->granted = FALSE; crm_info("We do not have ticket '%s'", ticket->id); } } else if (last_granted) { ticket->last_granted = crm_parse_int(last_granted, 0); } } /* remove nodes that are down, stopping */ /* create +ve rsc_to_node constraints between resources and the nodes they are running on */ /* anything else? */ gboolean unpack_status(xmlNode * status, pe_working_set_t * data_set) { const char *id = NULL; const char *uname = NULL; xmlNode *state = NULL; xmlNode *lrm_rsc = NULL; node_t *this_node = NULL; crm_trace("Beginning unpack"); if (data_set->tickets == NULL) { data_set->tickets = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, destroy_ticket); } for (state = __xml_first_child(status); state != NULL; state = __xml_next_element(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_TICKETS, TRUE)) { xmlNode *xml_tickets = state; GHashTable *state_hash = NULL; /* Compatibility with the deprecated ticket state section: * Unpack the attributes in the deprecated "/cib/status/tickets/instance_attributes" if it exists. */ state_hash = g_hash_table_new_full(crm_str_hash, g_str_equal, g_hash_destroy_str, g_hash_destroy_str); unpack_instance_attributes(data_set->input, xml_tickets, XML_TAG_ATTR_SETS, NULL, state_hash, NULL, TRUE, data_set->now); g_hash_table_foreach(state_hash, get_ticket_state_legacy, data_set); if (state_hash) { g_hash_table_destroy(state_hash); } /* Unpack the new "/cib/status/tickets/ticket_state"s */ unpack_tickets_state(xml_tickets, data_set); } if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE)) { xmlNode *attrs = NULL; const char *resource_discovery_enabled = NULL; id = crm_element_value(state, XML_ATTR_ID); uname = crm_element_value(state, XML_ATTR_UNAME); this_node = pe_find_node_any(data_set->nodes, id, uname); if (uname == NULL) { /* error */ continue; } else if (this_node == NULL) { crm_config_warn("Node %s in status section no longer exists", uname); continue; } else if (is_remote_node(this_node)) { /* online state for remote nodes is determined by the * rsc state after all the unpacking is done. we do however * need to mark whether or not the node has been fenced as this plays * a role during unpacking cluster node resource state */ this_node->details->remote_was_fenced = crm_atoi(crm_element_value(state, XML_NODE_IS_FENCED), "0"); continue; } crm_trace("Processing node id=%s, uname=%s", id, uname); /* Mark the node as provisionally clean * - at least we have seen it in the current cluster's lifetime */ this_node->details->unclean = FALSE; this_node->details->unseen = FALSE; attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); add_node_attrs(attrs, this_node, TRUE, data_set); if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "standby"))) { crm_info("Node %s is in standby-mode", this_node->details->uname); this_node->details->standby = TRUE; } if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "maintenance"))) { crm_info("Node %s is in maintenance-mode", this_node->details->uname); this_node->details->maintenance = TRUE; } resource_discovery_enabled = g_hash_table_lookup(this_node->details->attrs, XML_NODE_ATTR_RSC_DISCOVERY); if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) { crm_warn("ignoring %s attribute on node %s, disabling resource discovery is not allowed on cluster nodes", XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname); } crm_trace("determining node state"); determine_online_status(state, this_node, data_set); if (this_node->details->online && data_set->no_quorum_policy == no_quorum_suicide) { /* Everything else should flow from this automatically * At least until the PE becomes able to migrate off healthy resources */ pe_fence_node(data_set, this_node, "because the cluster does not have quorum"); } } } /* Now that we know all node states, we can safely handle migration ops */ for (state = __xml_first_child(status); state != NULL; state = __xml_next_element(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { continue; } id = crm_element_value(state, XML_ATTR_ID); uname = crm_element_value(state, XML_ATTR_UNAME); this_node = pe_find_node_any(data_set->nodes, id, uname); if (this_node == NULL) { crm_info("Node %s is unknown", id); continue; } else if (is_remote_node(this_node)) { /* online status of remote node can not be determined until all other * resource status is unpacked. */ continue; } else if (this_node->details->online || is_set(data_set->flags, pe_flag_stonith_enabled)) { crm_trace("Processing lrm resource entries on healthy node: %s", this_node->details->uname); lrm_rsc = find_xml_node(state, XML_CIB_TAG_LRM, FALSE); lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE); unpack_lrm_resources(this_node, lrm_rsc, data_set); } } /* now that the rest of the cluster's status is determined * calculate remote-nodes */ unpack_remote_status(status, data_set); return TRUE; } gboolean unpack_remote_status(xmlNode * status, pe_working_set_t * data_set) { const char *id = NULL; const char *uname = NULL; const char *shutdown = NULL; resource_t *rsc = NULL; GListPtr gIter = NULL; xmlNode *state = NULL; xmlNode *lrm_rsc = NULL; node_t *this_node = NULL; if (is_set(data_set->flags, pe_flag_have_remote_nodes) == FALSE) { crm_trace("no remote nodes to unpack"); return TRUE; } /* get online status */ for (gIter = data_set->nodes; gIter != NULL; gIter = gIter->next) { this_node = gIter->data; if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) { continue; } determine_remote_online_status(data_set, this_node); } /* process attributes */ for (state = __xml_first_child(status); state != NULL; state = __xml_next_element(state)) { const char *resource_discovery_enabled = NULL; xmlNode *attrs = NULL; if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { continue; } id = crm_element_value(state, XML_ATTR_ID); uname = crm_element_value(state, XML_ATTR_UNAME); this_node = pe_find_node_any(data_set->nodes, id, uname); if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) { continue; } crm_trace("Processing remote node id=%s, uname=%s", id, uname); this_node->details->remote_maintenance = crm_atoi(crm_element_value(state, XML_NODE_IS_MAINTENANCE), "0"); rsc = this_node->details->remote_rsc; if (this_node->details->remote_requires_reset == FALSE) { this_node->details->unclean = FALSE; this_node->details->unseen = FALSE; } attrs = find_xml_node(state, XML_TAG_TRANSIENT_NODEATTRS, FALSE); add_node_attrs(attrs, this_node, TRUE, data_set); shutdown = g_hash_table_lookup(this_node->details->attrs, XML_CIB_ATTR_SHUTDOWN); if (shutdown != NULL && safe_str_neq("0", shutdown)) { crm_info("Node %s is shutting down", this_node->details->uname); this_node->details->shutdown = TRUE; if (rsc) { rsc->next_role = RSC_ROLE_STOPPED; } } if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "standby"))) { crm_info("Node %s is in standby-mode", this_node->details->uname); this_node->details->standby = TRUE; } if (crm_is_true(g_hash_table_lookup(this_node->details->attrs, "maintenance")) || (rsc && !is_set(rsc->flags, pe_rsc_managed))) { crm_info("Node %s is in maintenance-mode", this_node->details->uname); this_node->details->maintenance = TRUE; } resource_discovery_enabled = g_hash_table_lookup(this_node->details->attrs, XML_NODE_ATTR_RSC_DISCOVERY); if (resource_discovery_enabled && !crm_is_true(resource_discovery_enabled)) { if (is_baremetal_remote_node(this_node) && is_not_set(data_set->flags, pe_flag_stonith_enabled)) { crm_warn("ignoring %s attribute on baremetal remote node %s, disabling resource discovery requires stonith to be enabled.", XML_NODE_ATTR_RSC_DISCOVERY, this_node->details->uname); } else { /* if we're here, this is either a baremetal node and fencing is enabled, * or this is a container node which we don't care if fencing is enabled * or not on. container nodes are 'fenced' by recovering the container resource * regardless of whether fencing is enabled. */ crm_info("Node %s has resource discovery disabled", this_node->details->uname); this_node->details->rsc_discovery_enabled = FALSE; } } } /* process node rsc status */ for (state = __xml_first_child(status); state != NULL; state = __xml_next_element(state)) { if (crm_str_eq((const char *)state->name, XML_CIB_TAG_STATE, TRUE) == FALSE) { continue; } id = crm_element_value(state, XML_ATTR_ID); uname = crm_element_value(state, XML_ATTR_UNAME); this_node = pe_find_node_any(data_set->nodes, id, uname); if ((this_node == NULL) || (is_remote_node(this_node) == FALSE)) { continue; } crm_trace("Processing lrm resource entries on healthy remote node: %s", this_node->details->uname); lrm_rsc = find_xml_node(state, XML_CIB_TAG_LRM, FALSE); lrm_rsc = find_xml_node(lrm_rsc, XML_LRM_TAG_RESOURCES, FALSE); unpack_lrm_resources(this_node, lrm_rsc, data_set); } return TRUE; } static gboolean determine_online_status_no_fencing(pe_working_set_t * data_set, xmlNode * node_state, node_t * this_node) { gboolean online = FALSE; const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE); const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER); const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER); const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); if (!crm_is_true(in_cluster)) { crm_trace("Node is down: in_cluster=%s", crm_str(in_cluster)); } else if (safe_str_eq(is_peer, ONLINESTATUS)) { if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) { online = TRUE; } else { crm_debug("Node is not ready to run resources: %s", join); } } else if (this_node->details->expected_up == FALSE) { crm_trace("CRMd is down: in_cluster=%s", crm_str(in_cluster)); crm_trace("\tis_peer=%s, join=%s, expected=%s", crm_str(is_peer), crm_str(join), crm_str(exp_state)); } else { /* mark it unclean */ pe_fence_node(data_set, this_node, "because node is unexpectedly down"); crm_info("\tin_cluster=%s, is_peer=%s, join=%s, expected=%s", crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state)); } return online; } static gboolean determine_online_status_fencing(pe_working_set_t * data_set, xmlNode * node_state, node_t * this_node) { gboolean online = FALSE; gboolean do_terminate = FALSE; const char *join = crm_element_value(node_state, XML_NODE_JOIN_STATE); const char *is_peer = crm_element_value(node_state, XML_NODE_IS_PEER); const char *in_cluster = crm_element_value(node_state, XML_NODE_IN_CLUSTER); const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); const char *terminate = g_hash_table_lookup(this_node->details->attrs, "terminate"); /* - XML_NODE_IN_CLUSTER ::= true|false - XML_NODE_IS_PEER ::= true|false|online|offline - XML_NODE_JOIN_STATE ::= member|down|pending|banned - XML_NODE_EXPECTED ::= member|down */ if (crm_is_true(terminate)) { do_terminate = TRUE; } else if (terminate != NULL && strlen(terminate) > 0) { /* could be a time() value */ char t = terminate[0]; if (t != '0' && isdigit(t)) { do_terminate = TRUE; } } crm_trace("%s: in_cluster=%s, is_peer=%s, join=%s, expected=%s, term=%d", this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state), do_terminate); online = crm_is_true(in_cluster); if (safe_str_eq(is_peer, ONLINESTATUS)) { is_peer = XML_BOOLEAN_YES; } if (exp_state == NULL) { exp_state = CRMD_JOINSTATE_DOWN; } if (this_node->details->shutdown) { crm_debug("%s is shutting down", this_node->details->uname); /* Slightly different criteria since we can't shut down a dead peer */ online = crm_is_true(is_peer); } else if (in_cluster == NULL) { pe_fence_node(data_set, this_node, "because the peer has not been seen by the cluster"); } else if (safe_str_eq(join, CRMD_JOINSTATE_NACK)) { pe_fence_node(data_set, this_node, "because it failed the pacemaker membership criteria"); } else if (do_terminate == FALSE && safe_str_eq(exp_state, CRMD_JOINSTATE_DOWN)) { if (crm_is_true(in_cluster) || crm_is_true(is_peer)) { crm_info("- Node %s is not ready to run resources", this_node->details->uname); this_node->details->standby = TRUE; this_node->details->pending = TRUE; } else { crm_trace("%s is down or still coming up", this_node->details->uname); } } else if (do_terminate && safe_str_eq(join, CRMD_JOINSTATE_DOWN) && crm_is_true(in_cluster) == FALSE && crm_is_true(is_peer) == FALSE) { crm_info("Node %s was just shot", this_node->details->uname); online = FALSE; } else if (crm_is_true(in_cluster) == FALSE) { pe_fence_node(data_set, this_node, "because the node is no longer part of the cluster"); } else if (crm_is_true(is_peer) == FALSE) { pe_fence_node(data_set, this_node, "because our peer process is no longer available"); /* Everything is running at this point, now check join state */ } else if (do_terminate) { pe_fence_node(data_set, this_node, "because termination was requested"); } else if (safe_str_eq(join, CRMD_JOINSTATE_MEMBER)) { crm_info("Node %s is active", this_node->details->uname); } else if (safe_str_eq(join, CRMD_JOINSTATE_PENDING) || safe_str_eq(join, CRMD_JOINSTATE_DOWN)) { crm_info("Node %s is not ready to run resources", this_node->details->uname); this_node->details->standby = TRUE; this_node->details->pending = TRUE; } else { pe_fence_node(data_set, this_node, "because the peer was in an unknown state"); crm_warn("%s: in-cluster=%s, is-peer=%s, join=%s, expected=%s, term=%d, shutdown=%d", this_node->details->uname, crm_str(in_cluster), crm_str(is_peer), crm_str(join), crm_str(exp_state), do_terminate, this_node->details->shutdown); } return online; } static gboolean determine_remote_online_status(pe_working_set_t * data_set, node_t * this_node) { resource_t *rsc = this_node->details->remote_rsc; resource_t *container = NULL; pe_node_t *host = NULL; /* If there is a node state entry for a (former) Pacemaker Remote node * but no resource creating that node, the node's connection resource will * be NULL. Consider it an offline remote node in that case. */ if (rsc == NULL) { this_node->details->online = FALSE; goto remote_online_done; } container = rsc->container; if (container && (g_list_length(rsc->running_on) == 1)) { host = rsc->running_on->data; } /* If the resource is currently started, mark it online. */ if (rsc->role == RSC_ROLE_STARTED) { crm_trace("%s node %s presumed ONLINE because connection resource is started", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = TRUE; } /* consider this node shutting down if transitioning start->stop */ if (rsc->role == RSC_ROLE_STARTED && rsc->next_role == RSC_ROLE_STOPPED) { crm_trace("%s node %s shutting down because connection resource is stopping", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->shutdown = TRUE; } /* Now check all the failure conditions. */ if(container && is_set(container->flags, pe_rsc_failed)) { crm_trace("Guest node %s UNCLEAN because guest resource failed", this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = TRUE; } else if(is_set(rsc->flags, pe_rsc_failed)) { crm_trace("%s node %s OFFLINE because connection resource failed", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; } else if (rsc->role == RSC_ROLE_STOPPED || (container && container->role == RSC_ROLE_STOPPED)) { crm_trace("%s node %s OFFLINE because its resource is stopped", (container? "Guest" : "Remote"), this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = FALSE; } else if (host && (host->details->online == FALSE) && host->details->unclean) { crm_trace("Guest node %s UNCLEAN because host is unclean", this_node->details->id); this_node->details->online = FALSE; this_node->details->remote_requires_reset = TRUE; } remote_online_done: crm_trace("Remote node %s online=%s", this_node->details->id, this_node->details->online ? "TRUE" : "FALSE"); return this_node->details->online; } gboolean determine_online_status(xmlNode * node_state, node_t * this_node, pe_working_set_t * data_set) { gboolean online = FALSE; const char *shutdown = NULL; const char *exp_state = crm_element_value(node_state, XML_NODE_EXPECTED); if (this_node == NULL) { crm_config_err("No node to check"); return online; } this_node->details->shutdown = FALSE; this_node->details->expected_up = FALSE; shutdown = g_hash_table_lookup(this_node->details->attrs, XML_CIB_ATTR_SHUTDOWN); if (shutdown != NULL && safe_str_neq("0", shutdown)) { this_node->details->shutdown = TRUE; } else if (safe_str_eq(exp_state, CRMD_JOINSTATE_MEMBER)) { this_node->details->expected_up = TRUE; } if (this_node->details->type == node_ping) { this_node->details->unclean = FALSE; online = FALSE; /* As far as resource management is concerned, * the node is safely offline. * Anyone caught abusing this logic will be shot */ } else if (is_set(data_set->flags, pe_flag_stonith_enabled) == FALSE) { online = determine_online_status_no_fencing(data_set, node_state, this_node); } else { online = determine_online_status_fencing(data_set, node_state, this_node); } if (online) { this_node->details->online = TRUE; } else { /* remove node from contention */ this_node->fixed = TRUE; this_node->weight = -INFINITY; } if (online && this_node->details->shutdown) { /* don't run resources here */ this_node->fixed = TRUE; this_node->weight = -INFINITY; } if (this_node->details->type == node_ping) { crm_info("Node %s is not a pacemaker node", this_node->details->uname); } else if (this_node->details->unclean) { pe_proc_warn("Node %s is unclean", this_node->details->uname); } else if (this_node->details->online) { crm_info("Node %s is %s", this_node->details->uname, this_node->details->shutdown ? "shutting down" : this_node->details->pending ? "pending" : this_node->details->standby ? "standby" : this_node->details->maintenance ? "maintenance" : "online"); } else { crm_trace("Node %s is offline", this_node->details->uname); } return online; } char * clone_strip(const char *last_rsc_id) { int lpc = 0; char *zero = NULL; CRM_CHECK(last_rsc_id != NULL, return NULL); lpc = strlen(last_rsc_id); while (--lpc > 0) { switch (last_rsc_id[lpc]) { case 0: crm_err("Empty string: %s", last_rsc_id); return NULL; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case ':': zero = calloc(1, lpc + 1); memcpy(zero, last_rsc_id, lpc); zero[lpc] = 0; return zero; default: goto done; } } done: zero = strdup(last_rsc_id); return zero; } char * clone_zero(const char *last_rsc_id) { int lpc = 0; char *zero = NULL; CRM_CHECK(last_rsc_id != NULL, return NULL); if (last_rsc_id != NULL) { lpc = strlen(last_rsc_id); } while (--lpc > 0) { switch (last_rsc_id[lpc]) { case 0: return NULL; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': break; case ':': zero = calloc(1, lpc + 3); memcpy(zero, last_rsc_id, lpc); zero[lpc] = ':'; zero[lpc + 1] = '0'; zero[lpc + 2] = 0; return zero; default: goto done; } } done: lpc = strlen(last_rsc_id); zero = calloc(1, lpc + 3); memcpy(zero, last_rsc_id, lpc); zero[lpc] = ':'; zero[lpc + 1] = '0'; zero[lpc + 2] = 0; crm_trace("%s -> %s", last_rsc_id, zero); return zero; } static resource_t * create_fake_resource(const char *rsc_id, xmlNode * rsc_entry, pe_working_set_t * data_set) { resource_t *rsc = NULL; xmlNode *xml_rsc = create_xml_node(NULL, XML_CIB_TAG_RESOURCE); copy_in_properties(xml_rsc, rsc_entry); crm_xml_add(xml_rsc, XML_ATTR_ID, rsc_id); crm_log_xml_debug(xml_rsc, "Orphan resource"); if (!common_unpack(xml_rsc, &rsc, NULL, data_set)) { return NULL; } if (xml_contains_remote_node(xml_rsc)) { node_t *node; crm_debug("Detected orphaned remote node %s", rsc_id); rsc->is_remote_node = TRUE; node = pe_find_node(data_set->nodes, rsc_id); if (node == NULL) { node = pe_create_node(rsc_id, rsc_id, "remote", NULL, data_set); } link_rsc2remotenode(data_set, rsc); if (node) { crm_trace("Setting node %s as shutting down due to orphaned connection resource", rsc_id); node->details->shutdown = TRUE; } } if (crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER)) { /* This orphaned rsc needs to be mapped to a container. */ crm_trace("Detected orphaned container filler %s", rsc_id); set_bit(rsc->flags, pe_rsc_orphan_container_filler); } set_bit(rsc->flags, pe_rsc_orphan); data_set->resources = g_list_append(data_set->resources, rsc); return rsc; } extern resource_t *create_child_clone(resource_t * rsc, int sub_id, pe_working_set_t * data_set); static resource_t * find_anonymous_clone(pe_working_set_t * data_set, node_t * node, resource_t * parent, const char *rsc_id) { GListPtr rIter = NULL; resource_t *rsc = NULL; gboolean skip_inactive = FALSE; CRM_ASSERT(parent != NULL); CRM_ASSERT(pe_rsc_is_clone(parent)); CRM_ASSERT(is_not_set(parent->flags, pe_rsc_unique)); /* Find an instance active (or partially active for grouped clones) on the specified node */ pe_rsc_trace(parent, "Looking for %s on %s in %s", rsc_id, node->details->uname, parent->id); for (rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) { GListPtr nIter = NULL; GListPtr locations = NULL; resource_t *child = rIter->data; child->fns->location(child, &locations, TRUE); if (locations == NULL) { pe_rsc_trace(child, "Resource %s, skip inactive", child->id); continue; } for (nIter = locations; nIter && rsc == NULL; nIter = nIter->next) { node_t *childnode = nIter->data; if (childnode->details == node->details) { /* ->find_rsc() because we might be a cloned group */ rsc = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone); if(rsc) { pe_rsc_trace(rsc, "Resource %s, active", rsc->id); } } /* Keep this block, it means we'll do the right thing if * anyone toggles the unique flag to 'off' */ if (rsc && rsc->running_on) { crm_notice("/Anonymous/ clone %s is already running on %s", parent->id, node->details->uname); skip_inactive = TRUE; rsc = NULL; } } g_list_free(locations); } /* Find an inactive instance */ if (skip_inactive == FALSE) { pe_rsc_trace(parent, "Looking for %s anywhere", rsc_id); for (rIter = parent->children; rsc == NULL && rIter; rIter = rIter->next) { GListPtr locations = NULL; resource_t *child = rIter->data; if (is_set(child->flags, pe_rsc_block)) { pe_rsc_trace(child, "Skip: blocked in stopped state"); continue; } child->fns->location(child, &locations, TRUE); if (locations == NULL) { /* ->find_rsc() because we might be a cloned group */ rsc = parent->fns->find_rsc(child, rsc_id, NULL, pe_find_clone); pe_rsc_trace(parent, "Resource %s, empty slot", rsc->id); } g_list_free(locations); } } if (rsc == NULL) { /* Create an extra orphan */ resource_t *top = create_child_clone(parent, -1, data_set); /* ->find_rsc() because we might be a cloned group */ rsc = top->fns->find_rsc(top, rsc_id, NULL, pe_find_clone); CRM_ASSERT(rsc != NULL); pe_rsc_debug(parent, "Created orphan %s for %s: %s on %s", top->id, parent->id, rsc_id, node->details->uname); } if (safe_str_neq(rsc_id, rsc->id)) { pe_rsc_debug(rsc, "Internally renamed %s on %s to %s%s", rsc_id, node->details->uname, rsc->id, is_set(rsc->flags, pe_rsc_orphan) ? " (ORPHAN)" : ""); } return rsc; } static resource_t * unpack_find_resource(pe_working_set_t * data_set, node_t * node, const char *rsc_id, xmlNode * rsc_entry) { resource_t *rsc = NULL; resource_t *parent = NULL; crm_trace("looking for %s", rsc_id); rsc = pe_find_resource(data_set->resources, rsc_id); /* no match */ if (rsc == NULL) { /* Even when clone-max=0, we still create a single :0 orphan to match against */ char *tmp = clone_zero(rsc_id); resource_t *clone0 = pe_find_resource(data_set->resources, tmp); if (clone0 && is_not_set(clone0->flags, pe_rsc_unique)) { rsc = clone0; } else { crm_trace("%s is not known as %s either", rsc_id, tmp); } parent = uber_parent(clone0); free(tmp); crm_trace("%s not found: %s", rsc_id, parent ? parent->id : "orphan"); } else if (rsc->variant > pe_native) { crm_trace("%s is no longer a primitive resource, the lrm_resource entry is obsolete", rsc_id); return NULL; } else { parent = uber_parent(rsc); } if(parent && parent->parent) { rsc = find_container_child(rsc_id, rsc, node); } else if (pe_rsc_is_clone(parent)) { if (is_not_set(parent->flags, pe_rsc_unique)) { char *base = clone_strip(rsc_id); rsc = find_anonymous_clone(data_set, node, parent, base); CRM_ASSERT(rsc != NULL); free(base); } if (rsc && safe_str_neq(rsc_id, rsc->id)) { free(rsc->clone_name); rsc->clone_name = strdup(rsc_id); } } return rsc; } static resource_t * process_orphan_resource(xmlNode * rsc_entry, node_t * node, pe_working_set_t * data_set) { resource_t *rsc = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); crm_debug("Detected orphan resource %s on %s", rsc_id, node->details->uname); rsc = create_fake_resource(rsc_id, rsc_entry, data_set); if (is_set(data_set->flags, pe_flag_stop_rsc_orphans) == FALSE) { clear_bit(rsc->flags, pe_rsc_managed); } else { print_resource(LOG_DEBUG_3, "Added orphan", rsc, FALSE); CRM_CHECK(rsc != NULL, return NULL); resource_location(rsc, NULL, -INFINITY, "__orphan_dont_run__", data_set); } return rsc; } static void process_rsc_state(resource_t * rsc, node_t * node, enum action_fail_response on_fail, xmlNode * migrate_op, pe_working_set_t * data_set) { node_t *tmpnode = NULL; CRM_ASSERT(rsc); pe_rsc_trace(rsc, "Resource %s is %s on %s: on_fail=%s", rsc->id, role2text(rsc->role), node->details->uname, fail2text(on_fail)); /* process current state */ if (rsc->role != RSC_ROLE_UNKNOWN) { resource_t *iter = rsc; while (iter) { if (g_hash_table_lookup(iter->known_on, node->details->id) == NULL) { node_t *n = node_copy(node); pe_rsc_trace(rsc, "%s (aka. %s) known on %s", rsc->id, rsc->clone_name, n->details->uname); g_hash_table_insert(iter->known_on, (gpointer) n->details->id, n); } if (is_set(iter->flags, pe_rsc_unique)) { break; } iter = iter->parent; } } /* If a managed resource is believed to be running, but node is down ... */ if (rsc->role > RSC_ROLE_STOPPED && node->details->online == FALSE && node->details->maintenance == FALSE && is_set(rsc->flags, pe_rsc_managed)) { char *reason = NULL; gboolean should_fence = FALSE; /* If this is a guest node, fence it (regardless of whether fencing is * enabled, because guest node fencing is done by recovery of the * container resource rather than by stonithd). Mark the resource * we're processing as failed. When the guest comes back up, its * operation history in the CIB will be cleared, freeing the affected * resource to run again once we are sure we know its state. */ if (is_container_remote_node(node)) { set_bit(rsc->flags, pe_rsc_failed); should_fence = TRUE; } else if (is_set(data_set->flags, pe_flag_stonith_enabled)) { if (is_baremetal_remote_node(node) && node->details->remote_rsc && is_not_set(node->details->remote_rsc->flags, pe_rsc_failed)) { /* setting unseen = true means that fencing of the remote node will * only occur if the connection resource is not going to start somewhere. * This allows connection resources on a failed cluster-node to move to * another node without requiring the baremetal remote nodes to be fenced * as well. */ node->details->unseen = TRUE; reason = crm_strdup_printf("because %s is active there. Fencing will be revoked if remote-node connection can be re-established on another cluster-node.", rsc->id); } should_fence = TRUE; } if (should_fence) { if (reason == NULL) { reason = crm_strdup_printf("because %s is thought to be active there", rsc->id); } pe_fence_node(data_set, node, reason); } free(reason); } if (node->details->unclean) { /* No extra processing needed * Also allows resources to be started again after a node is shot */ on_fail = action_fail_ignore; } switch (on_fail) { case action_fail_ignore: /* nothing to do */ break; case action_fail_fence: /* treat it as if it is still running * but also mark the node as unclean */ pe_fence_node(data_set, node, "because of resource failure(s)"); break; case action_fail_standby: node->details->standby = TRUE; node->details->standby_onfail = TRUE; break; case action_fail_block: /* is_managed == FALSE will prevent any * actions being sent for the resource */ clear_bit(rsc->flags, pe_rsc_managed); set_bit(rsc->flags, pe_rsc_block); break; case action_fail_migrate: /* make sure it comes up somewhere else * or not at all */ resource_location(rsc, node, -INFINITY, "__action_migration_auto__", data_set); break; case action_fail_stop: rsc->next_role = RSC_ROLE_STOPPED; break; case action_fail_recover: if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { set_bit(rsc->flags, pe_rsc_failed); stop_action(rsc, node, FALSE); } break; case action_fail_restart_container: set_bit(rsc->flags, pe_rsc_failed); if (rsc->container) { stop_action(rsc->container, node, FALSE); } else if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { stop_action(rsc, node, FALSE); } break; case action_fail_reset_remote: set_bit(rsc->flags, pe_rsc_failed); if (is_set(data_set->flags, pe_flag_stonith_enabled)) { tmpnode = NULL; if (rsc->is_remote_node) { tmpnode = pe_find_node(data_set->nodes, rsc->id); } if (tmpnode && is_baremetal_remote_node(tmpnode) && tmpnode->details->remote_was_fenced == 0) { /* connection resource to baremetal resource failed in a way that * should result in fencing the remote-node. */ pe_fence_node(data_set, tmpnode, "because of connection failure(s)"); } } /* require the stop action regardless if fencing is occuring or not. */ if (rsc->role > RSC_ROLE_STOPPED) { stop_action(rsc, node, FALSE); } /* if reconnect delay is in use, prevent the connection from exiting the * "STOPPED" role until the failure is cleared by the delay timeout. */ if (rsc->remote_reconnect_interval) { rsc->next_role = RSC_ROLE_STOPPED; } break; } /* ensure a remote-node connection failure forces an unclean remote-node * to be fenced. By setting unseen = FALSE, the remote-node failure will * result in a fencing operation regardless if we're going to attempt to * reconnect to the remote-node in this transition or not. */ if (is_set(rsc->flags, pe_rsc_failed) && rsc->is_remote_node) { tmpnode = pe_find_node(data_set->nodes, rsc->id); if (tmpnode && tmpnode->details->unclean) { tmpnode->details->unseen = FALSE; } } if (rsc->role != RSC_ROLE_STOPPED && rsc->role != RSC_ROLE_UNKNOWN) { if (is_set(rsc->flags, pe_rsc_orphan)) { if (is_set(rsc->flags, pe_rsc_managed)) { crm_config_warn("Detected active orphan %s running on %s", rsc->id, node->details->uname); } else { crm_config_warn("Cluster configured not to stop active orphans." " %s must be stopped manually on %s", rsc->id, node->details->uname); } } native_add_running(rsc, node, data_set); if (on_fail != action_fail_ignore) { set_bit(rsc->flags, pe_rsc_failed); } } else if (rsc->clone_name && strchr(rsc->clone_name, ':') != NULL) { /* Only do this for older status sections that included instance numbers * Otherwise stopped instances will appear as orphans */ pe_rsc_trace(rsc, "Resetting clone_name %s for %s (stopped)", rsc->clone_name, rsc->id); free(rsc->clone_name); rsc->clone_name = NULL; } else { char *key = stop_key(rsc); GListPtr possible_matches = find_actions(rsc->actions, key, node); GListPtr gIter = possible_matches; for (; gIter != NULL; gIter = gIter->next) { action_t *stop = (action_t *) gIter->data; stop->flags |= pe_action_optional; } g_list_free(possible_matches); free(key); } } /* create active recurring operations as optional */ static void process_recurring(node_t * node, resource_t * rsc, int start_index, int stop_index, GListPtr sorted_op_list, pe_working_set_t * data_set) { int counter = -1; const char *task = NULL; const char *status = NULL; GListPtr gIter = sorted_op_list; CRM_ASSERT(rsc); pe_rsc_trace(rsc, "%s: Start index %d, stop index = %d", rsc->id, start_index, stop_index); for (; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; int interval = 0; char *key = NULL; const char *id = ID(rsc_op); const char *interval_s = NULL; counter++; if (node->details->online == FALSE) { pe_rsc_trace(rsc, "Skipping %s/%s: node is offline", rsc->id, node->details->uname); break; /* Need to check if there's a monitor for role="Stopped" */ } else if (start_index < stop_index && counter <= stop_index) { pe_rsc_trace(rsc, "Skipping %s/%s: resource is not active", id, node->details->uname); continue; } else if (counter < start_index) { pe_rsc_trace(rsc, "Skipping %s/%s: old %d", id, node->details->uname, counter); continue; } interval_s = crm_element_value(rsc_op, XML_LRM_ATTR_INTERVAL); interval = crm_parse_int(interval_s, "0"); if (interval == 0) { pe_rsc_trace(rsc, "Skipping %s/%s: non-recurring", id, node->details->uname); continue; } status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS); if (safe_str_eq(status, "-1")) { pe_rsc_trace(rsc, "Skipping %s/%s: status", id, node->details->uname); continue; } task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); /* create the action */ key = generate_op_key(rsc->id, task, interval); pe_rsc_trace(rsc, "Creating %s/%s", key, node->details->uname); custom_action(rsc, key, task, node, TRUE, TRUE, data_set); } } void calculate_active_ops(GListPtr sorted_op_list, int *start_index, int *stop_index) { int counter = -1; int implied_monitor_start = -1; int implied_master_start = -1; const char *task = NULL; const char *status = NULL; GListPtr gIter = sorted_op_list; *stop_index = -1; *start_index = -1; for (; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; counter++; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); status = crm_element_value(rsc_op, XML_LRM_ATTR_OPSTATUS); if (safe_str_eq(task, CRMD_ACTION_STOP) && safe_str_eq(status, "0")) { *stop_index = counter; } else if (safe_str_eq(task, CRMD_ACTION_START) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { *start_index = counter; } else if ((implied_monitor_start <= *stop_index) && safe_str_eq(task, CRMD_ACTION_STATUS)) { const char *rc = crm_element_value(rsc_op, XML_LRM_ATTR_RC); if (safe_str_eq(rc, "0") || safe_str_eq(rc, "8")) { implied_monitor_start = counter; } } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE) || safe_str_eq(task, CRMD_ACTION_DEMOTE)) { implied_master_start = counter; } } if (*start_index == -1) { if (implied_master_start != -1) { *start_index = implied_master_start; } else if (implied_monitor_start != -1) { *start_index = implied_monitor_start; } } } static resource_t * unpack_lrm_rsc_state(node_t * node, xmlNode * rsc_entry, pe_working_set_t * data_set) { GListPtr gIter = NULL; int stop_index = -1; int start_index = -1; enum rsc_role_e req_role = RSC_ROLE_UNKNOWN; const char *task = NULL; const char *rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); resource_t *rsc = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; xmlNode *migrate_op = NULL; xmlNode *rsc_op = NULL; xmlNode *last_failure = NULL; enum action_fail_response on_fail = FALSE; enum rsc_role_e saved_role = RSC_ROLE_UNKNOWN; crm_trace("[%s] Processing %s on %s", crm_element_name(rsc_entry), rsc_id, node->details->uname); /* extract operations */ op_list = NULL; sorted_op_list = NULL; for (rsc_op = __xml_first_child(rsc_entry); rsc_op != NULL; rsc_op = __xml_next_element(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) { op_list = g_list_prepend(op_list, rsc_op); } } if (op_list == NULL) { /* if there are no operations, there is nothing to do */ return NULL; } /* find the resource */ rsc = unpack_find_resource(data_set, node, rsc_id, rsc_entry); if (rsc == NULL) { rsc = process_orphan_resource(rsc_entry, node, data_set); } CRM_ASSERT(rsc != NULL); /* process operations */ saved_role = rsc->role; on_fail = action_fail_ignore; rsc->role = RSC_ROLE_UNKNOWN; sorted_op_list = g_list_sort(op_list, sort_op_by_callid); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; task = crm_element_value(rsc_op, XML_LRM_ATTR_TASK); if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { migrate_op = rsc_op; } unpack_rsc_op(rsc, node, rsc_op, &last_failure, &on_fail, data_set); } /* create active recurring operations as optional */ calculate_active_ops(sorted_op_list, &start_index, &stop_index); process_recurring(node, rsc, start_index, stop_index, sorted_op_list, data_set); /* no need to free the contents */ g_list_free(sorted_op_list); process_rsc_state(rsc, node, on_fail, migrate_op, data_set); if (get_target_role(rsc, &req_role)) { if (rsc->next_role == RSC_ROLE_UNKNOWN || req_role < rsc->next_role) { pe_rsc_debug(rsc, "%s: Overwriting calculated next role %s" " with requested next role %s", rsc->id, role2text(rsc->next_role), role2text(req_role)); rsc->next_role = req_role; } else if (req_role > rsc->next_role) { pe_rsc_info(rsc, "%s: Not overwriting calculated next role %s" " with requested next role %s", rsc->id, role2text(rsc->next_role), role2text(req_role)); } } if (saved_role > rsc->role) { rsc->role = saved_role; } return rsc; } static void handle_orphaned_container_fillers(xmlNode * lrm_rsc_list, pe_working_set_t * data_set) { xmlNode *rsc_entry = NULL; for (rsc_entry = __xml_first_child(lrm_rsc_list); rsc_entry != NULL; rsc_entry = __xml_next_element(rsc_entry)) { resource_t *rsc; resource_t *container; const char *rsc_id; const char *container_id; if (safe_str_neq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE)) { continue; } container_id = crm_element_value(rsc_entry, XML_RSC_ATTR_CONTAINER); rsc_id = crm_element_value(rsc_entry, XML_ATTR_ID); if (container_id == NULL || rsc_id == NULL) { continue; } container = pe_find_resource(data_set->resources, container_id); if (container == NULL) { continue; } rsc = pe_find_resource(data_set->resources, rsc_id); if (rsc == NULL || is_set(rsc->flags, pe_rsc_orphan_container_filler) == FALSE || rsc->container != NULL) { continue; } pe_rsc_trace(rsc, "Mapped orphaned rsc %s's container to %s", rsc->id, container_id); rsc->container = container; container->fillers = g_list_append(container->fillers, rsc); } } gboolean unpack_lrm_resources(node_t * node, xmlNode * lrm_rsc_list, pe_working_set_t * data_set) { xmlNode *rsc_entry = NULL; gboolean found_orphaned_container_filler = FALSE; GListPtr unexpected_containers = NULL; GListPtr gIter = NULL; resource_t *remote = NULL; CRM_CHECK(node != NULL, return FALSE); crm_trace("Unpacking resources on %s", node->details->uname); for (rsc_entry = __xml_first_child(lrm_rsc_list); rsc_entry != NULL; rsc_entry = __xml_next_element(rsc_entry)) { if (crm_str_eq((const char *)rsc_entry->name, XML_LRM_TAG_RESOURCE, TRUE)) { resource_t *rsc; rsc = unpack_lrm_rsc_state(node, rsc_entry, data_set); if (!rsc) { continue; } if (is_set(rsc->flags, pe_rsc_orphan_container_filler)) { found_orphaned_container_filler = TRUE; } if (is_set(rsc->flags, pe_rsc_unexpectedly_running)) { remote = rsc_contains_remote_node(data_set, rsc); if (remote) { unexpected_containers = g_list_append(unexpected_containers, remote); } } } } /* If a container resource is unexpectedly up... and the remote-node * connection resource for that container is not up, the entire container * must be recovered. */ for (gIter = unexpected_containers; gIter != NULL; gIter = gIter->next) { remote = (resource_t *) gIter->data; if (remote->role != RSC_ROLE_STARTED) { crm_warn("Recovering container resource %s. Resource is unexpectedly running and involves a remote-node.", remote->container->id); set_bit(remote->container->flags, pe_rsc_failed); } } /* now that all the resource state has been unpacked for this node * we have to go back and map any orphaned container fillers to their * container resource */ if (found_orphaned_container_filler) { handle_orphaned_container_fillers(lrm_rsc_list, data_set); } g_list_free(unexpected_containers); return TRUE; } static void set_active(resource_t * rsc) { resource_t *top = uber_parent(rsc); if (top && top->variant == pe_master) { rsc->role = RSC_ROLE_SLAVE; } else { rsc->role = RSC_ROLE_STARTED; } } static void set_node_score(gpointer key, gpointer value, gpointer user_data) { node_t *node = value; int *score = user_data; node->weight = *score; } #define STATUS_PATH_MAX 1024 static xmlNode * find_lrm_op(const char *resource, const char *op, const char *node, const char *source, pe_working_set_t * data_set) { int offset = 0; char xpath[STATUS_PATH_MAX]; offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//node_state[@uname='%s']", node); offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "//" XML_LRM_TAG_RESOURCE "[@id='%s']", resource); /* Need to check against transition_magic too? */ if (source && safe_str_eq(op, CRMD_ACTION_MIGRATE)) { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_target='%s']", op, source); } else if (source && safe_str_eq(op, CRMD_ACTION_MIGRATED)) { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s' and @migrate_source='%s']", op, source); } else { offset += snprintf(xpath + offset, STATUS_PATH_MAX - offset, "/" XML_LRM_TAG_RSC_OP "[@operation='%s']", op); } CRM_LOG_ASSERT(offset > 0); return get_xpath_object(xpath, data_set->input, LOG_DEBUG); } static void unpack_rsc_migration(resource_t *rsc, node_t *node, xmlNode *xml_op, pe_working_set_t * data_set) { /* * The normal sequence is (now): migrate_to(Src) -> migrate_from(Tgt) -> stop(Src) * * So if a migrate_to is followed by a stop, then we don't need to care what * happened on the target node * * Without the stop, we need to look for a successful migrate_from. * This would also imply we're no longer running on the source * * Without the stop, and without a migrate_from op we make sure the resource * gets stopped on both source and target (assuming the target is up) * */ int stop_id = 0; int task_id = 0; xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, node->details->id, NULL, data_set); if (stop_op) { crm_element_value_int(stop_op, XML_LRM_ATTR_CALLID, &stop_id); } crm_element_value_int(xml_op, XML_LRM_ATTR_CALLID, &task_id); if (stop_op == NULL || stop_id < task_id) { int from_rc = 0, from_status = 0; const char *migrate_source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); node_t *target = pe_find_node(data_set->nodes, migrate_target); node_t *source = pe_find_node(data_set->nodes, migrate_source); xmlNode *migrate_from = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, migrate_target, migrate_source, data_set); rsc->role = RSC_ROLE_STARTED; /* can be master? */ if (migrate_from) { crm_element_value_int(migrate_from, XML_LRM_ATTR_RC, &from_rc); crm_element_value_int(migrate_from, XML_LRM_ATTR_OPSTATUS, &from_status); pe_rsc_trace(rsc, "%s op on %s exited with status=%d, rc=%d", ID(migrate_from), migrate_target, from_status, from_rc); } if (migrate_from && from_rc == PCMK_OCF_OK && from_status == PCMK_LRM_OP_DONE) { pe_rsc_trace(rsc, "Detected dangling migration op: %s on %s", ID(xml_op), migrate_source); /* all good * just need to arrange for the stop action to get sent * but _without_ affecting the target somehow */ rsc->role = RSC_ROLE_STOPPED; rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); } else if (migrate_from) { /* Failed */ if (target && target->details->online) { pe_rsc_trace(rsc, "Marking active on %s %p %d", migrate_target, target, target->details->online); native_add_running(rsc, target, data_set); } } else { /* Pending or complete but erased */ if (target && target->details->online) { pe_rsc_trace(rsc, "Marking active on %s %p %d", migrate_target, target, target->details->online); native_add_running(rsc, target, data_set); if (source && source->details->online) { /* If we make it here we have a partial migration. The migrate_to * has completed but the migrate_from on the target has not. Hold on * to the target and source on the resource. Later on if we detect that * the resource is still going to run on that target, we may continue * the migration */ rsc->partial_migration_target = target; rsc->partial_migration_source = source; } } else { /* Consider it failed here - forces a restart, prevents migration */ set_bit(rsc->flags, pe_rsc_failed); clear_bit(rsc->flags, pe_rsc_allow_migrate); } } } } static void unpack_rsc_migration_failure(resource_t *rsc, node_t *node, xmlNode *xml_op, pe_working_set_t * data_set) { const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); CRM_ASSERT(rsc); if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { int stop_id = 0; int migrate_id = 0; const char *migrate_source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, migrate_source, NULL, data_set); xmlNode *migrate_op = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATE, migrate_source, migrate_target, data_set); if (stop_op) { crm_element_value_int(stop_op, XML_LRM_ATTR_CALLID, &stop_id); } if (migrate_op) { crm_element_value_int(migrate_op, XML_LRM_ATTR_CALLID, &migrate_id); } /* Get our state right */ rsc->role = RSC_ROLE_STARTED; /* can be master? */ if (stop_op == NULL || stop_id < migrate_id) { node_t *source = pe_find_node(data_set->nodes, migrate_source); if (source && source->details->online) { native_add_running(rsc, source, data_set); } } } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) { int stop_id = 0; int migrate_id = 0; const char *migrate_source = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_SOURCE); const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); xmlNode *stop_op = find_lrm_op(rsc->id, CRMD_ACTION_STOP, migrate_target, NULL, data_set); xmlNode *migrate_op = find_lrm_op(rsc->id, CRMD_ACTION_MIGRATED, migrate_target, migrate_source, data_set); if (stop_op) { crm_element_value_int(stop_op, XML_LRM_ATTR_CALLID, &stop_id); } if (migrate_op) { crm_element_value_int(migrate_op, XML_LRM_ATTR_CALLID, &migrate_id); } /* Get our state right */ rsc->role = RSC_ROLE_STARTED; /* can be master? */ if (stop_op == NULL || stop_id < migrate_id) { node_t *target = pe_find_node(data_set->nodes, migrate_target); pe_rsc_trace(rsc, "Stop: %p %d, Migrated: %p %d", stop_op, stop_id, migrate_op, migrate_id); if (target && target->details->online) { native_add_running(rsc, target, data_set); } } else if (migrate_op == NULL) { /* Make sure it gets cleaned up, the stop may pre-date the migrate_from */ rsc->dangling_migrations = g_list_prepend(rsc->dangling_migrations, node); } } } static void record_failed_op(xmlNode *op, node_t* node, pe_working_set_t * data_set) { xmlNode *xIter = NULL; const char *op_key = crm_element_value(op, XML_LRM_ATTR_TASK_KEY); if (node->details->online == FALSE) { return; } for (xIter = data_set->failed->children; xIter; xIter = xIter->next) { const char *key = crm_element_value(xIter, XML_LRM_ATTR_TASK_KEY); const char *uname = crm_element_value(xIter, XML_ATTR_UNAME); if(safe_str_eq(op_key, key) && safe_str_eq(uname, node->details->uname)) { crm_trace("Skipping duplicate entry %s on %s", op_key, node->details->uname); return; } } crm_trace("Adding entry %s on %s", op_key, node->details->uname); crm_xml_add(op, XML_ATTR_UNAME, node->details->uname); add_node_copy(data_set->failed, op); } static const char *get_op_key(xmlNode *xml_op) { const char *key = crm_element_value(xml_op, XML_LRM_ATTR_TASK_KEY); if(key == NULL) { key = ID(xml_op); } return key; } static void unpack_rsc_op_failure(resource_t * rsc, node_t * node, int rc, xmlNode * xml_op, xmlNode ** last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set) { int interval = 0; bool is_probe = FALSE; action_t *action = NULL; const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *op_version = crm_element_value(xml_op, XML_ATTR_CRM_VERSION); CRM_ASSERT(rsc); *last_failure = xml_op; crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval); if(interval == 0 && safe_str_eq(task, CRMD_ACTION_STATUS)) { is_probe = TRUE; pe_rsc_trace(rsc, "is a probe: %s", key); } if (rc != PCMK_OCF_NOT_INSTALLED || is_set(data_set->flags, pe_flag_symmetric_cluster)) { crm_warn("Processing failed op %s for %s on %s: %s (%d)", task, rsc->id, node->details->uname, services_ocf_exitcode_str(rc), rc); record_failed_op(xml_op, node, data_set); } else { crm_trace("Processing failed op %s for %s on %s: %s (%d)", task, rsc->id, node->details->uname, services_ocf_exitcode_str(rc), rc); } action = custom_action(rsc, strdup(key), task, NULL, TRUE, FALSE, data_set); if ((action->on_fail <= action_fail_fence && *on_fail < action->on_fail) || (action->on_fail == action_fail_reset_remote && *on_fail <= action_fail_recover) || (action->on_fail == action_fail_restart_container && *on_fail <= action_fail_recover) || (*on_fail == action_fail_restart_container && action->on_fail >= action_fail_migrate)) { pe_rsc_trace(rsc, "on-fail %s -> %s for %s (%s)", fail2text(*on_fail), fail2text(action->on_fail), action->uuid, key); *on_fail = action->on_fail; } if (safe_str_eq(task, CRMD_ACTION_STOP)) { resource_location(rsc, node, -INFINITY, "__stop_fail__", data_set); } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE) || safe_str_eq(task, CRMD_ACTION_MIGRATED)) { unpack_rsc_migration_failure(rsc, node, xml_op, data_set); } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { /* * staying in role=master ends up putting the PE/TE into a loop * setting role=slave is not dangerous because no master will be * promoted until the failed resource has been fully stopped */ if (action->on_fail == action_fail_block) { rsc->role = RSC_ROLE_MASTER; rsc->next_role = RSC_ROLE_STOPPED; } else if(rc == PCMK_OCF_NOT_RUNNING) { rsc->role = RSC_ROLE_STOPPED; } else { crm_warn("Forcing %s to stop after a failed demote action", rsc->id); rsc->role = RSC_ROLE_SLAVE; rsc->next_role = RSC_ROLE_STOPPED; } } else if (compare_version("2.0", op_version) > 0 && safe_str_eq(task, CRMD_ACTION_START)) { crm_warn("Compatibility handling for failed op %s on %s", key, node->details->uname); resource_location(rsc, node, -INFINITY, "__legacy_start__", data_set); } if(is_probe && rc == PCMK_OCF_NOT_INSTALLED) { /* leave stopped */ pe_rsc_trace(rsc, "Leaving %s stopped", rsc->id); rsc->role = RSC_ROLE_STOPPED; } else if (rsc->role < RSC_ROLE_STARTED) { pe_rsc_trace(rsc, "Setting %s active", rsc->id); set_active(rsc); } pe_rsc_trace(rsc, "Resource %s: role=%s, unclean=%s, on_fail=%s, fail_role=%s", rsc->id, role2text(rsc->role), node->details->unclean ? "true" : "false", fail2text(action->on_fail), role2text(action->fail_role)); if (action->fail_role != RSC_ROLE_STARTED && rsc->next_role < action->fail_role) { rsc->next_role = action->fail_role; } if (action->fail_role == RSC_ROLE_STOPPED) { int score = -INFINITY; resource_t *fail_rsc = rsc; if (fail_rsc->parent) { resource_t *parent = uber_parent(fail_rsc); if (pe_rsc_is_clone(parent) && is_not_set(parent->flags, pe_rsc_unique)) { /* for clone and master resources, if a child fails on an operation * with on-fail = stop, all the resources fail. Do this by preventing * the parent from coming up again. */ fail_rsc = parent; } } crm_warn("Making sure %s doesn't come up again", fail_rsc->id); /* make sure it doesn't come up again */ g_hash_table_destroy(fail_rsc->allowed_nodes); fail_rsc->allowed_nodes = node_hash_from_list(data_set->nodes); g_hash_table_foreach(fail_rsc->allowed_nodes, set_node_score, &score); } pe_free_action(action); } static int determine_op_status( resource_t *rsc, int rc, int target_rc, node_t * node, xmlNode * xml_op, enum action_fail_response * on_fail, pe_working_set_t * data_set) { int interval = 0; int result = PCMK_LRM_OP_DONE; const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); bool is_probe = FALSE; CRM_ASSERT(rsc); crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval); if (interval == 0 && safe_str_eq(task, CRMD_ACTION_STATUS)) { is_probe = TRUE; } if (target_rc >= 0 && target_rc != rc) { result = PCMK_LRM_OP_ERROR; pe_rsc_debug(rsc, "%s on %s returned '%s' (%d) instead of the expected value: '%s' (%d)", key, node->details->uname, services_ocf_exitcode_str(rc), rc, services_ocf_exitcode_str(target_rc), target_rc); } /* we could clean this up significantly except for old LRMs and CRMs that * didn't include target_rc and liked to remap status */ switch (rc) { case PCMK_OCF_OK: if (is_probe && target_rc == 7) { result = PCMK_LRM_OP_DONE; set_bit(rsc->flags, pe_rsc_unexpectedly_running); pe_rsc_info(rsc, "Operation %s found resource %s active on %s", task, rsc->id, node->details->uname); /* legacy code for pre-0.6.5 operations */ } else if (target_rc < 0 && interval > 0 && rsc->role == RSC_ROLE_MASTER) { /* catch status ops that return 0 instead of 8 while they * are supposed to be in master mode */ result = PCMK_LRM_OP_ERROR; } break; case PCMK_OCF_NOT_RUNNING: if (is_probe || target_rc == rc || is_not_set(rsc->flags, pe_rsc_managed)) { result = PCMK_LRM_OP_DONE; rsc->role = RSC_ROLE_STOPPED; /* clear any previous failure actions */ *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; } else if (safe_str_neq(task, CRMD_ACTION_STOP)) { result = PCMK_LRM_OP_ERROR; } break; case PCMK_OCF_RUNNING_MASTER: if (is_probe) { result = PCMK_LRM_OP_DONE; pe_rsc_info(rsc, "Operation %s found resource %s active in master mode on %s", task, rsc->id, node->details->uname); } else if (target_rc == rc) { /* nothing to do */ } else if (target_rc >= 0) { result = PCMK_LRM_OP_ERROR; /* legacy code for pre-0.6.5 operations */ } else if (safe_str_neq(task, CRMD_ACTION_STATUS) || rsc->role != RSC_ROLE_MASTER) { result = PCMK_LRM_OP_ERROR; if (rsc->role != RSC_ROLE_MASTER) { crm_err("%s reported %s in master mode on %s", key, rsc->id, node->details->uname); } } rsc->role = RSC_ROLE_MASTER; break; case PCMK_OCF_DEGRADED_MASTER: case PCMK_OCF_FAILED_MASTER: rsc->role = RSC_ROLE_MASTER; result = PCMK_LRM_OP_ERROR; break; case PCMK_OCF_NOT_CONFIGURED: result = PCMK_LRM_OP_ERROR_FATAL; break; case PCMK_OCF_NOT_INSTALLED: case PCMK_OCF_INVALID_PARAM: case PCMK_OCF_INSUFFICIENT_PRIV: case PCMK_OCF_UNIMPLEMENT_FEATURE: if (rc == PCMK_OCF_UNIMPLEMENT_FEATURE && interval > 0) { result = PCMK_LRM_OP_NOTSUPPORTED; break; } else if (pe_can_fence(data_set, node) == FALSE && safe_str_eq(task, CRMD_ACTION_STOP)) { /* If a stop fails and we can't fence, there's nothing else we can do */ pe_proc_err("No further recovery can be attempted for %s: %s action failed with '%s' (%d)", rsc->id, task, services_ocf_exitcode_str(rc), rc); clear_bit(rsc->flags, pe_rsc_managed); set_bit(rsc->flags, pe_rsc_block); } result = PCMK_LRM_OP_ERROR_HARD; break; default: if (result == PCMK_LRM_OP_DONE) { crm_info("Treating %s (rc=%d) on %s as an ERROR", key, rc, node->details->uname); result = PCMK_LRM_OP_ERROR; } } return result; } static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNode *xml_op, pe_working_set_t * data_set) { bool expired = FALSE; time_t last_failure = 0; int interval = 0; int failure_timeout = rsc->failure_timeout; const char *key = get_op_key(xml_op); const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); const char *clear_reason = NULL; /* clearing recurring monitor operation failures automatically * needs to be carefully considered */ if (safe_str_eq(crm_element_value(xml_op, XML_LRM_ATTR_TASK), "monitor") && safe_str_neq(crm_element_value(xml_op, XML_LRM_ATTR_INTERVAL), "0")) { /* TODO, in the future we should consider not clearing recurring monitor * op failures unless the last action for a resource was a "stop" action. * otherwise it is possible that clearing the monitor failure will result * in the resource being in an undeterministic state. * * For now we handle this potential undeterministic condition for remote * node connection resources by not clearing a recurring monitor op failure * until after the node has been fenced. */ if (is_set(data_set->flags, pe_flag_stonith_enabled) && (rsc->remote_reconnect_interval)) { node_t *remote_node = pe_find_node(data_set->nodes, rsc->id); if (remote_node && remote_node->details->remote_was_fenced == 0) { if (strstr(ID(xml_op), "last_failure")) { crm_info("Waiting to clear monitor failure for remote node %s until fencing has occurred", rsc->id); } /* disabling failure timeout for this operation because we believe * fencing of the remote node should occur first. */ failure_timeout = 0; } } } if (failure_timeout > 0) { int last_run = 0; if (crm_element_value_int(xml_op, XML_RSC_OP_LAST_CHANGE, &last_run) == 0) { time_t now = get_effective_time(data_set); if (now > (last_run + failure_timeout)) { expired = TRUE; } } } if (expired) { if (failure_timeout > 0) { int fc = get_failcount_full(node, rsc, &last_failure, FALSE, xml_op, data_set); if(fc) { if (get_failcount_full(node, rsc, &last_failure, TRUE, xml_op, data_set) == 0) { clear_reason = "it expired"; } else { expired = FALSE; } } else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) { /* always clear last failure when reconnect interval is set */ clear_reason = "reconnect interval is set"; } } } else if (strstr(ID(xml_op), "last_failure") && ((strcmp(task, "start") == 0) || (strcmp(task, "monitor") == 0))) { op_digest_cache_t *digest_data = NULL; digest_data = rsc_action_digest_cmp(rsc, xml_op, node, data_set); if (digest_data->rc == RSC_DIGEST_UNKNOWN) { crm_trace("rsc op %s/%s on node %s does not have a op digest to compare against", rsc->id, key, node->details->id); } else if (digest_data->rc != RSC_DIGEST_MATCH) { clear_reason = "resource parameters have changed"; } } if (clear_reason != NULL) { char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0); action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set); add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE); crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s", rsc->id, node->details->uname, clear_reason, clear_op->uuid); } crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval); if(expired && interval == 0 && safe_str_eq(task, CRMD_ACTION_STATUS)) { switch(rc) { case PCMK_OCF_OK: case PCMK_OCF_NOT_RUNNING: case PCMK_OCF_RUNNING_MASTER: case PCMK_OCF_DEGRADED: case PCMK_OCF_DEGRADED_MASTER: /* Don't expire probes that return these values */ expired = FALSE; break; } } return expired; } int get_target_rc(xmlNode *xml_op) { int dummy = 0; int target_rc = 0; char *dummy_string = NULL; const char *key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY); if (key == NULL) { return -1; } decode_transition_key(key, &dummy_string, &dummy, &dummy, &target_rc); free(dummy_string); return target_rc; } static enum action_fail_response get_action_on_fail(resource_t *rsc, const char *key, const char *task, pe_working_set_t * data_set) { int result = action_fail_recover; action_t *action = custom_action(rsc, strdup(key), task, NULL, TRUE, FALSE, data_set); result = action->on_fail; pe_free_action(action); return result; } static void update_resource_state(resource_t * rsc, node_t * node, xmlNode * xml_op, const char * task, int rc, xmlNode * last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set) { gboolean clear_past_failure = FALSE; CRM_ASSERT(rsc); CRM_ASSERT(xml_op); if (rc == PCMK_OCF_NOT_RUNNING) { clear_past_failure = TRUE; } else if (rc == PCMK_OCF_NOT_INSTALLED) { rsc->role = RSC_ROLE_STOPPED; } else if (safe_str_eq(task, CRMD_ACTION_STATUS)) { if (last_failure) { const char *op_key = get_op_key(xml_op); const char *last_failure_key = get_op_key(last_failure); if (safe_str_eq(op_key, last_failure_key)) { clear_past_failure = TRUE; } } if (rsc->role < RSC_ROLE_STARTED) { set_active(rsc); } } else if (safe_str_eq(task, CRMD_ACTION_START)) { rsc->role = RSC_ROLE_STARTED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_STOP)) { rsc->role = RSC_ROLE_STOPPED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_DEMOTE)) { /* Demote from Master does not clear an error */ rsc->role = RSC_ROLE_SLAVE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATED)) { rsc->role = RSC_ROLE_STARTED; clear_past_failure = TRUE; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE)) { unpack_rsc_migration(rsc, node, xml_op, data_set); } else if (rsc->role < RSC_ROLE_STARTED) { pe_rsc_trace(rsc, "%s active on %s", rsc->id, node->details->uname); set_active(rsc); } /* clear any previous failure actions */ if (clear_past_failure) { switch (*on_fail) { case action_fail_stop: case action_fail_fence: case action_fail_migrate: case action_fail_standby: pe_rsc_trace(rsc, "%s.%s is not cleared by a completed stop", rsc->id, fail2text(*on_fail)); break; case action_fail_block: case action_fail_ignore: case action_fail_recover: case action_fail_restart_container: *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; break; case action_fail_reset_remote: if (rsc->remote_reconnect_interval == 0) { /* when reconnect delay is not in use, the connection is allowed * to start again after the remote node is fenced and completely * stopped. Otherwise, with reconnect delay we wait for the failure * to be cleared entirely before reconnected can be attempted. */ *on_fail = action_fail_ignore; rsc->next_role = RSC_ROLE_UNKNOWN; } break; } } } gboolean unpack_rsc_op(resource_t * rsc, node_t * node, xmlNode * xml_op, xmlNode ** last_failure, enum action_fail_response * on_fail, pe_working_set_t * data_set) { int task_id = 0; const char *key = NULL; const char *task = NULL; const char *task_key = NULL; int rc = 0; int status = PCMK_LRM_OP_PENDING-1; int target_rc = get_target_rc(xml_op); int interval = 0; gboolean expired = FALSE; resource_t *parent = rsc; enum action_fail_response failure_strategy = action_fail_recover; CRM_CHECK(rsc != NULL, return FALSE); CRM_CHECK(node != NULL, return FALSE); CRM_CHECK(xml_op != NULL, return FALSE); task_key = get_op_key(xml_op); task = crm_element_value(xml_op, XML_LRM_ATTR_TASK); key = crm_element_value(xml_op, XML_ATTR_TRANSITION_KEY); crm_element_value_int(xml_op, XML_LRM_ATTR_RC, &rc); crm_element_value_int(xml_op, XML_LRM_ATTR_CALLID, &task_id); crm_element_value_int(xml_op, XML_LRM_ATTR_OPSTATUS, &status); crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval); CRM_CHECK(task != NULL, return FALSE); CRM_CHECK(status <= PCMK_LRM_OP_NOT_INSTALLED, return FALSE); CRM_CHECK(status >= PCMK_LRM_OP_PENDING, return FALSE); if (safe_str_eq(task, CRMD_ACTION_NOTIFY)) { /* safe to ignore these */ return TRUE; } if (is_not_set(rsc->flags, pe_rsc_unique)) { parent = uber_parent(rsc); } pe_rsc_trace(rsc, "Unpacking task %s/%s (call_id=%d, status=%d, rc=%d) on %s (role=%s)", task_key, task, task_id, status, rc, node->details->uname, role2text(rsc->role)); if (node->details->unclean) { pe_rsc_trace(rsc, "Node %s (where %s is running) is unclean." " Further action depends on the value of the stop's on-fail attribute", node->details->uname, rsc->id); } if (status == PCMK_LRM_OP_ERROR) { /* Older versions set this if rc != 0 but it's up to us to decide */ status = PCMK_LRM_OP_DONE; } if(status != PCMK_LRM_OP_NOT_INSTALLED) { expired = check_operation_expiry(rsc, node, rc, xml_op, data_set); } /* Degraded results are informational only, re-map them to their error-free equivalents */ if (rc == PCMK_OCF_DEGRADED && safe_str_eq(task, CRMD_ACTION_STATUS)) { rc = PCMK_OCF_OK; /* Add them to the failed list to highlight them for the user */ if ((node->details->shutdown == FALSE) || (node->details->online == TRUE)) { crm_trace("Remapping %d to %d", PCMK_OCF_DEGRADED, PCMK_OCF_OK); record_failed_op(xml_op, node, data_set); } } else if (rc == PCMK_OCF_DEGRADED_MASTER && safe_str_eq(task, CRMD_ACTION_STATUS)) { rc = PCMK_OCF_RUNNING_MASTER; /* Add them to the failed list to highlight them for the user */ if ((node->details->shutdown == FALSE) || (node->details->online == TRUE)) { crm_trace("Remapping %d to %d", PCMK_OCF_DEGRADED_MASTER, PCMK_OCF_RUNNING_MASTER); record_failed_op(xml_op, node, data_set); } } if (expired && target_rc != rc) { const char *magic = crm_element_value(xml_op, XML_ATTR_TRANSITION_MAGIC); pe_rsc_debug(rsc, "Expired operation '%s' on %s returned '%s' (%d) instead of the expected value: '%s' (%d)", key, node->details->uname, services_ocf_exitcode_str(rc), rc, services_ocf_exitcode_str(target_rc), target_rc); if(interval == 0) { crm_notice("Ignoring expired calculated failure %s (rc=%d, magic=%s) on %s", task_key, rc, magic, node->details->uname); goto done; } else if(node->details->online && node->details->unclean == FALSE) { crm_notice("Re-initiated expired calculated failure %s (rc=%d, magic=%s) on %s", task_key, rc, magic, node->details->uname); /* This is SO horrible, but we don't have access to CancelXmlOp() yet */ crm_xml_add(xml_op, XML_LRM_ATTR_RESTART_DIGEST, "calculated-failure-timeout"); goto done; } } if(status == PCMK_LRM_OP_DONE || status == PCMK_LRM_OP_ERROR) { status = determine_op_status(rsc, rc, target_rc, node, xml_op, on_fail, data_set); } pe_rsc_trace(rsc, "Handling status: %d", status); switch (status) { case PCMK_LRM_OP_CANCELLED: /* do nothing?? */ pe_err("Don't know what to do for cancelled ops yet"); break; case PCMK_LRM_OP_PENDING: if (safe_str_eq(task, CRMD_ACTION_START)) { set_bit(rsc->flags, pe_rsc_start_pending); set_active(rsc); } else if (safe_str_eq(task, CRMD_ACTION_PROMOTE)) { rsc->role = RSC_ROLE_MASTER; } else if (safe_str_eq(task, CRMD_ACTION_MIGRATE) && node->details->unclean) { /* If a pending migrate_to action is out on a unclean node, * we have to force the stop action on the target. */ const char *migrate_target = crm_element_value(xml_op, XML_LRM_ATTR_MIGRATE_TARGET); node_t *target = pe_find_node(data_set->nodes, migrate_target); if (target) { stop_action(rsc, target, FALSE); } } if (rsc->pending_task == NULL) { if (safe_str_eq(task, CRMD_ACTION_STATUS) && interval == 0) { /* Pending probes are not printed, even if pending * operations are requested. If someone ever requests that * behavior, uncomment this and the corresponding part of * native.c:native_pending_task(). */ /*rsc->pending_task = strdup("probe");*/ } else { rsc->pending_task = strdup(task); } } break; case PCMK_LRM_OP_DONE: pe_rsc_trace(rsc, "%s/%s completed on %s", rsc->id, task, node->details->uname); update_resource_state(rsc, node, xml_op, task, rc, *last_failure, on_fail, data_set); break; case PCMK_LRM_OP_NOT_INSTALLED: failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); if (failure_strategy == action_fail_ignore) { crm_warn("Cannot ignore failed %s (status=%d, rc=%d) on %s: " "Resource agent doesn't exist", task_key, status, rc, node->details->uname); /* Also for printing it as "FAILED" by marking it as pe_rsc_failed later */ *on_fail = action_fail_migrate; } resource_location(parent, node, -INFINITY, "hard-error", data_set); unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set); break; case PCMK_LRM_OP_ERROR: case PCMK_LRM_OP_ERROR_HARD: case PCMK_LRM_OP_ERROR_FATAL: case PCMK_LRM_OP_TIMEOUT: case PCMK_LRM_OP_NOTSUPPORTED: failure_strategy = get_action_on_fail(rsc, task_key, task, data_set); if ((failure_strategy == action_fail_ignore) || (failure_strategy == action_fail_restart_container && safe_str_eq(task, CRMD_ACTION_STOP))) { crm_warn("Pretending the failure of %s (rc=%d) on %s succeeded", task_key, rc, node->details->uname); update_resource_state(rsc, node, xml_op, task, target_rc, *last_failure, on_fail, data_set); crm_xml_add(xml_op, XML_ATTR_UNAME, node->details->uname); set_bit(rsc->flags, pe_rsc_failure_ignored); record_failed_op(xml_op, node, data_set); if (failure_strategy == action_fail_restart_container && *on_fail <= action_fail_recover) { *on_fail = failure_strategy; } } else { unpack_rsc_op_failure(rsc, node, rc, xml_op, last_failure, on_fail, data_set); if(status == PCMK_LRM_OP_ERROR_HARD) { do_crm_log(rc != PCMK_OCF_NOT_INSTALLED?LOG_ERR:LOG_NOTICE, "Preventing %s from re-starting on %s: operation %s failed '%s' (%d)", parent->id, node->details->uname, task, services_ocf_exitcode_str(rc), rc); resource_location(parent, node, -INFINITY, "hard-error", data_set); } else if(status == PCMK_LRM_OP_ERROR_FATAL) { crm_err("Preventing %s from re-starting anywhere: operation %s failed '%s' (%d)", parent->id, task, services_ocf_exitcode_str(rc), rc); resource_location(parent, NULL, -INFINITY, "fatal-error", data_set); } } break; } done: pe_rsc_trace(rsc, "Resource %s after %s: role=%s, next=%s", rsc->id, task, role2text(rsc->role), role2text(rsc->next_role)); return TRUE; } gboolean add_node_attrs(xmlNode * xml_obj, node_t * node, gboolean overwrite, pe_working_set_t * data_set) { const char *cluster_name = NULL; g_hash_table_insert(node->details->attrs, strdup("#uname"), strdup(node->details->uname)); g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_ID), strdup(node->details->id)); if (safe_str_eq(node->details->id, data_set->dc_uuid)) { data_set->dc_node = node; node->details->is_dc = TRUE; g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_DC), strdup(XML_BOOLEAN_TRUE)); } else { g_hash_table_insert(node->details->attrs, strdup("#" XML_ATTR_DC), strdup(XML_BOOLEAN_FALSE)); } cluster_name = g_hash_table_lookup(data_set->config_hash, "cluster-name"); if (cluster_name) { g_hash_table_insert(node->details->attrs, strdup("#cluster-name"), strdup(cluster_name)); } unpack_instance_attributes(data_set->input, xml_obj, XML_TAG_ATTR_SETS, NULL, node->details->attrs, NULL, overwrite, data_set->now); if (g_hash_table_lookup(node->details->attrs, "#site-name") == NULL) { const char *site_name = g_hash_table_lookup(node->details->attrs, "site-name"); if (site_name) { /* Prefix '#' to the key */ g_hash_table_insert(node->details->attrs, strdup("#site-name"), strdup(site_name)); } else if (cluster_name) { /* Default to cluster-name if unset */ g_hash_table_insert(node->details->attrs, strdup("#site-name"), strdup(cluster_name)); } } return TRUE; } static GListPtr extract_operations(const char *node, const char *rsc, xmlNode * rsc_entry, gboolean active_filter) { int counter = -1; int stop_index = -1; int start_index = -1; xmlNode *rsc_op = NULL; GListPtr gIter = NULL; GListPtr op_list = NULL; GListPtr sorted_op_list = NULL; /* extract operations */ op_list = NULL; sorted_op_list = NULL; for (rsc_op = __xml_first_child(rsc_entry); rsc_op != NULL; rsc_op = __xml_next_element(rsc_op)) { if (crm_str_eq((const char *)rsc_op->name, XML_LRM_TAG_RSC_OP, TRUE)) { crm_xml_add(rsc_op, "resource", rsc); crm_xml_add(rsc_op, XML_ATTR_UNAME, node); op_list = g_list_prepend(op_list, rsc_op); } } if (op_list == NULL) { /* if there are no operations, there is nothing to do */ return NULL; } sorted_op_list = g_list_sort(op_list, sort_op_by_callid); /* create active recurring operations as optional */ if (active_filter == FALSE) { return sorted_op_list; } op_list = NULL; calculate_active_ops(sorted_op_list, &start_index, &stop_index); for (gIter = sorted_op_list; gIter != NULL; gIter = gIter->next) { xmlNode *rsc_op = (xmlNode *) gIter->data; counter++; if (start_index < stop_index) { crm_trace("Skipping %s: not active", ID(rsc_entry)); break; } else if (counter < start_index) { crm_trace("Skipping %s: old", ID(rsc_op)); continue; } op_list = g_list_append(op_list, rsc_op); } g_list_free(sorted_op_list); return op_list; } GListPtr find_operations(const char *rsc, const char *node, gboolean active_filter, pe_working_set_t * data_set) { GListPtr output = NULL; GListPtr intermediate = NULL; xmlNode *tmp = NULL; xmlNode *status = find_xml_node(data_set->input, XML_CIB_TAG_STATUS, TRUE); node_t *this_node = NULL; xmlNode *node_state = NULL; for (node_state = __xml_first_child(status); node_state != NULL; node_state = __xml_next_element(node_state)) { if (crm_str_eq((const char *)node_state->name, XML_CIB_TAG_STATE, TRUE)) { const char *uname = crm_element_value(node_state, XML_ATTR_UNAME); if (node != NULL && safe_str_neq(uname, node)) { continue; } this_node = pe_find_node(data_set->nodes, uname); if(this_node == NULL) { CRM_LOG_ASSERT(this_node != NULL); continue; } else if (is_remote_node(this_node)) { determine_remote_online_status(data_set, this_node); } else { determine_online_status(node_state, this_node, data_set); } if (this_node->details->online || is_set(data_set->flags, pe_flag_stonith_enabled)) { /* offline nodes run no resources... * unless stonith is enabled in which case we need to * make sure rsc start events happen after the stonith */ xmlNode *lrm_rsc = NULL; tmp = find_xml_node(node_state, XML_CIB_TAG_LRM, FALSE); tmp = find_xml_node(tmp, XML_LRM_TAG_RESOURCES, FALSE); for (lrm_rsc = __xml_first_child(tmp); lrm_rsc != NULL; lrm_rsc = __xml_next_element(lrm_rsc)) { if (crm_str_eq((const char *)lrm_rsc->name, XML_LRM_TAG_RESOURCE, TRUE)) { const char *rsc_id = crm_element_value(lrm_rsc, XML_ATTR_ID); if (rsc != NULL && safe_str_neq(rsc_id, rsc)) { continue; } intermediate = extract_operations(uname, rsc_id, lrm_rsc, active_filter); output = g_list_concat(output, intermediate); } } } } } return output; } diff --git a/pengine/container.c b/pengine/container.c index 1623861ff0..3da19aa9c8 100644 --- a/pengine/container.c +++ b/pengine/container.c @@ -1,375 +1,378 @@ /* * Copyright (C) 2004 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include #include #include #include #include #include #define VARIANT_CONTAINER 1 #include static bool is_child_container_node(container_variant_data_t *data, pe_node_t *node) { for (GListPtr gIter = data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; if(node->details == tuple->node->details) { return TRUE; } } return FALSE; } gint sort_clone_instance(gconstpointer a, gconstpointer b, gpointer data_set); void distribute_children(resource_t *rsc, GListPtr children, GListPtr nodes, int max, int per_host_max, pe_working_set_t * data_set); node_t * container_color(resource_t * rsc, node_t * prefer, pe_working_set_t * data_set) { GListPtr containers = NULL; GListPtr nodes = NULL; container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return NULL); get_container_variant_data(container_data, rsc); set_bit(rsc->flags, pe_rsc_allocating); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; containers = g_list_append(containers, tuple->docker); } dump_node_scores(show_scores ? 0 : scores_log_level, rsc, __FUNCTION__, rsc->allowed_nodes); nodes = g_hash_table_get_values(rsc->allowed_nodes); nodes = g_list_sort_with_data(nodes, sort_node_weight, NULL); containers = g_list_sort_with_data(containers, sort_clone_instance, data_set); distribute_children(rsc, containers, nodes, container_data->replicas, container_data->replicas_per_host, data_set); g_list_free(nodes); g_list_free(containers); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->ip) { tuple->ip->cmds->allocate(tuple->ip, prefer, data_set); } if(tuple->remote) { tuple->remote->cmds->allocate(tuple->remote, prefer, data_set); } // Explicitly allocate tuple->child before the container->child if(tuple->child) { pe_node_t *node = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, tuple->child->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) { if(node->details != tuple->node->details) { node->weight = -INFINITY; } else { node->weight = INFINITY; } } set_bit(tuple->child->parent->flags, pe_rsc_allocating); tuple->child->cmds->allocate(tuple->child, tuple->node, data_set); clear_bit(tuple->child->parent->flags, pe_rsc_allocating); } } if(container_data->child) { pe_node_t *node = NULL; GHashTableIter iter; g_hash_table_iter_init(&iter, container_data->child->allowed_nodes); while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & node)) { if(is_child_container_node(container_data, node)) { node->weight = 0; } else { node->weight = -INFINITY; } } container_data->child->cmds->allocate(container_data->child, prefer, data_set); } clear_bit(rsc->flags, pe_rsc_allocating); return NULL; } void container_create_actions(resource_t * rsc, pe_working_set_t * data_set) { container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return); get_container_variant_data(container_data, rsc); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->ip) { tuple->ip->cmds->create_actions(tuple->ip, data_set); } if(tuple->docker) { tuple->docker->cmds->create_actions(tuple->docker, data_set); } if(tuple->remote) { tuple->remote->cmds->create_actions(tuple->remote, data_set); } } if(container_data->child) { container_data->child->cmds->create_actions(container_data->child, data_set); } } void container_internal_constraints(resource_t * rsc, pe_working_set_t * data_set) { container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return); get_container_variant_data(container_data, rsc); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { char *id = NULL; container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->docker) { tuple->docker->cmds->internal_constraints(tuple->docker, data_set); } if(tuple->ip) { tuple->ip->cmds->internal_constraints(tuple->ip, data_set); // Start ip then docker new_rsc_order(tuple->ip, RSC_START, tuple->docker, RSC_START, pe_order_runnable_left, data_set); new_rsc_order(tuple->docker, RSC_STOP, tuple->ip, RSC_STOP, pe_order_implies_first, data_set); id = crm_strdup_printf("%s-ip-with-docker-%d", rsc->id, tuple->offset); rsc_colocation_new(id, NULL, INFINITY, tuple->ip, tuple->docker, NULL, NULL, data_set); free(id); } if(tuple->remote) { tuple->remote->cmds->internal_constraints(tuple->remote, data_set); // Start docker then remote new_rsc_order( tuple->docker, RSC_START, tuple->remote, RSC_START, pe_order_runnable_left, data_set); new_rsc_order( tuple->remote, RSC_STOP, tuple->docker, RSC_STOP, pe_order_implies_first, data_set); if(tuple->ip) { id = crm_strdup_printf("%s-remote-with-ip-%d", rsc->id, tuple->offset); rsc_colocation_new(id, NULL, INFINITY, tuple->remote, tuple->ip, NULL, NULL, data_set); free(id); } else { id = crm_strdup_printf("%s-remote-with-docker-%d", rsc->id, tuple->offset); rsc_colocation_new(id, NULL, INFINITY, tuple->remote, tuple->docker, NULL, NULL, data_set); free(id); } } if(tuple->child) { CRM_ASSERT(tuple->remote); // Start of the remote then child is implicit in the PE's remote logic } } if(container_data->child) { container_data->child->cmds->internal_constraints(container_data->child, data_set); } } void container_rsc_colocation_lh(resource_t * rsc_lh, resource_t * rsc_rh, rsc_colocation_t * constraint) { pe_err("Container %s cannot be colocated with anything", rsc_lh->id); } void container_rsc_colocation_rh(resource_t * rsc_lh, resource_t * rsc_rh, rsc_colocation_t * constraint) { pe_err("Container %s cannot be colocated with anything", rsc_rh->id); } enum pe_action_flags container_action_flags(action_t * action, node_t * node) { enum pe_action_flags flags = (pe_action_optional | pe_action_runnable | pe_action_pseudo); return flags; } enum pe_graph_flags container_update_actions(action_t * first, action_t * then, node_t * node, enum pe_action_flags flags, enum pe_action_flags filter, enum pe_ordering type) { enum pe_graph_flags changed = pe_graph_none; return changed; } void container_rsc_location(resource_t * rsc, rsc_to_node_t * constraint) { GListPtr gIter = rsc->children; pe_rsc_trace(rsc, "Processing location constraint %s for %s", constraint->id, rsc->id); native_rsc_location(rsc, constraint); for (; gIter != NULL; gIter = gIter->next) { resource_t *child_rsc = (resource_t *) gIter->data; child_rsc->cmds->rsc_location(child_rsc, constraint); } } void container_expand(resource_t * rsc, pe_working_set_t * data_set) { container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return); get_container_variant_data(container_data, rsc); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->ip) { tuple->ip->cmds->expand(tuple->ip, data_set); } if(tuple->child) { tuple->child->cmds->expand(tuple->child, data_set); } if(tuple->docker) { tuple->docker->cmds->expand(tuple->docker, data_set); } if(tuple->remote) { tuple->remote->cmds->expand(tuple->remote, data_set); } } } gboolean container_create_probe(resource_t * rsc, node_t * node, action_t * complete, gboolean force, pe_working_set_t * data_set) { bool any_created = FALSE; container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return FALSE); get_container_variant_data(container_data, rsc); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->ip) { any_created |= tuple->ip->cmds->create_probe(tuple->ip, node, complete, force, data_set); } if(tuple->child && node->details == tuple->node->details) { any_created |= tuple->child->cmds->create_probe(tuple->child, node, complete, force, data_set); } if(tuple->docker) { bool created = tuple->docker->cmds->create_probe(tuple->docker, node, complete, force, data_set); if(created) { any_created = TRUE; /* If we're limited to one replica per host (due to * the lack of an IP range probably), then we don't * want any of our peer containers starting until * we've established that no other copies are already * running. * * Partly this is to ensure that replicas_per_host is * observed, but also to ensure that the containers * don't fail to start because the necessary port - * mappings (which wont include an IP for uniqueness) + * mappings (which won't include an IP for uniqueness) * are already taken */ for (GListPtr tIter = container_data->tuples; tIter != NULL && container_data->replicas_per_host == 1; tIter = tIter->next) { container_grouping_t *other = (container_grouping_t *)tIter->data; - if(other != tuple) { + + if ((other != tuple) && (other != NULL) + && (other->docker != NULL)) { + custom_action_order(tuple->docker, generate_op_key(tuple->docker->id, RSC_STATUS, 0), NULL, other->docker, generate_op_key(other->docker->id, RSC_START, 0), NULL, pe_order_optional, data_set); } } } } if(FALSE && tuple->remote) { // TODO: Needed? any_created |= tuple->remote->cmds->create_probe(tuple->remote, node, complete, force, data_set); } } return any_created; } void container_append_meta(resource_t * rsc, xmlNode * xml) { } GHashTable * container_merge_weights(resource_t * rsc, const char *rhs, GHashTable * nodes, const char *attr, float factor, enum pe_weights flags) { return rsc_merge_weights(rsc, rhs, nodes, attr, factor, flags); } void container_LogActions( resource_t * rsc, pe_working_set_t * data_set, gboolean terminal) { container_variant_data_t *container_data = NULL; CRM_CHECK(rsc != NULL, return); get_container_variant_data(container_data, rsc); for (GListPtr gIter = container_data->tuples; gIter != NULL; gIter = gIter->next) { container_grouping_t *tuple = (container_grouping_t *)gIter->data; CRM_ASSERT(tuple); if(tuple->ip) { LogActions(tuple->ip, data_set, terminal); } if(tuple->docker) { LogActions(tuple->docker, data_set, terminal); } if(tuple->remote) { LogActions(tuple->remote, data_set, terminal); } if(tuple->child) { LogActions(tuple->child, data_set, terminal); } } } diff --git a/xml/Readme.md b/xml/Readme.md index fdcf2aef7a..0db2731884 100644 --- a/xml/Readme.md +++ b/xml/Readme.md @@ -1,85 +1,85 @@ # Schema Reference Besides the version of Pacemaker itself, the XML schema of the Pacemaker configuration has its own version. ## Versioned Schema Evolution A versioned schema offers transparent backward/forward compatibility. - It reflects the timeline of schema-backed features (introduction, changes to the syntax, possibly deprecation) through the versioned stable schema increments, while keeping schema versions used by default by older Pacemaker versions untouched. - Pacemaker internally uses the latest stable schema version, and relies on supplemental transformations to promote cluster configurations based on older, incompatible schema versions into the desired form. - It allows experimental features with a possibly unstable configuration interface to be developed using the special `next` version of the schema. ## Mapping Pacemaker Versions to Schema Versions | Pacemaker | Latest Schema | Changed | --------- | ------------- | ---------------------------------------------- | `1.1.16` | `2.6` | `constraints` | `1.1.15` | `2.5` | `alerts` | `1.1.14` | `2.4` | `fencing` | `1.1.13` | `2.3` | `constraints` | `1.1.12` | `2.0` | `nodes`, `nvset`, `resources`, `tags` + `acls` | `1.1.8`+ | `1.2` | # Updating schema files # ## Experimental features ## Experimental features go into `${base}-next.rng` Create from the most recent `${base}-${X}.${Y}.rng` if it does not already exist ## Stable features ## The current stable version is determined at runtime when __xml_build_schema_list() interrogates the CRM_DTD_DIRECTORY. It will have the form `pacemaker-${X}.${Y}` and the highest `${X}.${Y}` wins. ### Simple Additions When the new syntax is a simple addition to the previous one, create a new entry with `${Y} = ${Yold} + 1` ### Feature Removal or otherwise Incompatible Changes When the new syntax is not a simple addition to the previous one, create a new entry with `${X} = ${Xold} + 1` and `${Y} = 0`. An XSLT file is also required that converts an old syntax to the new one and must be named `upgrade-${Xold}.${Yold}.xsl`. See `xml/upgrade06.xsl` for an example. -### General Proceedure +### General Procedure 1. Copy the most recent version of `${base}-*.rng` to `${base}-${X}.${Y}.rng` 1. Commit the copy, eg. `"Clone the latest ${base} schema in preparation for changes"`. This way the actual change will be obvious in the commit history. 1. Modify `${base}-${X}.${Y}.rng` as required 1. Add an XSLT file if required and update `xslt_SCRIPTS` in `xml/Makefile.am` 1. Commit ## Admin Tasks New features will not be available until the admin 1. Updates all the nodes 1. Runs the equivalent of `cibadmin --upgrade` ## Random Notes From the source directory, run `make -C xml diff` to see the changes in the current schema (compared to the previous ones) and also the pending changes in `pacemaker-next`. Alternatively, if the intention is to grok the overall historical schema evolution, use `make -C xml fulldiff`. diff --git a/xml/crm_mon.rng b/xml/crm_mon.rng index 653c15e9d1..731c118615 100644 --- a/xml/crm_mon.rng +++ b/xml/crm_mon.rng @@ -1,301 +1,327 @@ unknown unknown member remote ping + + + + + + + + + docker + + + + + + + + + + + + + + + + +