diff --git a/cts/lab/patterns.py b/cts/lab/patterns.py index 7ba1f6de2a..495bf0f0dd 100644 --- a/cts/lab/patterns.py +++ b/cts/lab/patterns.py @@ -1,414 +1,415 @@ """ Pattern-holding classes for Pacemaker's Cluster Test Suite (CTS) """ __copyright__ = "Copyright 2008-2022 the Pacemaker project contributors" __license__ = "GNU General Public License version 2 or later (GPLv2+) WITHOUT ANY WARRANTY" import sys, os from cts.CTSvars import * patternvariants = {} class BasePatterns(object): def __init__(self, name): self.name = name patternvariants[name] = self self.ignore = [ "avoid confusing Valgrind", # Logging bug in some versions of libvirtd r"libvirtd.*: internal error: Failed to parse PCI config address", # pcs can log this when node is fenced, but fencing is OK in some # tests (and we will catch it in pacemaker logs when not OK) r"pcs.daemon:No response from: .* request: get_configs, error:", ] self.BadNews = [] self.components = {} self.commands = { "StatusCmd" : "crmadmin -t 60 -S %s 2>/dev/null", "CibQuery" : "cibadmin -Ql", "CibAddXml" : "cibadmin --modify -c --xml-text %s", "CibDelXpath" : "cibadmin --delete --xpath %s", # 300,000 == 5 minutes "RscRunning" : CTSvars.CRM_DAEMON_DIR + "/cts-exec-helper -R -r %s", "CIBfile" : "%s:"+CTSvars.CRM_CONFIG_DIR+"/cib.xml", "TmpDir" : "/tmp", "BreakCommCmd" : "iptables -A INPUT -s %s -j DROP >/dev/null 2>&1", "FixCommCmd" : "iptables -D INPUT -s %s -j DROP >/dev/null 2>&1", # tc qdisc add dev lo root handle 1: cbq avpkt 1000 bandwidth 1000mbit # tc class add dev lo parent 1: classid 1:1 cbq rate "$RATE"kbps allot 17000 prio 5 bounded isolated # tc filter add dev lo parent 1: protocol ip prio 16 u32 match ip dst 127.0.0.1 match ip sport $PORT 0xFFFF flowid 1:1 # tc qdisc add dev lo parent 1: netem delay "$LATENCY"msec "$(($LATENCY/4))"msec 10% 2> /dev/null > /dev/null "ReduceCommCmd" : "", "RestoreCommCmd" : "tc qdisc del dev lo root", "MaintenanceModeOn" : "cibadmin --modify -c --xml-text ''", "MaintenanceModeOff" : "cibadmin --delete --xpath \"//nvpair[@name='maintenance-mode']\"", "StandbyCmd" : "crm_attribute -Vq -U %s -n standby -l forever -v %s 2>/dev/null", "StandbyQueryCmd" : "crm_attribute -qG -U %s -n standby -l forever -d off 2>/dev/null", } self.search = { "Pat:DC_IDLE" : "pacemaker-controld.*State transition.*-> S_IDLE", # This won't work if we have multiple partitions "Pat:Local_started" : "%s\W.*controller successfully started", "Pat:NonDC_started" : r"%s\W.*State transition.*-> S_NOT_DC", "Pat:DC_started" : r"%s\W.*State transition.*-> S_IDLE", "Pat:We_stopped" : "%s\W.*OVERRIDE THIS PATTERN", "Pat:They_stopped" : "%s\W.*LOST:.* %s ", "Pat:They_dead" : "node %s.*: is dead", "Pat:They_up" : "%s %s\W.*OVERRIDE THIS PATTERN", "Pat:TransitionComplete" : "Transition status: Complete: complete", "Pat:Fencing_start" : r"Requesting peer fencing .* targeting %s", "Pat:Fencing_ok" : r"pacemaker-fenced.*:\s*Operation .* targeting %s by .* for .*@.*: OK", "Pat:Fencing_recover" : r"pacemaker-schedulerd.*: Recover %s", "Pat:Fencing_active" : r"stonith resource .* is active on 2 nodes (attempting recovery)", "Pat:Fencing_probe" : r"pacemaker-controld.* Result of probe operation for %s on .*: Error", "Pat:RscOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s.*: (0 \()?ok", "Pat:RscOpFail" : r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of %s ", "Pat:CloneOpFail" : r"pacemaker-schedulerd.*:.*Unexpected result .* recorded for %s of (%s|%s) ", "Pat:RscRemoteOpOK" : r"pacemaker-controld.*:\s+Result of %s operation for %s on %s: (0 \()?ok", "Pat:NodeFenced" : r"pacemaker-controld.*:\s* Peer %s was terminated \(.*\) by .* on behalf of .*: OK", } def get_component(self, key): if key in self.components: return self.components[key] print("Unknown component '%s' for %s" % (key, self.name)) return [] def get_patterns(self, key): if key == "BadNews": return self.BadNews elif key == "BadNewsIgnore": return self.ignore elif key == "Commands": return self.commands elif key == "Search": return self.search elif key == "Components": return self.components def __getitem__(self, key): if key == "Name": return self.name elif key in self.commands: return self.commands[key] elif key in self.search: return self.search[key] else: print("Unknown template '%s' for %s" % (key, self.name)) return None class crm_corosync(BasePatterns): ''' Patterns for Corosync version 2 cluster manager class ''' def __init__(self, name): BasePatterns.__init__(self, name) self.commands.update({ "StartCmd" : "service corosync start && service pacemaker start", "StopCmd" : "service pacemaker stop; [ ! -e /usr/sbin/pacemaker-remoted ] || service pacemaker_remote stop; service corosync stop", "EpochCmd" : "crm_node -e", "QuorumCmd" : "crm_node -q", "PartitionCmd" : "crm_node -p", }) self.search.update({ # Close enough ... "Corosync Cluster Engine exiting normally" isn't # printed reliably. "Pat:We_stopped" : "%s\W.*Unloading all Corosync service engines", "Pat:They_stopped" : "%s\W.*pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_dead" : "pacemaker-controld.*Node %s(\[|\s).*state is now lost", "Pat:They_up" : "\W%s\W.*pacemaker-controld.*Node %s state is now member", "Pat:ChildExit" : r"\[[0-9]+\] exited with status [0-9]+ \(", # "with signal 9" == pcmk_child_exit(), "$" == check_active_before_startup_processes() "Pat:ChildKilled" : r"%s\W.*pacemakerd.*%s\[[0-9]+\] terminated( with signal 9|$)", "Pat:ChildRespawn" : "%s\W.*pacemakerd.*Respawning %s subdaemon after unexpected exit", "Pat:InfraUp" : "%s\W.*corosync.*Initializing transport", "Pat:PacemakerUp" : "%s\W.*pacemakerd.*Starting Pacemaker", }) self.ignore = self.ignore + [ r"crm_mon:", r"crmadmin:", r"update_trace_data", r"async_notify:.*strange, client not found", r"Parse error: Ignoring unknown option .*nodename", r"error.*: Operation 'reboot' .* using FencingFail returned ", r"getinfo response error: 1$", r"sbd.* error: inquisitor_child: DEBUG MODE IS ACTIVE", r"sbd.* pcmk:\s*error:.*Connection to cib_ro.* (failed|closed)", ] self.BadNews = [ r"[^(]error:", r"crit:", r"ERROR:", r"CRIT:", r"Shutting down...NOW", r"Timer I_TERMINATE just popped", r"input=I_ERROR", r"input=I_FAIL", r"input=I_INTEGRATED cause=C_TIMER_POPPED", r"input=I_FINALIZED cause=C_TIMER_POPPED", r"input=I_ERROR", r"(pacemakerd|pacemaker-execd|pacemaker-controld):.*, exiting", r"schedulerd.*Attempting recovery of resource", r"is taking more than 2x its timeout", r"Confirm not received from", r"Welcome reply not received from", r"Attempting to schedule .* after a stop", r"Resource .* was active at shutdown", r"duplicate entries for call_id", r"Search terminated:", r":global_timer_callback", r"Faking parameter digest creation", r"Parameters to .* action changed:", r"Parameters to .* changed", r"pacemakerd.*\[[0-9]+\] terminated( with signal| as IPC server|$)", r"pacemaker-schedulerd.*Recover .*\(.* -\> .*\)", r"rsyslogd.* imuxsock lost .* messages from pid .* due to rate-limiting", r"Peer is not part of our cluster", r"We appear to be in an election loop", r"Unknown node -> we will not deliver message", r"(Blackbox dump requested|Problem detected)", r"pacemakerd.*Could not connect to Cluster Configuration Database API", r"Receiving messages from a node we think is dead", r"share the same cluster nodeid", r"share the same name", #r"crm_ipc_send:.*Request .* failed", #r"crm_ipc_send:.*Sending to .* is disabled until pending reply is received", # Not inherently bad, but worth tracking #r"No need to invoke the TE", #r"ping.*: DEBUG: Updated connected = 0", #r"Digest mis-match:", r"pacemaker-controld:.*Transition failed: terminated", r"Local CIB .* differs from .*:", r"warn.*:\s*Continuing but .* will NOT be used", r"warn.*:\s*Cluster configuration file .* is corrupt", #r"Executing .* fencing operation", r"Election storm", r"stalled the FSA with pending inputs", ] self.components["common-ignore"] = [ r"Pending action:", r"resource( was|s were) active at shutdown", r"pending LRM operations at shutdown", r"Lost connection to the CIB manager", r"pacemaker-controld.*:\s*Action A_RECOVER .* not supported", r"pacemaker-controld.*:\s*Performing A_EXIT_1 - forcefully exiting ", r".*:\s*Requesting fencing \([^)]+\) of node ", r"(Blackbox dump requested|Problem detected)", ] self.components["corosync-ignore"] = [ r"Could not connect to Corosync CFG: CS_ERR_LIBRARY", r"error:.*Connection to the CPG API failed: Library error", r"\[[0-9]+\] exited with status [0-9]+ \(", r"\[[0-9]+\] terminated with signal 15", r"pacemaker-based.*error:.*Corosync connection lost", r"pacemaker-fenced.*error:.*Corosync connection terminated", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*error:.*Input (I_ERROR|I_TERMINATE ) .*received in state", r"pacemaker-controld.*error:.*Could not recover from internal error", r"error:.*Connection to cib_(shm|rw).* (failed|closed)", r"error:.*cib_(shm|rw) IPC provider disconnected while waiting", r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"crit: Fencing daemon connection failed", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on # other causes of a transition error logging their own error # message, which is the usual practice. r"pacemaker-schedulerd.* Calculated transition .*/pe-error", ] self.components["corosync"] = [ # We expect each daemon to lose its cluster connection. # However, if the CIB manager loses its connection first, # it's possible for another daemon to lose that connection and # exit before losing the cluster connection. r"pacemakerd.*:\s*warning:.*Lost connection to cluster layer", r"pacemaker-attrd.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-based.*:\s*(crit|error):.*Lost connection to cluster layer", r"pacemaker-controld.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"pacemaker-fenced.*:\s*(crit|error):.*Lost connection to (cluster layer|the CIB manager)", r"schedulerd.*Scheduling node .* for fencing", r"pacemaker-controld.*:\s*Peer .* was terminated \(.*\) by .* on behalf of .*:\s*OK", ] self.components["pacemaker-based"] = [ r"pacemakerd.* pacemaker-attrd\[[0-9]+\] exited with status 102", r"pacemakerd.* pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning pacemaker-attrd subdaemon after unexpected exit", r"pacemakerd.* Respawning pacemaker-based subdaemon after unexpected exit", r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit", r"pacemakerd.* Respawning pacemaker-fenced subdaemon after unexpected exit", r"pacemaker-.* Connection to cib_.* (failed|closed)", r"pacemaker-attrd.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*:.*Lost connection to the CIB manager", r"pacemaker-controld.*I_ERROR.*crmd_cib_connection_destroy", r"pacemaker-controld.* State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", ] self.components["pacemaker-based-ignore"] = [ r"pacemaker-execd.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error (Lost connection to fencer)", + r"pacemaker-controld.*:Could not connect to attrd: Connection refused", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on # other causes of a transition error logging their own error # message, which is the usual practice. r"pacemaker-schedulerd.* Calculated transition .*/pe-error", ] self.components["pacemaker-execd"] = [ r"pacemaker-controld.*Connection to executor failed", r"pacemaker-controld.*I_ERROR.*lrm_connection_destroy", r"pacemaker-controld.*State transition .* S_RECOVERY", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", r"pacemaker-controld.*Could not recover from internal error", r"pacemakerd.*pacemaker-controld\[[0-9]+\] exited with status 1", r"pacemakerd.* Respawning pacemaker-execd subdaemon after unexpected exit", r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit", ] self.components["pacemaker-execd-ignore"] = [ r"pacemaker-(attrd|controld).*Connection to lrmd.* (failed|closed)", r"pacemaker-(attrd|controld).*Could not execute alert", ] self.components["pacemaker-controld"] = [ # "WARN: determine_online_status: Node .* is unclean", # "Scheduling node .* for fencing", # Only if the node wasn't the DC: "State transition S_IDLE", "State transition .* -> S_IDLE", ] self.components["pacemaker-controld-ignore"] = [] self.components["pacemaker-attrd"] = [] self.components["pacemaker-attrd-ignore"] = [] self.components["pacemaker-schedulerd"] = [ "State transition .* S_RECOVERY", r"pacemakerd.* Respawning pacemaker-controld subdaemon after unexpected exit", r"pacemaker-controld\[[0-9]+\] exited with status 1 \(", r"Connection to the scheduler failed", "pacemaker-controld.*I_ERROR.*save_cib_contents", r"pacemaker-controld.*: Input I_TERMINATE .*from do_recover", "pacemaker-controld.*Could not recover from internal error", ] self.components["pacemaker-schedulerd-ignore"] = [ r"Connection to pengine.* (failed|closed)", ] self.components["pacemaker-fenced"] = [ r"error:.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"Fencing daemon connection failed", r"pacemaker-controld.*Fencer successfully connected", ] self.components["pacemaker-fenced-ignore"] = [ r"(error|warning):.*Connection to (fencer|stonith-ng).* (closed|failed|lost)", r"crit:.*Fencing daemon connection failed", r"error:.*Fencer connection failed \(will retry\)", r"pacemaker-controld.*:\s+Result of .* operation for Fencing.*Error (Lost connection to fencer)", # This is overbroad, but we don't have a way to say that only # certain transition errors are acceptable (if the fencer respawns, # fence devices may appear multiply active). We have to rely on # other causes of a transition error logging their own error # message, which is the usual practice. r"pacemaker-schedulerd.* Calculated transition .*/pe-error", ] self.components["pacemaker-fenced-ignore"].extend(self.components["common-ignore"]) class crm_corosync_docker(crm_corosync): ''' Patterns for Corosync version 2 cluster manager class ''' def __init__(self, name): crm_corosync.__init__(self, name) self.commands.update({ "StartCmd" : "pcmk_start", "StopCmd" : "pcmk_stop", }) class PatternSelector(object): def __init__(self, name=None): self.name = name self.base = BasePatterns("crm-base") if not name: crm_corosync("crm-corosync") elif name == "crm-corosync": crm_corosync(name) elif name == "crm-corosync-docker": crm_corosync_docker(name) def get_variant(self, variant): if variant in patternvariants: return patternvariants[variant] print("defaulting to crm-base for %s" % variant) return self.base def get_patterns(self, variant, kind): return self.get_variant(variant).get_patterns(kind) def get_template(self, variant, key): v = self.get_variant(variant) return v[key] def get_component(self, variant, kind): return self.get_variant(variant).get_component(kind) def __getitem__(self, key): return self.get_template(self.name, key) # python cts/CTSpatt.py -k crm-corosync -t StartCmd if __name__ == '__main__': pdir=os.path.dirname(sys.path[0]) sys.path.insert(0, pdir) # So that things work from the source directory kind=None template=None skipthis=None args=sys.argv[1:] for i in range(0, len(args)): if skipthis: skipthis=None continue elif args[i] == "-k" or args[i] == "--kind": skipthis=1 kind = args[i+1] elif args[i] == "-t" or args[i] == "--template": skipthis=1 template = args[i+1] else: print("Illegal argument " + args[i]) print(PatternSelector(kind)[template]) diff --git a/daemons/pacemakerd/pacemakerd.h b/daemons/pacemakerd/pacemakerd.h index 424dbbcc5d..b2a6864f4e 100644 --- a/daemons/pacemakerd/pacemakerd.h +++ b/daemons/pacemakerd/pacemakerd.h @@ -1,37 +1,35 @@ /* * Copyright 2010-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include #define MAX_RESPAWN 100 extern GMainLoop *mainloop; extern struct qb_ipcs_service_handlers mcp_ipc_callbacks; extern const char *pacemakerd_state; extern gboolean running_with_sbd; extern unsigned int shutdown_complete_state_reported_to; extern gboolean shutdown_complete_state_reported_client_closed; extern crm_trigger_t *shutdown_trigger; extern crm_trigger_t *startup_trigger; extern time_t subdaemon_check_progress; gboolean mcp_read_config(void); gboolean cluster_connect_cfg(void); void cluster_disconnect_cfg(void); int find_and_track_existing_processes(void); gboolean init_children_processes(void *user_data); void restart_cluster_subdaemons(void); void pcmk_shutdown(int nsig); -void pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id); -void pcmk_handle_shutdown_request(pcmk__client_t *c, xmlNode *msg, uint32_t id, uint32_t flags); void pcmkd_shutdown_corosync(void); bool pcmkd_corosync_connected(void); diff --git a/daemons/pacemakerd/pcmkd_messages.c b/daemons/pacemakerd/pcmkd_messages.c index 6f0e48bec2..7ed9899d67 100644 --- a/daemons/pacemakerd/pcmkd_messages.c +++ b/daemons/pacemakerd/pcmkd_messages.c @@ -1,202 +1,278 @@ /* * Copyright 2010-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU General Public License version 2 * or later (GPLv2+) WITHOUT ANY WARRANTY. */ #include #include "pacemakerd.h" +#include #include #include #include #include #include #include #include -void -pcmk_handle_ping_request(pcmk__client_t *c, xmlNode *msg, uint32_t id) +static GHashTable *pcmkd_handlers = NULL; + +static xmlNode * +handle_node_cache_request(pcmk__request_t *request) +{ + crm_trace("Ignoring request from client %s to purge node " + "because peer cache is not used", + pcmk__client_name(request->ipc_client)); + + pcmk__ipc_send_ack(request->ipc_client, request->ipc_id, request->ipc_flags, + "ack", NULL, CRM_EX_OK); + return NULL; +} + +static xmlNode * +handle_ping_request(pcmk__request_t *request) { + xmlNode *msg = request->xml; + const char *value = NULL; xmlNode *ping = NULL; xmlNode *reply = NULL; const char *from = crm_element_value(msg, F_CRM_SYS_FROM); /* Pinged for status */ crm_trace("Pinged from " F_CRM_SYS_FROM "='%s' " F_CRM_ORIGIN "='%s'", pcmk__s(from, ""), pcmk__s(crm_element_value(msg, F_CRM_ORIGIN), "")); + + pcmk__ipc_send_ack(request->ipc_client, request->ipc_id, request->ipc_flags, + "ack", NULL, CRM_EX_INDETERMINATE); + ping = create_xml_node(NULL, XML_CRM_TAG_PING); value = crm_element_value(msg, F_CRM_SYS_TO); crm_xml_add(ping, XML_PING_ATTR_SYSFROM, value); crm_xml_add(ping, XML_PING_ATTR_PACEMAKERDSTATE, pacemakerd_state); crm_xml_add_ll(ping, XML_ATTR_TSTAMP, (long long) subdaemon_check_progress); crm_xml_add(ping, XML_PING_ATTR_STATUS, "ok"); reply = create_reply(msg, ping); + free_xml(ping); - if (reply) { - if (pcmk__ipc_send_xml(c, id, reply, crm_ipc_server_event) != - pcmk_rc_ok) { - crm_err("Failed sending ping reply to client %s", - pcmk__client_name(c)); - } - free_xml(reply); + + if (reply == NULL) { + pcmk__format_result(&request->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, + "Failed building ping reply for client %s", + pcmk__client_name(request->ipc_client)); } else { - crm_err("Failed building ping reply for client %s", - pcmk__client_name(c)); + pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } + /* just proceed state on sbd pinging us */ if (from && strstr(from, "sbd")) { if (pcmk__str_eq(pacemakerd_state, XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE, pcmk__str_none)) { if (pcmk__get_sbd_sync_resource_startup()) { crm_notice("Shutdown-complete-state passed to SBD."); } - shutdown_complete_state_reported_to = c->pid; + + shutdown_complete_state_reported_to = request->ipc_client->pid; + } else if (pcmk__str_eq(pacemakerd_state, XML_PING_ATTR_PACEMAKERDSTATE_WAITPING, pcmk__str_none)) { crm_notice("Received startup-trigger from SBD."); pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_STARTINGDAEMONS; mainloop_set_trigger(startup_trigger); } } + + return reply; } -void -pcmk_handle_shutdown_request(pcmk__client_t *c, xmlNode *msg, uint32_t id, uint32_t flags) +static xmlNode * +handle_shutdown_request(pcmk__request_t *request) { + xmlNode *msg = request->xml; + xmlNode *shutdown = NULL; xmlNode *reply = NULL; /* Only allow privileged users (i.e. root or hacluster) to shut down * Pacemaker from the command line (or direct IPC), so that other users * are forced to go through the CIB and have ACLs applied. */ - bool allowed = pcmk_is_set(c->flags, pcmk__client_privileged); + bool allowed = pcmk_is_set(request->ipc_client->flags, pcmk__client_privileged); + + pcmk__ipc_send_ack(request->ipc_client, request->ipc_id, request->ipc_flags, + "ack", NULL, CRM_EX_INDETERMINATE); shutdown = create_xml_node(NULL, XML_CIB_ATTR_SHUTDOWN); if (allowed) { crm_notice("Shutting down in response to IPC request %s from %s", crm_element_value(msg, F_CRM_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN)); crm_xml_add_int(shutdown, XML_LRM_ATTR_OPSTATUS, CRM_EX_OK); } else { crm_warn("Ignoring shutdown request from unprivileged client %s", - pcmk__client_name(c)); + pcmk__client_name(request->ipc_client)); crm_xml_add_int(shutdown, XML_LRM_ATTR_OPSTATUS, CRM_EX_INSUFFICIENT_PRIV); } reply = create_reply(msg, shutdown); free_xml(shutdown); - if (reply) { - if (pcmk__ipc_send_xml(c, id, reply, crm_ipc_server_event) != pcmk_rc_ok) { - crm_err("Failed sending shutdown reply to client %s", - pcmk__client_name(c)); - } - free_xml(reply); + + if (reply == NULL) { + pcmk__format_result(&request->result, CRM_EX_ERROR, PCMK_EXEC_ERROR, + "Failed building shutdown reply for client %s", + pcmk__client_name(request->ipc_client)); } else { - crm_err("Failed building shutdown reply for client %s", - pcmk__client_name(c)); + pcmk__set_result(&request->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL); } if (allowed) { pcmk_shutdown(15); } + + return reply; +} + +static xmlNode * +handle_unknown_request(pcmk__request_t *request) +{ + pcmk__ipc_send_ack(request->ipc_client, request->ipc_id, request->ipc_flags, + "ack", NULL, CRM_EX_INVALID_PARAM); + + pcmk__format_result(&request->result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID, + "Unknown IPC request type '%s' (bug?)", + pcmk__client_name(request->ipc_client)); + return NULL; +} + +static void +pcmkd_register_handlers(void) +{ + pcmk__server_command_t handlers[] = { + { CRM_OP_RM_NODE_CACHE, handle_node_cache_request }, + { CRM_OP_PING, handle_ping_request }, + { CRM_OP_QUIT, handle_shutdown_request }, + { NULL, handle_unknown_request }, + }; + + pcmkd_handlers = pcmk__register_handlers(handlers); } static int32_t pcmk_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid) { crm_trace("Connection %p", c); if (pcmk__new_client(c, uid, gid) == NULL) { return -EIO; } return 0; } /* Error code means? */ static int32_t pcmk_ipc_closed(qb_ipcs_connection_t * c) { pcmk__client_t *client = pcmk__find_client(c); if (client == NULL) { return 0; } crm_trace("Connection %p", c); if (shutdown_complete_state_reported_to == client->pid) { shutdown_complete_state_reported_client_closed = TRUE; if (shutdown_trigger) { mainloop_set_trigger(shutdown_trigger); } } pcmk__free_client(client); return 0; } static void pcmk_ipc_destroy(qb_ipcs_connection_t * c) { crm_trace("Connection %p", c); pcmk_ipc_closed(c); } /* Exit code means? */ static int32_t pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size) { uint32_t id = 0; uint32_t flags = 0; - const char *task = NULL; xmlNode *msg = NULL; pcmk__client_t *c = pcmk__find_client(qbc); CRM_CHECK(c != NULL, return 0); + if (pcmkd_handlers == NULL) { + pcmkd_register_handlers(); + } + msg = pcmk__client_data2xml(c, data, &id, &flags); if (msg == NULL) { pcmk__ipc_send_ack(c, id, flags, "ack", NULL, CRM_EX_PROTOCOL); return 0; - } - task = crm_element_value(msg, F_CRM_TASK); - if (pcmk__str_empty(task)) { - crm_debug("IPC command from client %s is missing task", - pcmk__client_name(c)); - pcmk__ipc_send_ack(c, id, flags, "ack", NULL, CRM_EX_INVALID_PARAM); + } else { + char *log_msg = NULL; + const char *reason = NULL; + xmlNode *reply = NULL; - } else if (pcmk__str_eq(task, CRM_OP_QUIT, pcmk__str_none)) { - pcmk__ipc_send_ack(c, id, flags, "ack", NULL, CRM_EX_INDETERMINATE); - pcmk_handle_shutdown_request(c, msg, id, flags); + pcmk__request_t request = { + .ipc_client = c, + .ipc_id = id, + .ipc_flags = flags, + .peer = NULL, + .xml = msg, + .call_options = 0, + .result = PCMK__UNKNOWN_RESULT, + }; - } else if (pcmk__str_eq(task, CRM_OP_RM_NODE_CACHE, pcmk__str_none)) { - crm_trace("Ignoring request from client %s to purge node " - "because peer cache is not used", pcmk__client_name(c)); - pcmk__ipc_send_ack(c, id, flags, "ack", NULL, CRM_EX_OK); + request.op = crm_element_value_copy(request.xml, F_CRM_TASK); + CRM_CHECK(request.op != NULL, return 0); - } else if (pcmk__str_eq(task, CRM_OP_PING, pcmk__str_none)) { - pcmk__ipc_send_ack(c, id, flags, "ack", NULL, CRM_EX_INDETERMINATE); - pcmk_handle_ping_request(c, msg, id); + reply = pcmk__process_request(&request, pcmkd_handlers); - } else { - crm_debug("Unrecognized IPC command '%s' from client %s", - task, pcmk__client_name(c)); - pcmk__ipc_send_ack(c, id, flags, "ack", NULL, CRM_EX_INVALID_PARAM); + if (reply != NULL) { + pcmk__ipc_send_xml(c, id, reply, crm_ipc_server_event); + free_xml(reply); + } + + reason = request.result.exit_reason; + + log_msg = crm_strdup_printf("Processed %s request from %s %s: %s%s%s%s", + request.op, pcmk__request_origin_type(&request), + pcmk__request_origin(&request), + pcmk_exec_status_str(request.result.execution_status), + (reason == NULL)? "" : " (", + (reason == NULL)? "" : reason, + (reason == NULL)? "" : ")"); + + if (!pcmk__result_ok(&request.result)) { + crm_warn("%s", log_msg); + } else { + crm_debug("%s", log_msg); + } + + free(log_msg); + pcmk__reset_request(&request); } free_xml(msg); return 0; } struct qb_ipcs_service_handlers mcp_ipc_callbacks = { .connection_accept = pcmk_ipc_accept, .connection_created = NULL, .msg_process = pcmk_ipc_dispatch, .connection_closed = pcmk_ipc_closed, .connection_destroyed = pcmk_ipc_destroy }; diff --git a/lib/common/ipc_attrd.c b/lib/common/ipc_attrd.c index 5496cd12c0..3d7e7ff904 100644 --- a/lib/common/ipc_attrd.c +++ b/lib/common/ipc_attrd.c @@ -1,569 +1,569 @@ /* * Copyright 2011-2022 the Pacemaker project contributors * * The version control history for this file may have further details. * * This source code is licensed under the GNU Lesser General Public License * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ #ifndef _GNU_SOURCE # define _GNU_SOURCE #endif #include #include #include #include #include #include #include #include "crmcommon_private.h" static void set_pairs_data(pcmk__attrd_api_reply_t *data, xmlNode *msg_data) { const char *name = NULL; pcmk__attrd_query_pair_t *pair; name = crm_element_value(msg_data, PCMK__XA_ATTR_NAME); for (xmlNode *node = first_named_child(msg_data, XML_CIB_TAG_NODE); node != NULL; node = crm_next_same_xml(node)) { pair = calloc(1, sizeof(pcmk__attrd_query_pair_t)); CRM_ASSERT(pair != NULL); pair->node = crm_element_value(node, PCMK__XA_ATTR_NODE_NAME); pair->name = name; pair->value = crm_element_value(node, PCMK__XA_ATTR_VALUE); data->data.pairs = g_list_prepend(data->data.pairs, pair); } } static bool reply_expected(pcmk_ipc_api_t *api, xmlNode *request) { const char *command = crm_element_value(request, PCMK__XA_TASK); return pcmk__str_any_of(command, PCMK__ATTRD_CMD_UPDATE, PCMK__ATTRD_CMD_UPDATE_BOTH, PCMK__ATTRD_CMD_UPDATE_DELAY, PCMK__ATTRD_CMD_QUERY, NULL); } static bool dispatch(pcmk_ipc_api_t *api, xmlNode *reply) { const char *value = NULL; crm_exit_t status = CRM_EX_OK; pcmk__attrd_api_reply_t reply_data = { pcmk__attrd_reply_unknown }; if (pcmk__str_eq((const char *) reply->name, "ack", pcmk__str_none)) { return false; } /* Do some basic validation of the reply */ value = crm_element_value(reply, F_TYPE); if (pcmk__str_empty(value) || !pcmk__str_eq(value, T_ATTRD, pcmk__str_none)) { crm_info("Unrecognizable message from attribute manager: " "message type '%s' not '" T_ATTRD "'", pcmk__s(value, "")); status = CRM_EX_PROTOCOL; goto done; } value = crm_element_value(reply, F_SUBTYPE); /* Only the query command gets a reply for now. NULL counts as query for * backward compatibility with attribute managers <2.1.3 that didn't set it. */ if (pcmk__str_eq(value, PCMK__ATTRD_CMD_QUERY, pcmk__str_null_matches)) { if (!xmlHasProp(reply, (pcmkXmlStr) PCMK__XA_ATTR_NAME)) { status = ENXIO; // Most likely, the attribute doesn't exist goto done; } reply_data.reply_type = pcmk__attrd_reply_query; set_pairs_data(&reply_data, reply); } else { crm_info("Unrecognizable message from attribute manager: " "message subtype '%s' unknown", pcmk__s(value, "")); status = CRM_EX_PROTOCOL; goto done; } done: pcmk__call_ipc_callback(api, pcmk_ipc_event_reply, status, &reply_data); /* Free any reply data that was allocated */ if (reply_data.data.pairs) { g_list_free_full(reply_data.data.pairs, free); } return false; } pcmk__ipc_methods_t * pcmk__attrd_api_methods() { pcmk__ipc_methods_t *cmds = calloc(1, sizeof(pcmk__ipc_methods_t)); if (cmds != NULL) { cmds->new_data = NULL; cmds->free_data = NULL; cmds->post_connect = NULL; cmds->reply_expected = reply_expected; cmds->dispatch = dispatch; } return cmds; } /*! * \internal * \brief Create a generic pacemaker-attrd operation * * \param[in] user_name If not NULL, ACL user to set for operation * * \return XML of pacemaker-attrd operation */ static xmlNode * create_attrd_op(const char *user_name) { xmlNode *attrd_op = create_xml_node(NULL, __func__); crm_xml_add(attrd_op, F_TYPE, T_ATTRD); crm_xml_add(attrd_op, F_ORIG, (crm_system_name? crm_system_name: "unknown")); crm_xml_add(attrd_op, PCMK__XA_ATTR_USER, user_name); return attrd_op; } static int create_api(pcmk_ipc_api_t **api) { int rc = pcmk_new_ipc_api(api, pcmk_ipc_attrd); if (rc != pcmk_rc_ok) { - crm_err("error: Could not connect to attrd: %s", pcmk_rc_str(rc)); + crm_err("Could not connect to attrd: %s", pcmk_rc_str(rc)); } return rc; } static void destroy_api(pcmk_ipc_api_t *api) { pcmk_disconnect_ipc(api); pcmk_free_ipc_api(api); api = NULL; } static int connect_and_send_attrd_request(pcmk_ipc_api_t *api, xmlNode *request) { int rc = pcmk_rc_ok; int max = 5; while (max > 0) { crm_info("Connecting to cluster... %d retries remaining", max); rc = pcmk_connect_ipc(api, pcmk_ipc_dispatch_sync); if (rc == pcmk_rc_ok) { rc = pcmk__send_ipc_request(api, request); break; } else if (rc == EAGAIN || rc == EALREADY) { sleep(5 - max); max--; } else { - crm_err("error: Could not connect to attrd: %s", pcmk_rc_str(rc)); + crm_err("Could not connect to attrd: %s", pcmk_rc_str(rc)); break; } } return rc; } static int send_attrd_request(pcmk_ipc_api_t *api, xmlNode *request) { return pcmk__send_ipc_request(api, request); } int pcmk__attrd_api_clear_failures(pcmk_ipc_api_t *api, const char *node, const char *resource, const char *operation, const char *interval_spec, const char *user_name, uint32_t options) { int rc = pcmk_rc_ok; xmlNode *request = create_attrd_op(user_name); const char *interval_desc = NULL; const char *op_desc = NULL; const char *target = pcmk__node_attr_target(node); if (target != NULL) { node = target; } crm_xml_add(request, PCMK__XA_TASK, PCMK__ATTRD_CMD_CLEAR_FAILURE); crm_xml_add(request, PCMK__XA_ATTR_NODE_NAME, node); crm_xml_add(request, PCMK__XA_ATTR_RESOURCE, resource); crm_xml_add(request, PCMK__XA_ATTR_OPERATION, operation); crm_xml_add(request, PCMK__XA_ATTR_INTERVAL, interval_spec); crm_xml_add_int(request, PCMK__XA_ATTR_IS_REMOTE, pcmk_is_set(options, pcmk__node_attr_remote)); if (api == NULL) { rc = create_api(&api); if (rc != pcmk_rc_ok) { return rc; } rc = connect_and_send_attrd_request(api, request); destroy_api(api); } else if (!pcmk_ipc_is_connected(api)) { rc = connect_and_send_attrd_request(api, request); } else { rc = send_attrd_request(api, request); } free_xml(request); if (operation) { interval_desc = interval_spec? interval_spec : "nonrecurring"; op_desc = operation; } else { interval_desc = "all"; op_desc = "operations"; } crm_debug("Asked pacemaker-attrd to clear failure of %s %s for %s on %s: %s (%d)", interval_desc, op_desc, (resource? resource : "all resources"), (node? node : "all nodes"), pcmk_rc_str(rc), rc); return rc; } int pcmk__attrd_api_delete(pcmk_ipc_api_t *api, const char *node, const char *name, uint32_t options) { const char *target = NULL; if (name == NULL) { return EINVAL; } target = pcmk__node_attr_target(node); if (target != NULL) { node = target; } /* Make sure the right update option is set. */ options &= ~pcmk__node_attr_delay; options |= pcmk__node_attr_value; return pcmk__attrd_api_update(api, node, name, NULL, NULL, NULL, NULL, options); } int pcmk__attrd_api_purge(pcmk_ipc_api_t *api, const char *node) { int rc = pcmk_rc_ok; xmlNode *request = NULL; const char *display_host = (node ? node : "localhost"); const char *target = pcmk__node_attr_target(node); if (target != NULL) { node = target; } request = create_attrd_op(NULL); crm_xml_add(request, PCMK__XA_TASK, PCMK__ATTRD_CMD_PEER_REMOVE); crm_xml_add(request, PCMK__XA_ATTR_NODE_NAME, node); if (api == NULL) { rc = create_api(&api); if (rc != pcmk_rc_ok) { return rc; } rc = connect_and_send_attrd_request(api, request); destroy_api(api); } else if (!pcmk_ipc_is_connected(api)) { rc = connect_and_send_attrd_request(api, request); } else { rc = send_attrd_request(api, request); } free_xml(request); crm_debug("Asked pacemaker-attrd to purge %s: %s (%d)", display_host, pcmk_rc_str(rc), rc); return rc; } int pcmk__attrd_api_query(pcmk_ipc_api_t *api, const char *node, const char *name, uint32_t options) { int rc = pcmk_rc_ok; xmlNode *request = NULL; const char *target = NULL; if (name == NULL) { return EINVAL; } target = pcmk__node_attr_target(node); if (target != NULL) { node = target; } request = create_attrd_op(NULL); crm_xml_add(request, PCMK__XA_ATTR_NAME, name); crm_xml_add(request, PCMK__XA_TASK, PCMK__ATTRD_CMD_QUERY); crm_xml_add(request, PCMK__XA_ATTR_NODE_NAME, node); rc = send_attrd_request(api, request); free_xml(request); if (node) { crm_debug("Queried pacemaker-attrd for %s on %s: %s (%d)", name, node, pcmk_rc_str(rc), rc); } else { crm_debug("Queried pacemaker-attrd for %s: %s (%d)", name, pcmk_rc_str(rc), rc); } return rc; } int pcmk__attrd_api_refresh(pcmk_ipc_api_t *api, const char *node) { int rc = pcmk_rc_ok; xmlNode *request = NULL; const char *display_host = (node ? node : "localhost"); const char *target = pcmk__node_attr_target(node); if (target != NULL) { node = target; } request = create_attrd_op(NULL); crm_xml_add(request, PCMK__XA_TASK, PCMK__ATTRD_CMD_REFRESH); crm_xml_add(request, PCMK__XA_ATTR_NODE_NAME, node); if (api == NULL) { rc = create_api(&api); if (rc != pcmk_rc_ok) { return rc; } rc = connect_and_send_attrd_request(api, request); destroy_api(api); } else if (!pcmk_ipc_is_connected(api)) { rc = connect_and_send_attrd_request(api, request); } else { rc = send_attrd_request(api, request); } free_xml(request); crm_debug("Asked pacemaker-attrd to refresh %s: %s (%d)", display_host, pcmk_rc_str(rc), rc); return rc; } static void add_op_attr(xmlNode *op, uint32_t options) { if (pcmk_all_flags_set(options, pcmk__node_attr_value | pcmk__node_attr_delay)) { crm_xml_add(op, PCMK__XA_TASK, PCMK__ATTRD_CMD_UPDATE_BOTH); } else if (pcmk_is_set(options, pcmk__node_attr_value)) { crm_xml_add(op, PCMK__XA_TASK, PCMK__ATTRD_CMD_UPDATE); } else if (pcmk_is_set(options, pcmk__node_attr_delay)) { crm_xml_add(op, PCMK__XA_TASK, PCMK__ATTRD_CMD_UPDATE_DELAY); } } static void populate_update_op(xmlNode *op, const char *node, const char *name, const char *value, const char *dampen, const char *set, uint32_t options) { if (pcmk_is_set(options, pcmk__node_attr_pattern)) { crm_xml_add(op, PCMK__XA_ATTR_PATTERN, name); } else { crm_xml_add(op, PCMK__XA_ATTR_NAME, name); } add_op_attr(op, options); crm_xml_add(op, PCMK__XA_ATTR_VALUE, value); crm_xml_add(op, PCMK__XA_ATTR_DAMPENING, dampen); crm_xml_add(op, PCMK__XA_ATTR_NODE_NAME, node); crm_xml_add(op, PCMK__XA_ATTR_SET, set); crm_xml_add_int(op, PCMK__XA_ATTR_IS_REMOTE, pcmk_is_set(options, pcmk__node_attr_remote)); crm_xml_add_int(op, PCMK__XA_ATTR_IS_PRIVATE, pcmk_is_set(options, pcmk__node_attr_private)); } int pcmk__attrd_api_update(pcmk_ipc_api_t *api, const char *node, const char *name, const char *value, const char *dampen, const char *set, const char *user_name, uint32_t options) { int rc = pcmk_rc_ok; xmlNode *request = NULL; const char *display_host = (node ? node : "localhost"); const char *target = NULL; if (name == NULL) { return EINVAL; } target = pcmk__node_attr_target(node); if (target != NULL) { node = target; } request = create_attrd_op(user_name); populate_update_op(request, node, name, value, dampen, set, options); if (api == NULL) { rc = create_api(&api); if (rc != pcmk_rc_ok) { return rc; } rc = connect_and_send_attrd_request(api, request); destroy_api(api); } else if (!pcmk_ipc_is_connected(api)) { rc = connect_and_send_attrd_request(api, request); } else { rc = send_attrd_request(api, request); } free_xml(request); crm_debug("Asked pacemaker-attrd to update %s on %s: %s (%d)", name, display_host, pcmk_rc_str(rc), rc); return rc; } int pcmk__attrd_api_update_list(pcmk_ipc_api_t *api, GList *attrs, const char *dampen, const char *set, const char *user_name, uint32_t options) { int rc = pcmk_rc_ok; xmlNode *request = NULL; if (attrs == NULL) { return EINVAL; } /* There are two different ways of handling a list of attributes: * * (1) For messages originating from some command line tool, we have to send * them one at a time. In this loop, we just call pcmk__attrd_api_update * for each, letting it deal with creating the API object if it doesn't * already exist. * * The reason we can't use a single message in this case is that we can't * trust that the server supports it. Remote nodes could be involved * here, and there's no guarantee that a newer client running on a remote * node is talking to (or proxied through) a cluster node with a newer * attrd. We also can't just try sending a single message and then falling * back on multiple. There's no handshake with the attrd server to * determine its version. And then we would need to do that fallback in the * dispatch function for this to work for all connection types (mainloop in * particular), and at that point we won't know what the original message * was in order to break it apart and resend as individual messages. * * (2) For messages between daemons, we can be assured that the local attrd * will support the new message and that it can send to the other attrds * as one request or split up according to the minimum supported version. */ for (GList *iter = attrs; iter != NULL; iter = iter->next) { pcmk__attrd_query_pair_t *pair = (pcmk__attrd_query_pair_t *) iter->data; if (pcmk__is_daemon) { const char *target = NULL; xmlNode *child = NULL; /* First time through this loop - create the basic request. */ if (request == NULL) { request = create_attrd_op(user_name); add_op_attr(request, options); } /* Add a child node for this operation. We add the task to the top * level XML node so attrd_ipc_dispatch doesn't need changes. And * then we also add the task to each child node in populate_update_op * so attrd_client_update knows what form of update is taking place. */ child = create_xml_node(request, XML_ATTR_OP); target = pcmk__node_attr_target(pair->node); if (target != NULL) { pair->node = target; } populate_update_op(child, pair->node, pair->name, pair->value, dampen, set, options); } else { rc = pcmk__attrd_api_update(api, pair->node, pair->name, pair->value, dampen, set, user_name, options); } } /* If we were doing multiple attributes at once, we still need to send the * request. Do that now, creating and destroying the API object if needed. */ if (pcmk__is_daemon) { bool created_api = false; if (api == NULL) { rc = create_api(&api); if (rc != pcmk_rc_ok) { return rc; } created_api = true; } rc = connect_and_send_attrd_request(api, request); free_xml(request); if (created_api) { destroy_api(api); } } return rc; }