diff --git a/daemons/controld/Makefile.am b/daemons/controld/Makefile.am index 9949b17917..11ff9c59d3 100644 --- a/daemons/controld/Makefile.am +++ b/daemons/controld/Makefile.am @@ -1,47 +1,47 @@ # # Copyright 2004-2018 Andrew Beekhof # # This source code is licensed under the GNU General Public License version 2 # or later (GPLv2+) WITHOUT ANY WARRANTY. # include $(top_srcdir)/Makefile.common halibdir = $(CRM_DAEMON_DIR) ## binary progs halib_PROGRAMS = pacemaker-controld ## SOURCES noinst_HEADERS = controld_alerts.h \ controld_callbacks.h \ crmd_fsa.h crmd.h \ - crmd_lrm.h crmd_messages.h crmd_utils.h fsa_defines.h \ + crmd_lrm.h crmd_messages.h crmd_utils.h \ fsa_matrix.h fsa_proto.h membership.h te_callbacks.h \ tengine.h throttle.h crmd_metadata.h pacemaker_controld_CFLAGS = $(CFLAGS_HARDENED_EXE) pacemaker_controld_LDFLAGS = $(LDFLAGS_HARDENED_EXE) pacemaker_controld_LDADD = $(top_builddir)/lib/fencing/libstonithd.la \ $(top_builddir)/lib/transition/libtransitioner.la \ $(top_builddir)/lib/pengine/libpe_rules.la \ $(top_builddir)/lib/cib/libcib.la \ $(top_builddir)/lib/cluster/libcrmcluster.la \ $(top_builddir)/lib/common/libcrmcommon.la \ $(top_builddir)/lib/services/libcrmservice.la \ $(top_builddir)/lib/lrmd/liblrmd.la \ $(CLUSTERLIBS) pacemaker_controld_SOURCES = main.c corosync.c crmd_metadata.c \ fsa.c control.c messages.c membership.c callbacks.c attrd.c \ election.c join_client.c join_dc.c throttle.c \ cib.c pengine.c tengine.c lrm.c lrm_state.c remote_ra.c \ utils.c misc.c te_events.c te_actions.c te_utils.c te_callbacks.c if BUILD_XML_HELP man7_MANS = pacemaker-controld.7 endif CLEANFILES = $(man7_MANS) diff --git a/daemons/controld/crmd_fsa.h b/daemons/controld/crmd_fsa.h index dc00485e27..2040eaf391 100644 --- a/daemons/controld/crmd_fsa.h +++ b/daemons/controld/crmd_fsa.h @@ -1,106 +1,545 @@ /* - * Copyright (C) 2004 Andrew Beekhof + * Copyright 2004-2018 Andrew Beekhof * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This software is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * This source code is licensed under the GNU Lesser General Public License + * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. */ + #ifndef CRMD_FSA__H # define CRMD_FSA__H -# include - # include # include # include # include # include # include # include +/*! States the controller can be in */ +enum crmd_fsa_state { + S_IDLE = 0, /* Nothing happening */ + + S_ELECTION, /* Take part in the election algorithm as + * described below + */ + S_INTEGRATION, /* integrate that status of new nodes (which is + * all of them if we have just been elected DC) + * to form a complete and up-to-date picture of + * the CIB + */ + S_FINALIZE_JOIN, /* integrate that status of new nodes (which is + * all of them if we have just been elected DC) + * to form a complete and up-to-date picture of + * the CIB + */ + S_NOT_DC, /* we are in non-DC mode */ + S_POLICY_ENGINE, /* Determine next stable state of the cluster */ + S_RECOVERY, /* Something bad happened, check everything is ok + * before continuing and attempt to recover if + * required + */ + S_RELEASE_DC, /* we were the DC, but now we arent anymore, + * possibly by our own request, and we should + * release all unnecessary sub-systems, finish + * any pending actions, do general cleanup and + * unset anything that makes us think we are + * special :) + */ + S_STARTING, /* we are just starting out */ + S_PENDING, /* we are not a full/active member yet */ + S_STOPPING, /* We are in the final stages of shutting down */ + S_TERMINATE, /* We are going to shutdown, this is the equiv of + * "Sending TERM signal to all processes" in Linux + * and in worst case scenarios could be considered + * a self STONITH + */ + S_TRANSITION_ENGINE, /* Attempt to make the calculated next stable + * state of the cluster a reality + */ + + S_HALT, /* Freeze - don't do anything + * Something bad happened that needs the admin to fix + * Wait for I_ELECTION + */ + + /* ----------- Last input found in table is above ---------- */ + S_ILLEGAL /* This is an illegal FSA state */ + /* (must be last) */ +}; + +# define MAXSTATE S_ILLEGAL + +/* + Once we start and do some basic sanity checks, we go into the + S_NOT_DC state and await instructions from the DC or input from + the cluster layer which indicates the election algorithm needs to run. + + If the election algorithm is triggered, we enter the S_ELECTION state + from where we can either go back to the S_NOT_DC state or progress + to the S_INTEGRATION state (or S_RELEASE_DC if we used to be the DC + but aren't anymore). + + The election algorithm has been adapted from + http://www.cs.indiana.edu/cgi-bin/techreports/TRNNN.cgi?trnum=TR521 + + Loosely known as the Bully Algorithm, its major points are: + - Election is initiated by any node (N) that notices that the controller + is no longer responding + - Concurrent multiple elections are possible + - Algorithm: + + N sends ELECTION messages to all nodes that occur earlier in the + cluster layer's membership list + + If no one responds, N wins and becomes controller + + N sends out CONTROLLER messages to all other nodes in the partition + + If one of higher-ups answers, it takes over. N is done. + + Once the election is complete, if we are the DC, we enter the + S_INTEGRATION state which is a DC-in-waiting style state. We are + the DC, but we shouldn't do anything yet because we may not have an + up-to-date picture of the cluster. There may of course be times + when this fails, so we should go back to the S_RECOVERY stage and + check everything is ok. We may also end up here if a new node came + online, since each node is authorative on itself and we would want + to incorporate its information into the CIB. + + Once we have the latest CIB, we then enter the S_POLICY_ENGINE state + where invoke the Policy Engine. It is possible that between + invoking the Policy Engine and receiving an answer, that we receive + more input. In this case, we would discard the orginal result and + invoke it again. + + Once we are satisfied with the output from the Policy Engine, we + enter S_TRANSITION_ENGINE and feed the Policy Engine's output to the + Transition Engine who attempts to make the Policy Engine's + calculation a reality. If the transition completes successfully, + we enter S_IDLE, otherwise we go back to S_POLICY_ENGINE with the + current unstable state and try again. + + Of course, we may be asked to shutdown at any time, however we must + progress to S_NOT_DC before doing so. Once we have handed over DC + duties to another node, we can then shut down like everyone else, + that is, by asking the DC for permission and waiting for it to take all + our resources away. + + The case where we are the DC and the only node in the cluster is a + special case and handled as an escalation which takes us to + S_SHUTDOWN. Similarly, if any other point in the shutdown + fails or stalls, this is escalated and we end up in S_TERMINATE. + + At any point, the controller can relay messages for its subsystems, + but outbound messages (from subsystems) should probably be blocked + until S_INTEGRATION (for the DC) or the join protocol has + completed (for non-DC controllers). +*/ + +/*====================================== + * + * Inputs/Events/Stimuli to be given to the finite state machine + * + * Some of these a true events, and others are synthesised based on + * the "register" (see below) and the contents or source of messages. + * + * The machine keeps processing until receiving I_NULL + * + *======================================*/ +enum crmd_fsa_input { +/* 0 */ + I_NULL, /* Nothing happened */ +/* 1 */ + + I_CIB_OP, /* An update to the CIB occurred */ + I_CIB_UPDATE, /* An update to the CIB occurred */ + I_DC_TIMEOUT, /* We have lost communication with the DC */ + I_ELECTION, /* Someone started an election */ + I_PE_CALC, /* The Policy Engine needs to be invoked */ + I_RELEASE_DC, /* The election completed and we were not + * elected, but we were the DC beforehand + */ + I_ELECTION_DC, /* The election completed and we were (re-)elected + * DC + */ + I_ERROR, /* Something bad happened (more serious than + * I_FAIL) and may not have been due to the action + * being performed. For example, we may have lost + * our connection to the CIB. + */ +/* 9 */ + I_FAIL, /* The action failed to complete successfully */ + I_INTEGRATED, + I_FINALIZED, + I_NODE_JOIN, /* A node has entered the cluster */ + I_NOT_DC, /* We are not and were not the DC before or after + * the current operation or state + */ + I_RECOVERED, /* The recovery process completed successfully */ + I_RELEASE_FAIL, /* We could not give up DC status for some reason + */ + I_RELEASE_SUCCESS, /* We are no longer the DC */ + I_RESTART, /* The current set of actions needs to be + * restarted + */ + I_TE_SUCCESS, /* Some non-resource, non-cluster-layer action + * is required of us, e.g. ping + */ +/* 20 */ + I_ROUTER, /* Do our job as router and forward this to the + * right place + */ + I_SHUTDOWN, /* We are asking to shutdown */ + I_STOP, /* We have been told to shutdown */ + I_TERMINATE, /* Actually exit */ + I_STARTUP, + I_PE_SUCCESS, /* The action completed successfully */ + + I_JOIN_OFFER, /* The DC is offering membership */ + I_JOIN_REQUEST, /* The client is requesting membership */ + I_JOIN_RESULT, /* If not the DC: The result of a join request + * Else: A client is responding with its local state info + */ + + I_WAIT_FOR_EVENT, /* we may be waiting for an async task to "happen" + * and until it does, we can't do anything else + */ + + I_DC_HEARTBEAT, /* The DC is telling us that it is alive and well */ + + I_LRM_EVENT, + +/* 30 */ + I_PENDING, + I_HALT, + + /* ------------ Last input found in table is above ----------- */ + I_ILLEGAL /* This is an illegal value for an FSA input */ + /* (must be last) */ +}; + +# define MAXINPUT I_ILLEGAL + +# define I_MESSAGE I_ROUTER + +/*====================================== + * + * actions + * + * Some of the actions below will always occur together for now, but this may + * not always be the case, so they are split up so that they can easily be + * called independently in the future, if necessary. + * + * For example, separating A_LRM_CONNECT from A_STARTUP might be useful + * if we ever try to recover from a faulty or disconnected executor. + * + *======================================*/ + + /* Don't do anything */ +# define A_NOTHING 0x0000000000000000ULL + +/* -- Startup actions -- */ + /* Hook to perform any actions (other than connecting to other daemons) + * that might be needed as part of the startup. + */ +# define A_STARTUP 0x0000000000000001ULL + /* Hook to perform any actions that might be needed as part + * after startup is successful. + */ +# define A_STARTED 0x0000000000000002ULL + /* Connect to cluster layer */ +# define A_HA_CONNECT 0x0000000000000004ULL +# define A_HA_DISCONNECT 0x0000000000000008ULL + +# define A_INTEGRATE_TIMER_START 0x0000000000000010ULL +# define A_INTEGRATE_TIMER_STOP 0x0000000000000020ULL +# define A_FINALIZE_TIMER_START 0x0000000000000040ULL +# define A_FINALIZE_TIMER_STOP 0x0000000000000080ULL + +/* -- Election actions -- */ +# define A_DC_TIMER_START 0x0000000000000100ULL +# define A_DC_TIMER_STOP 0x0000000000000200ULL +# define A_ELECTION_COUNT 0x0000000000000400ULL +# define A_ELECTION_VOTE 0x0000000000000800ULL + +# define A_ELECTION_START 0x0000000000001000ULL + +/* -- Message processing -- */ + /* Process the queue of requests */ +# define A_MSG_PROCESS 0x0000000000002000ULL + /* Send the message to the correct recipient */ +# define A_MSG_ROUTE 0x0000000000004000ULL + + /* Send a welcome message to new node(s) */ +# define A_DC_JOIN_OFFER_ONE 0x0000000000008000ULL + +/* -- Server Join protocol actions -- */ + /* Send a welcome message to all nodes */ +# define A_DC_JOIN_OFFER_ALL 0x0000000000010000ULL + /* Process the remote node's ack of our join message */ +# define A_DC_JOIN_PROCESS_REQ 0x0000000000020000ULL + /* Send out the results of the Join phase */ +# define A_DC_JOIN_FINALIZE 0x0000000000040000ULL + /* Send out the results of the Join phase */ +# define A_DC_JOIN_PROCESS_ACK 0x0000000000080000ULL + +/* -- Client Join protocol actions -- */ +# define A_CL_JOIN_QUERY 0x0000000000100000ULL +# define A_CL_JOIN_ANNOUNCE 0x0000000000200000ULL + /* Request membership to the DC list */ +# define A_CL_JOIN_REQUEST 0x0000000000400000ULL + /* Did the DC accept or reject the request */ +# define A_CL_JOIN_RESULT 0x0000000000800000ULL + +/* -- Recovery, DC start/stop -- */ + /* Something bad happened, try to recover */ +# define A_RECOVER 0x0000000001000000ULL + /* Hook to perform any actions (apart from starting, the TE, PE + * and gathering the latest CIB) that might be necessary before + * giving up the responsibilities of being the DC. + */ +# define A_DC_RELEASE 0x0000000002000000ULL + /* */ +# define A_DC_RELEASED 0x0000000004000000ULL + /* Hook to perform any actions (apart from starting, the TE, PE + * and gathering the latest CIB) that might be necessary before + * taking over the responsibilities of being the DC. + */ +# define A_DC_TAKEOVER 0x0000000008000000ULL + +/* -- Shutdown actions -- */ +# define A_SHUTDOWN 0x0000000010000000ULL +# define A_STOP 0x0000000020000000ULL +# define A_EXIT_0 0x0000000040000000ULL +# define A_EXIT_1 0x0000000080000000ULL + +# define A_SHUTDOWN_REQ 0x0000000100000000ULL +# define A_ELECTION_CHECK 0x0000000200000000ULL +# define A_DC_JOIN_FINAL 0x0000000400000000ULL + +/* -- CIB actions -- */ +# define A_CIB_START 0x0000020000000000ULL +# define A_CIB_STOP 0x0000040000000000ULL + +/* -- Transition Engine actions -- */ + /* Attempt to reach the newly calculated cluster state. This is + * only called once per transition (except if it is asked to + * stop the transition or start a new one). + * Once given a cluster state to reach, the TE will determine + * tasks that can be performed in parallel, execute them, wait + * for replies and then determine the next set until the new + * state is reached or no further tasks can be taken. + */ +# define A_TE_INVOKE 0x0000100000000000ULL +# define A_TE_START 0x0000200000000000ULL +# define A_TE_STOP 0x0000400000000000ULL +# define A_TE_CANCEL 0x0000800000000000ULL +# define A_TE_HALT 0x0001000000000000ULL + +/* -- Policy Engine actions -- */ + /* Calculate the next state for the cluster. This is only + * invoked once per needed calculation. + */ +# define A_PE_INVOKE 0x0002000000000000ULL +# define A_PE_START 0x0004000000000000ULL +# define A_PE_STOP 0x0008000000000000ULL +/* -- Misc actions -- */ + /* Add a system generate "block" so that resources arent moved + * to or are activly moved away from the affected node. This + * way we can return quickly even if busy with other things. + */ +# define A_NODE_BLOCK 0x0010000000000000ULL + /* Update our information in the local CIB */ +# define A_UPDATE_NODESTATUS 0x0020000000000000ULL +# define A_READCONFIG 0x0080000000000000ULL + +/* -- LRM Actions -- */ + /* Connect to pacemaker-execd */ +# define A_LRM_CONNECT 0x0100000000000000ULL + /* Disconnect from pacemaker-execd */ +# define A_LRM_DISCONNECT 0x0200000000000000ULL +# define A_LRM_INVOKE 0x0400000000000000ULL +# define A_LRM_EVENT 0x0800000000000000ULL + +/* -- Logging actions -- */ +# define A_LOG 0x1000000000000000ULL +# define A_ERROR 0x2000000000000000ULL +# define A_WARN 0x4000000000000000ULL + +# define O_EXIT (A_SHUTDOWN|A_STOP|A_LRM_DISCONNECT|A_HA_DISCONNECT|A_EXIT_0|A_CIB_STOP) +# define O_RELEASE (A_DC_TIMER_STOP|A_DC_RELEASE|A_PE_STOP|A_TE_STOP|A_DC_RELEASED) +# define O_PE_RESTART (A_PE_START|A_PE_STOP) +# define O_TE_RESTART (A_TE_START|A_TE_STOP) +# define O_CIB_RESTART (A_CIB_START|A_CIB_STOP) +# define O_LRM_RECONNECT (A_LRM_CONNECT|A_LRM_DISCONNECT) +# define O_DC_TIMER_RESTART (A_DC_TIMER_STOP|A_DC_TIMER_START) +/*====================================== + * + * "register" contents + * + * Things we may want to remember regardless of which state we are in. + * + * These also count as inputs for synthesizing I_* + * + *======================================*/ +# define R_THE_DC 0x00000001ULL + /* Are we the DC? */ +# define R_STARTING 0x00000002ULL + /* Are we starting up? */ +# define R_SHUTDOWN 0x00000004ULL + /* Are we trying to shut down? */ +# define R_STAYDOWN 0x00000008ULL + /* Should we restart? */ + +# define R_JOIN_OK 0x00000010ULL /* Have we completed the join process */ +# define R_READ_CONFIG 0x00000040ULL +# define R_INVOKE_PE 0x00000080ULL + /* Does the PE needed to be invoked at + the next appropriate point? */ + +# define R_CIB_CONNECTED 0x00000100ULL + /* Is the CIB connected? */ +# define R_PE_CONNECTED 0x00000200ULL + /* Is the Policy Engine connected? */ +# define R_TE_CONNECTED 0x00000400ULL + /* Is the Transition Engine connected? */ +# define R_LRM_CONNECTED 0x00000800ULL // Is pacemaker-execd connected? + +# define R_CIB_REQUIRED 0x00001000ULL + /* Is the CIB required? */ +# define R_PE_REQUIRED 0x00002000ULL + /* Is the Policy Engine required? */ +# define R_TE_REQUIRED 0x00004000ULL + /* Is the Transition Engine required? */ +# define R_ST_REQUIRED 0x00008000ULL + /* Is the Stonith daemon required? */ + +# define R_CIB_DONE 0x00010000ULL + /* Have we calculated the CIB? */ +# define R_HAVE_CIB 0x00020000ULL /* Do we have an up-to-date CIB */ +# define R_CIB_ASKED 0x00040000ULL /* Have we asked for an up-to-date CIB */ + +# define R_MEMBERSHIP 0x00100000ULL /* Have we got cluster layer data yet */ +# define R_PEER_DATA 0x00200000ULL /* Have we got T_CL_STATUS data yet */ + +# define R_HA_DISCONNECTED 0x00400000ULL /* did we sign out of our own accord */ + +# define R_REQ_PEND 0x01000000ULL + /* Are there Requests waiting for + processing? */ +# define R_PE_PEND 0x02000000ULL + /* Has the PE been invoked and we're + awaiting a reply? */ +# define R_TE_PEND 0x04000000ULL + /* Has the TE been invoked and we're + awaiting completion? */ +# define R_RESP_PEND 0x08000000ULL + /* Do we have clients waiting on a + response? if so perhaps we shouldn't + stop yet */ + +# define R_IN_TRANSITION 0x10000000ULL + /* */ +# define R_SENT_RSC_STOP 0x20000000ULL /* Have we sent a stop action to all + * resources in preparation for + * shutting down */ + +# define R_IN_RECOVERY 0x80000000ULL + +/* + * Magic RC used within the controller to indicate direct nacks + * (operation is invalid in current state) + */ +#define CRM_DIRECT_NACK_RC (99) + +enum crmd_fsa_cause { + C_UNKNOWN = 0, + C_STARTUP, + C_IPC_MESSAGE, + C_HA_MESSAGE, + C_CRMD_STATUS_CALLBACK, + C_LRM_OP_CALLBACK, + C_TIMER_POPPED, + C_SHUTDOWN, + C_FSA_INTERNAL, +}; + typedef struct fsa_timer_s fsa_timer_t; struct fsa_timer_s { guint source_id; /* timer source id */ int period_ms; /* timer period */ enum crmd_fsa_input fsa_input; gboolean(*callback) (gpointer data); gboolean repeat; int counter; }; enum fsa_data_type { fsa_dt_none, fsa_dt_ha_msg, fsa_dt_xml, fsa_dt_lrm, }; typedef struct fsa_data_s fsa_data_t; struct fsa_data_s { int id; enum crmd_fsa_input fsa_input; enum crmd_fsa_cause fsa_cause; long long actions; const char *origin; void *data; enum fsa_data_type data_type; }; -extern enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause); - /* Global FSA stuff */ extern volatile gboolean do_fsa_stall; extern volatile enum crmd_fsa_state fsa_state; extern volatile long long fsa_input_register; extern volatile long long fsa_actions; extern cib_t *fsa_cib_conn; extern char *fsa_our_uname; extern char *fsa_our_uuid; extern char *fsa_pe_ref; /* the last invocation of the PE */ extern char *fsa_our_dc; extern char *fsa_our_dc_version; extern GListPtr fsa_message_queue; extern char *fsa_cluster_name; -extern election_t *fsa_election; /* */ -extern fsa_timer_t *election_trigger; /* */ -extern fsa_timer_t *election_timeout; /* */ -extern fsa_timer_t *shutdown_escalation_timer; /* */ +extern election_t *fsa_election; +extern fsa_timer_t *election_trigger; +extern fsa_timer_t *election_timeout; +extern fsa_timer_t *shutdown_escalation_timer; extern fsa_timer_t *transition_timer; extern fsa_timer_t *integration_timer; extern fsa_timer_t *finalization_timer; extern fsa_timer_t *wait_timer; extern fsa_timer_t *recheck_timer; extern crm_trigger_t *fsa_source; extern crm_trigger_t *config_read; -/* these two should be moved elsewhere... */ -extern void do_update_cib_nodes(gboolean overwrite, const char *caller); +extern unsigned long long saved_ccm_membership_id; +extern gboolean ever_had_quorum; + +// These two should be moved elsewhere +void do_update_cib_nodes(gboolean overwrite, const char *caller); int crmd_cib_smart_opt(void); +const char *fsa_input2string(enum crmd_fsa_input input); +const char *fsa_state2string(enum crmd_fsa_state state); +const char *fsa_cause2string(enum crmd_fsa_cause cause); +const char *fsa_action2string(long long action); + +enum crmd_fsa_state s_crmd_fsa(enum crmd_fsa_cause cause); + # define AM_I_DC is_set(fsa_input_register, R_THE_DC) -# define AM_I_OPERATIONAL (is_set(fsa_input_register, R_STARTING)==FALSE) -extern unsigned long long saved_ccm_membership_id; -extern gboolean ever_had_quorum; +# define AM_I_OPERATIONAL (is_set(fsa_input_register, R_STARTING) == FALSE) +# define trigger_fsa(source) do { \ + crm_trace("Triggering FSA: %s", __FUNCTION__); \ + mainloop_set_trigger(source); \ + } while(0) # include # include - -# define trigger_fsa(source) crm_trace("Triggering FSA: %s", __FUNCTION__); \ - mainloop_set_trigger(source); - #endif diff --git a/daemons/controld/fsa_defines.h b/daemons/controld/fsa_defines.h deleted file mode 100644 index 79c1da4f3a..0000000000 --- a/daemons/controld/fsa_defines.h +++ /dev/null @@ -1,472 +0,0 @@ -/* - * Copyright 2004-2018 Andrew Beekhof - * - * This source code is licensed under the GNU Lesser General Public License - * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY. - */ - -#ifndef FSA_DEFINES__H -# define FSA_DEFINES__H - -/*====================================== - * States the controller can be in - *======================================*/ -enum crmd_fsa_state { - S_IDLE = 0, /* Nothing happening */ - - S_ELECTION, /* Take part in the election algorithm as - * described below - */ - S_INTEGRATION, /* integrate that status of new nodes (which is - * all of them if we have just been elected DC) - * to form a complete and up-to-date picture of - * the CIB - */ - S_FINALIZE_JOIN, /* integrate that status of new nodes (which is - * all of them if we have just been elected DC) - * to form a complete and up-to-date picture of - * the CIB - */ - S_NOT_DC, /* we are in non-DC mode */ - S_POLICY_ENGINE, /* Determine next stable state of the cluster */ - S_RECOVERY, /* Something bad happened, check everything is ok - * before continuing and attempt to recover if - * required - */ - S_RELEASE_DC, /* we were the DC, but now we arent anymore, - * possibly by our own request, and we should - * release all unnecessary sub-systems, finish - * any pending actions, do general cleanup and - * unset anything that makes us think we are - * special :) - */ - S_STARTING, /* we are just starting out */ - S_PENDING, /* we are not a full/active member yet */ - S_STOPPING, /* We are in the final stages of shutting down */ - S_TERMINATE, /* We are going to shutdown, this is the equiv of - * "Sending TERM signal to all processes" in Linux - * and in worst case scenarios could be considered - * a self STONITH - */ - S_TRANSITION_ENGINE, /* Attempt to make the calculated next stable - * state of the cluster a reality - */ - - S_HALT, /* Freeze - don't do anything - * Something bad happened that needs the admin to fix - * Wait for I_ELECTION - */ - - /* ----------- Last input found in table is above ---------- */ - S_ILLEGAL /* This is an illegal FSA state */ - /* (must be last) */ -}; - -# define MAXSTATE S_ILLEGAL -/* - A state diagram can be constructed from the dc_fsa.dot with the - following command: - - dot -Tpng crmd_fsa.dot > crmd_fsa.png - -Description: - - Once we start and do some basic sanity checks, we go into the - S_NOT_DC state and await instructions from the DC or input from - the cluster layer which indicates the election algorithm needs to run. - - If the election algorithm is triggered we enter the S_ELECTION state - from where we can either go back to the S_NOT_DC state or progress - to the S_INTEGRATION state (or S_RELEASE_DC if we used to be the DC - but arent anymore). - - The election algorithm has been adapted from - http://www.cs.indiana.edu/cgi-bin/techreports/TRNNN.cgi?trnum=TR521 - - Loosely known as the Bully Algorithm, its major points are: - - Election is initiated by any node (N) notices that the controller - is no longer responding - - Concurrent multiple elections are possible - - Algorithm - + N sends ELECTION messages to all nodes that occur earlier in - the cluster layer's membership list. - + If no one responds, N wins and becomes controller - + N sends out CONTROLLER messages to all other nodes in the - partition - + If one of higher-ups answers, it takes over. N is done. - - Once the election is complete, if we are the DC, we enter the - S_INTEGRATION state which is a DC-in-waiting style state. We are - the DC, but we shouldn't do anything yet because we may not have an - up-to-date picture of the cluster. There may of course be times - when this fails, so we should go back to the S_RECOVERY stage and - check everything is ok. We may also end up here if a new node came - online, since each node is authorative on itself and we would want - to incorporate its information into the CIB. - - Once we have the latest CIB, we then enter the S_POLICY_ENGINE state - where invoke the Policy Engine. It is possible that between - invoking the Policy Engine and receiving an answer, that we receive - more input. In this case we would discard the orginal result and - invoke it again. - - Once we are satisfied with the output from the Policy Engine we - enter S_TRANSITION_ENGINE and feed the Policy Engine's output to the - Transition Engine who attempts to make the Policy Engine's - calculation a reality. If the transition completes successfully, - we enter S_IDLE, otherwise we go back to S_POLICY_ENGINE with the - current unstable state and try again. - - Of course we may be asked to shutdown at any time, however we must - progress to S_NOT_DC before doing so. Once we have handed over DC - duties to another node, we can then shut down like everyone else, - that is by asking the DC for permission and waiting it to take all - our resources away. - - The case where we are the DC and the only node in the cluster is a - special case and handled as an escalation which takes us to - S_SHUTDOWN. Similarly if any other point in the shutdown - fails or stalls, this is escalated and we end up in S_TERMINATE. - - At any point, the controller can relay messages for its subsystems, - but outbound messages (from subsystems) should probably be blocked - until S_INTEGRATION (for the DC) or the join protocol has - completed (for non-DC controllers). -*/ - -/*====================================== - * - * Inputs/Events/Stimuli to be given to the finite state machine - * - * Some of these a true events, and others a synthesised based on - * the "register" (see below) and the contents or source of messages. - * - * At this point, my plan is to have a loop of some sort that keeps - * going until receiving I_NULL - * - *======================================*/ -enum crmd_fsa_input { -/* 0 */ - I_NULL, /* Nothing happened */ -/* 1 */ - - I_CIB_OP, /* An update to the CIB occurred */ - I_CIB_UPDATE, /* An update to the CIB occurred */ - I_DC_TIMEOUT, /* We have lost communication with the DC */ - I_ELECTION, /* Someone started an election */ - I_PE_CALC, /* The Policy Engine needs to be invoked */ - I_RELEASE_DC, /* The election completed and we were not - * elected, but we were the DC beforehand - */ - I_ELECTION_DC, /* The election completed and we were (re-)elected - * DC - */ - I_ERROR, /* Something bad happened (more serious than - * I_FAIL) and may not have been due to the action - * being performed. For example, we may have lost - * our connection to the CIB. - */ -/* 9 */ - I_FAIL, /* The action failed to complete successfully */ - I_INTEGRATED, - I_FINALIZED, - I_NODE_JOIN, /* A node has entered the cluster */ - I_NOT_DC, /* We are not and were not the DC before or after - * the current operation or state - */ - I_RECOVERED, /* The recovery process completed successfully */ - I_RELEASE_FAIL, /* We could not give up DC status for some reason - */ - I_RELEASE_SUCCESS, /* We are no longer the DC */ - I_RESTART, /* The current set of actions needs to be - * restarted - */ - I_TE_SUCCESS, /* Some non-resource, non-cluster-layer action is required - * of us, eg. ping - */ -/* 20 */ - I_ROUTER, /* Do our job as router and forward this to the - * right place - */ - I_SHUTDOWN, /* We are asking to shutdown */ - I_STOP, /* We have been told to shutdown */ - I_TERMINATE, /* Actually exit */ - I_STARTUP, - I_PE_SUCCESS, /* The action completed successfully */ - - I_JOIN_OFFER, /* The DC is offering membership */ - I_JOIN_REQUEST, /* The client is requesting membership */ - I_JOIN_RESULT, /* If not the DC: The result of a join request - * Else: A client is responding with its local state info - */ - - I_WAIT_FOR_EVENT, /* we may be waiting for an async task to "happen" - * and until it does, we can't do anything else - */ - - I_DC_HEARTBEAT, /* The DC is telling us that it is alive and well */ - - I_LRM_EVENT, - -/* 30 */ - I_PENDING, - I_HALT, - - /* ------------ Last input found in table is above ----------- */ - I_ILLEGAL /* This is an illegal value for an FSA input */ - /* (must be last) */ -}; - -# define MAXINPUT I_ILLEGAL - -# define I_MESSAGE I_ROUTER - -/*====================================== - * - * actions - * - * Some of the actions below will always occur together for now, but I can - * foresee that this may not always be the case. So I've split them up so - * that if they ever do need to be called independently in the future, it - * won't be a problem. - * - * For example, separating A_LRM_CONNECT from A_STARTUP might be useful - * if we ever try to recover from a faulty or disconnected LRM. - * - *======================================*/ - - /* Don't do anything */ -# define A_NOTHING 0x0000000000000000ULL - -/* -- Startup actions -- */ - /* Hook to perform any actions (other than connecting to other daemons) - * that might be needed as part of the startup. - */ -# define A_STARTUP 0x0000000000000001ULL - /* Hook to perform any actions that might be needed as part - * after startup is successful. - */ -# define A_STARTED 0x0000000000000002ULL - /* Connect to cluster layer */ -# define A_HA_CONNECT 0x0000000000000004ULL -# define A_HA_DISCONNECT 0x0000000000000008ULL - -# define A_INTEGRATE_TIMER_START 0x0000000000000010ULL -# define A_INTEGRATE_TIMER_STOP 0x0000000000000020ULL -# define A_FINALIZE_TIMER_START 0x0000000000000040ULL -# define A_FINALIZE_TIMER_STOP 0x0000000000000080ULL - -/* -- Election actions -- */ -# define A_DC_TIMER_START 0x0000000000000100ULL -# define A_DC_TIMER_STOP 0x0000000000000200ULL -# define A_ELECTION_COUNT 0x0000000000000400ULL -# define A_ELECTION_VOTE 0x0000000000000800ULL - -# define A_ELECTION_START 0x0000000000001000ULL - -/* -- Message processing -- */ - /* Process the queue of requests */ -# define A_MSG_PROCESS 0x0000000000002000ULL - /* Send the message to the correct recipient */ -# define A_MSG_ROUTE 0x0000000000004000ULL - - /* Send a welcome message to new node(s) */ -# define A_DC_JOIN_OFFER_ONE 0x0000000000008000ULL - -/* -- Server Join protocol actions -- */ - /* Send a welcome message to all nodes */ -# define A_DC_JOIN_OFFER_ALL 0x0000000000010000ULL - /* Process the remote node's ack of our join message */ -# define A_DC_JOIN_PROCESS_REQ 0x0000000000020000ULL - /* Send out the reults of the Join phase */ -# define A_DC_JOIN_FINALIZE 0x0000000000040000ULL - /* Send out the reults of the Join phase */ -# define A_DC_JOIN_PROCESS_ACK 0x0000000000080000ULL - -/* -- Client Join protocol actions -- */ -# define A_CL_JOIN_QUERY 0x0000000000100000ULL -# define A_CL_JOIN_ANNOUNCE 0x0000000000200000ULL - /* Request membership to the DC list */ -# define A_CL_JOIN_REQUEST 0x0000000000400000ULL - /* Did the DC accept or reject the request */ -# define A_CL_JOIN_RESULT 0x0000000000800000ULL - -/* -- Recovery, DC start/stop -- */ - /* Something bad happened, try to recover */ -# define A_RECOVER 0x0000000001000000ULL - /* Hook to perform any actions (apart from starting, the TE, PE - * and gathering the latest CIB) that might be necessary before - * giving up the responsibilities of being the DC. - */ -# define A_DC_RELEASE 0x0000000002000000ULL - /* */ -# define A_DC_RELEASED 0x0000000004000000ULL - /* Hook to perform any actions (apart from starting, the TE, PE - * and gathering the latest CIB) that might be necessary before - * taking over the responsibilities of being the DC. - */ -# define A_DC_TAKEOVER 0x0000000008000000ULL - -/* -- Shutdown actions -- */ -# define A_SHUTDOWN 0x0000000010000000ULL -# define A_STOP 0x0000000020000000ULL -# define A_EXIT_0 0x0000000040000000ULL -# define A_EXIT_1 0x0000000080000000ULL - -# define A_SHUTDOWN_REQ 0x0000000100000000ULL -# define A_ELECTION_CHECK 0x0000000200000000ULL -# define A_DC_JOIN_FINAL 0x0000000400000000ULL - -/* -- CIB actions -- */ -# define A_CIB_START 0x0000020000000000ULL -# define A_CIB_STOP 0x0000040000000000ULL - -/* -- Transition Engine actions -- */ - /* Attempt to reach the newly calculated cluster state. This is - * only called once per transition (except if it is asked to - * stop the transition or start a new one). - * Once given a cluster state to reach, the TE will determine - * tasks that can be performed in parallel, execute them, wait - * for replies and then determine the next set until the new - * state is reached or no further tasks can be taken. - */ -# define A_TE_INVOKE 0x0000100000000000ULL -# define A_TE_START 0x0000200000000000ULL -# define A_TE_STOP 0x0000400000000000ULL -# define A_TE_CANCEL 0x0000800000000000ULL -# define A_TE_HALT 0x0001000000000000ULL - -/* -- Policy Engine actions -- */ - /* Calculate the next state for the cluster. This is only - * invoked once per needed calculation. - */ -# define A_PE_INVOKE 0x0002000000000000ULL -# define A_PE_START 0x0004000000000000ULL -# define A_PE_STOP 0x0008000000000000ULL -/* -- Misc actions -- */ - /* Add a system generate "block" so that resources arent moved - * to or are activly moved away from the affected node. This - * way we can return quickly even if busy with other things. - */ -# define A_NODE_BLOCK 0x0010000000000000ULL - /* Update our information in the local CIB */ -# define A_UPDATE_NODESTATUS 0x0020000000000000ULL -# define A_READCONFIG 0x0080000000000000ULL - -/* -- LRM Actions -- */ - /* Connect to pacemaker-execd */ -# define A_LRM_CONNECT 0x0100000000000000ULL - /* Disconnect from pacemaker-execd */ -# define A_LRM_DISCONNECT 0x0200000000000000ULL -# define A_LRM_INVOKE 0x0400000000000000ULL -# define A_LRM_EVENT 0x0800000000000000ULL - -/* -- Logging actions -- */ -# define A_LOG 0x1000000000000000ULL -# define A_ERROR 0x2000000000000000ULL -# define A_WARN 0x4000000000000000ULL - -# define O_EXIT (A_SHUTDOWN|A_STOP|A_LRM_DISCONNECT|A_HA_DISCONNECT|A_EXIT_0|A_CIB_STOP) -# define O_RELEASE (A_DC_TIMER_STOP|A_DC_RELEASE|A_PE_STOP|A_TE_STOP|A_DC_RELEASED) -# define O_PE_RESTART (A_PE_START|A_PE_STOP) -# define O_TE_RESTART (A_TE_START|A_TE_STOP) -# define O_CIB_RESTART (A_CIB_START|A_CIB_STOP) -# define O_LRM_RECONNECT (A_LRM_CONNECT|A_LRM_DISCONNECT) -# define O_DC_TIMER_RESTART (A_DC_TIMER_STOP|A_DC_TIMER_START) -/*====================================== - * - * "register" contents - * - * Things we may want to remember regardless of which state we are in. - * - * These also count as inputs for synthesizing I_* - * - *======================================*/ -# define R_THE_DC 0x00000001ULL - /* Are we the DC? */ -# define R_STARTING 0x00000002ULL - /* Are we starting up? */ -# define R_SHUTDOWN 0x00000004ULL - /* Are we trying to shut down? */ -# define R_STAYDOWN 0x00000008ULL - /* Should we restart? */ - -# define R_JOIN_OK 0x00000010ULL /* Have we completed the join process */ -# define R_READ_CONFIG 0x00000040ULL -# define R_INVOKE_PE 0x00000080ULL - /* Does the PE needed to be invoked at - the next appropriate point? */ - -# define R_CIB_CONNECTED 0x00000100ULL - /* Is the CIB connected? */ -# define R_PE_CONNECTED 0x00000200ULL - /* Is the Policy Engine connected? */ -# define R_TE_CONNECTED 0x00000400ULL - /* Is the Transition Engine connected? */ -# define R_LRM_CONNECTED 0x00000800ULL // Is pacemaker-execd connected? - -# define R_CIB_REQUIRED 0x00001000ULL - /* Is the CIB required? */ -# define R_PE_REQUIRED 0x00002000ULL - /* Is the Policy Engine required? */ -# define R_TE_REQUIRED 0x00004000ULL - /* Is the Transition Engine required? */ -# define R_ST_REQUIRED 0x00008000ULL - /* Is the Stonith daemon required? */ - -# define R_CIB_DONE 0x00010000ULL - /* Have we calculated the CIB? */ -# define R_HAVE_CIB 0x00020000ULL /* Do we have an up-to-date CIB */ -# define R_CIB_ASKED 0x00040000ULL /* Have we asked for an up-to-date CIB */ - -# define R_MEMBERSHIP 0x00100000ULL /* Have we got cluster layer data yet */ -# define R_PEER_DATA 0x00200000ULL /* Have we got T_CL_STATUS data yet */ - -# define R_HA_DISCONNECTED 0x00400000ULL /* did we sign out of our own accord */ - -# define R_REQ_PEND 0x01000000ULL - /* Are there Requests waiting for - processing? */ -# define R_PE_PEND 0x02000000ULL - /* Has the PE been invoked and we're - awaiting a reply? */ -# define R_TE_PEND 0x04000000ULL - /* Has the TE been invoked and we're - awaiting completion? */ -# define R_RESP_PEND 0x08000000ULL - /* Do we have clients waiting on a - response? if so perhaps we shouldn't - stop yet */ - -# define R_IN_TRANSITION 0x10000000ULL - /* */ -# define R_SENT_RSC_STOP 0x20000000ULL /* Have we sent a stop action to all - * resources in preparation for - * shutting down */ - -# define R_IN_RECOVERY 0x80000000ULL - -/* - * Magic RC used within the controller to indicate direct nacks - * (operation is invalid in current state) - */ -#define CRM_DIRECT_NACK_RC (99) - -enum crmd_fsa_cause { - C_UNKNOWN = 0, - C_STARTUP, - C_IPC_MESSAGE, - C_HA_MESSAGE, - C_CRMD_STATUS_CALLBACK, - C_LRM_OP_CALLBACK, - C_TIMER_POPPED, - C_SHUTDOWN, - C_FSA_INTERNAL, -}; - -extern const char *fsa_input2string(enum crmd_fsa_input input); -extern const char *fsa_state2string(enum crmd_fsa_state state); -extern const char *fsa_cause2string(enum crmd_fsa_cause cause); -extern const char *fsa_action2string(long long action); - -#endif