diff --git a/README.devmap b/README.devmap index 6fab62c6..5ccd07cf 100644 --- a/README.devmap +++ b/README.devmap @@ -1,1223 +1,1223 @@ Copyright (c) 2002-2004 MontaVista Software, Inc. Copyright (c) 2006 Red Hat, Inc. All rights reserved. This software licensed under BSD license, the text of which follows: Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: - Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. - Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. - Neither the name of the MontaVista Software, Inc. nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ------------------------------------------------------------------------------- This file provides a map for developers to understand how to contribute to the corosync project. The purpose of this document is to prepare a developer to write a service for corosync, or understand the architecture of corosync. The following is described in this document: * all files, purpose, and dependencies * architecture of corosync * taking advantage of virtual synchrony * adding libraries * adding services ------------------------------------------------------------------------------- all files, purpose, and dependencies. ------------------------------------------------------------------------------- *----------------* *- AIS INCLUDES -* *----------------* include/saAmf.h ----------------- Definitions for the AMF interface. include/saCkpt.h ------------------ Definitions for the CKPT interface. include/saClm.h ----------------- Definitions for the CLM interface. include/saEvt.h ----------------- Definitions for the EVT interface. include/saLck.h ----------------- Definitions for the LCK interface. include/cfg.h Definitions for the CFG interface. include/cpg.h Definitions for the CPG interface. include/evs.h Definitions for the EVS interface. include/ipc_amf.h IPC interface between client and server for AMF service. include/ipc_cfg.h IPC interface between client and server for CFG service. include/ipc_ckpt.h IPC interface between client and server for CKPT service. include/ipc_clm.h IPC interface between client and server for CLM service. include/ipc_cpg.h IPC interface between client and server for CPG service. include/ipc_evs.h IPC interface between client and server for EVS service.
include/ipc_evt.h IPC interface between client and server for EVT service. include/ipc_gen.h IPC interface for generic operations. include/ipc_lck.h IPC interface between client and server for LCK service. include/ipc_msg.h IPC interface between client and server for MSG service. include/hdb.h Handle database implementation. include/list.h Linked list implementation. include/swab.h Byte swapping implementation. include/queue.h FIFO queue implementation. include/sq.h Sort queue where items are sorted according to a sequence number. Avoids sorting; hence, insertion of a new element is O(1). Inline implementation. Depends on list. *---------------* * AIS LIBRARIES * *---------------* lib/amf.c --------- AMF user library linked into user application. lib/cfg.c --------- CFG user library linked into user application. lib/ckpt.c --------- CKPT user library linked into user application. lib/clm.c --------- CLM user library linked into user application. lib/cpg.c --------- CPG user library linked into user application. lib/evs.c --------- EVS user library linked into user application. lib/evt.c --------- EVT user library linked into user application. lib/lck.c --------- LCK user library linked into user application. lib/msg.c --------- MSG user library linked into user application. lib/util.c ---------- Utility functions used by all libraries. *-----------------* *- AIS EXECUTIVE -* *-----------------* exec/aisparser.{c|h} Parser plugin for default configuration file format. exec/aispoll.{c|h} Poll abstraction interface. exec/amfapp.c AMF application handling. exec/amfcluster.c AMF cluster handling. exec/amfcomp.c AMF component level handling. exec/amf.h Defines all AMF symbol names. exec/amfnode.c AMF node level handling. exec/amfsg.c AMF service group handling. exec/amfsi.c AMF service instance handling. exec/amfsu.c AMF service unit handling. exec/amfutil.c AMF utility functions. exec/cfg.c Server side implementation of CFG service which is used to display redundant ring status and re-enable redundant rings. exec/ckpt.c Server side implementation of Checkpointing (CKPT API). exec/clm.c Server side implementation of Cluster Membership (CLM API). exec/cpg.c Server side implementation of closed process groups (CPG API). exec/crypto.{c|h} Cryptography functions used by corosync. exec/evs.c Server side implementation of extended virtual synchrony passthrough (EVS API). exec/evt.c Server side implementation of Event Service (EVT API). exec/ipc.{c|h} All IPC operations used by corosync. exec/jhash.h A hash routine. exec/keygen.c Secret key generator used by corosync encryption tools. exec/lck.c Server side implementation of the distributed lock service (LCK API). exec/main.{c|h} Main function which connects all components together. exec/mainconfig.{c|h} Reads main configuration that is set in the configuration parser. exec/mempool.{c|h} Currently unused. exec/msg.c Server side implementation of message service (MSG API). exec/objdb.{c|h} Object database used to configure services. exec/corosync-instantiate.c Instantiates a component by forking and exec'ing it and writing its pid to a pid file. exec/print.{c|h} Non-blocking thread-based logging service with overflow protection. exec/service.{c|h} Service handling routines including the default service handler description.
exec/sync.{c|h} The synchronization service implementation. exec/timer.{c|h} Thread-based timer service. exec/tlist.h Timer list used to expire timers. exec/totemconfig.{c|h} Configures totem from the data produced by aisparser when parsing the configuration file. exec/totem.h General definitions for the totem protocol used by the totem stack. exec/totemip.{c|h} IP handling functions for totem - lowest on stack. exec/totemmrp.{c|h} The totem multi-ring protocol, currently unimplemented. Between totemsrp and totempg. exec/totemnet.{c|h} Network handling functions for totem - between totemip and totemrrp. exec/totempg.{c|h} Process groups interface which is used by all applications - highest on stack. exec/totemrrp.{c|h} Redundant ring functions for totem - between totemnet and totemsrp. exec/util.{c|h} Utility functions used by corosync executive. exec/version.h Defines build version. exec/vsf.h Virtual Synchrony plugin API. exec/vsf_ykd.c Virtual Synchrony YKD Dynamic Linear Voting algorithm. exec/wthread.{c|h} Worker threads API. loc --- Counts the lines of code in the AIS implementation. ------------------------------------------------------------------------------- architecture of corosync ------------------------------------------------------------------------------- The corosync standards-based cluster framework is a generic cluster plugin architecture used to create cluster APIs and services. Usually there are libraries which implement APIs and are linked into the end user application. The libraries request services from the aisexec process, called the AIS executive. The AIS executive uses the Totem protocol stack to communicate within the cluster and execute operations on behalf of the user. Finally the response of the API is delivered once the operation has completed. -------------------------------------------------- | AMF and more services libraries | -------------------------------------------------- | IPC API | -------------------------------------------------- | corosync Executive | | | | +---------+ +--------+ +---------+ | | | Object | | AIS | | Service | | | | Database| | Config | | Handler | | | | Service | | Parser | | Manager | | | +---------+ +--------+ +---------+ | | +-------+ +-------+ | | | AMF | | more | | | |Service| |svcs...| | | +-------+ +-------+ | | +---------+ | | | Sync | | | | Service | | | +---------+ | | +---------+ | | | VSF | | | | Service | | | +---------+ | | +--------------------------------+ +--------+ | | | Totem | | Timers | | | | Stack | | API | | | +--------------------------------+ +--------+ | | +-----------+ | | | Poll | | | | Interface | | | +-----------+ | | | ------------------------------------------------- Figure 1: corosync Architecture Every application that intends to use corosync links with the libais library. This library uses IPC, or more specifically BSD unix sockets, to communicate with the executive. The library is small and responsible only for packaging the request into a message. This message is sent, using IPC, to the executive which then processes it. The library then waits for a response. The library itself contains very little intelligence. Some utility services are provided: * create a connection to the executive * send messages to the executive * retrieve messages from the executive * poll on a file descriptor * create a handle instance * destroy a handle instance * get a reference to a handle instance * release a reference to a handle instance When a library connects, it sends the service type via a message.
The service type is stored and used later to reference the message handlers for both the library message handlers and executive message handlers. Every message sent contains an integer identifier, which is used to index into an array of message handlers to determine the correct message handler to execute for the library. Hence a message is uniquely identified by the message handler ID number and the service handler ID number. When a library sends a message via IPC, the delivery of the message occurs to the proper library message handler. The library message handler is responsible for sending the message via the totem process groups API to all nodes in the system. This simplifies the library handler significantly. The main purpose of the library handler should be to package the library request into a message that can be sent to all nodes. The totem process groups API sends the message according to the extended virtual synchrony model. The group messaging interface also delivers the message according to the extended virtual synchrony model. This has several advantages which are described in the virtual synchrony section. One advantage that must be described now is that messages are self-delivered; if a node sends a message, that same message is delivered back to that node. When the executive message is delivered, it is processed by the executive message handler. The executive message handler contains the brains of AIS and is responsible for making all decisions relating to the request from the libais library user. ------------------------------------------------------------------------------- taking advantage of virtual synchrony ------------------------------------------------------------------------------- definitions: processor: a system responsible for executing the virtual synchrony model configuration: the list of processors under which messages are delivered partition: one or more processors leave the configuration merge: one or more processors join the configuration group messaging: sending a message from one sender to many receivers Virtual synchrony is a model for group messaging. This is often confused with particular implementations of virtual synchrony. Try to focus on what virtual synchrony provides, not how it provides it, unless interested in working on the group messaging interface of corosync. Virtual synchrony provides several advantages: * integrated membership * strong membership guarantees * agreed ordering of delivered messages * same delivery of configuration changes and messages on every node * self-delivery * reliable communication in the face of unreliable networks * recovery of messages sent within a configuration where possible * use of network multicast using standard UDP/IP Integrated membership allows the group messaging interface to give configuration change events to the API services. This is obviously beneficial to the cluster membership service (and its respective API), but it is helpful to other services as described later. Strong membership guarantees allow a distributed application to make decisions based upon the configuration (membership). Every service in corosync registers a configuration change function. This function is called whenever a configuration change occurs. The information passed is the current processors, the processors that have left the configuration, and the processors that have joined the configuration. This information is then used to make decisions within a distributed state machine, as the sketch below illustrates.
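For illustration, a service's configuration change function might look like the following minimal sketch (my_service is a hypothetical name; the exact confchg_fn prototype appears in the adding services section below):

    static void my_service_confchg_fn (
        enum totem_configuration_type configuration_type,
        unsigned int *member_list, int member_list_entries,
        unsigned int *left_list, int left_list_entries,
        unsigned int *joined_list, int joined_list_entries,
        struct memb_ring_id *ring_id)
    {
        int i;

        /*
         * Every processor is delivered the same lists in the same
         * order, so every processor drives its state machine to the
         * same decisions.
         */
        for (i = 0; i < left_list_entries; i++) {
            /* e.g. fail over work owned by left_list[i] */
        }
        for (i = 0; i < joined_list_entries; i++) {
            /* e.g. schedule state synchronization with joined_list[i] */
        }
    }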
One example usage is that an AMF component running on a specific processor has left the configuration, so failover actions must now be taken with the new configuration (and known components). Virtual synchrony requires that messages be delivered in agreed order. FIFO order indicates that one sender and one receiver agree on the order of messages sent. Agreed ordering takes this requirement to groups, requiring that one sender and all receivers agree on the order of messages sent. Consider a lock service. The service is responsible for arbitrating locks between multiple processors in the system. With FIFO ordering, this is very difficult because requests for a lock sent at about the same time from two separate processors may arrive at the receivers in different orders. Agreed ordering ensures that all the processors are delivered the message in the same order. In this case the first lock message will always be from processor X, while the second lock message will always be from processor Y. Hence the first request is always honored by all processors, and the second request is rejected (since the lock is taken). This is how race conditions are avoided in distributed systems. Every processor is delivered a configuration change and messages within a configuration in the same order. This ensures that any distributed state machine will make the same decisions on every processor within the configuration. This also allows the configuration and the messages to be considered when making decisions. Virtual synchrony requires that every node be delivered the messages that it sends. This enables the logic to be placed in one location (the handler for the delivery of the group message) instead of two separate places. This also allows messages that are sent to be ordered in the stream of other messages within the configuration. Certain guarantees are required by virtual synchrony. If a message is sent, it must be delivered by every processor unless that processor fails. If a particular processor fails, a configuration change occurs creating a new configuration under which a new set of decisions may be made. This implies that even unreliable networks must reliably deliver messages. The implementation in corosync works on unreliable as well as reliable networks. Every message sent must be delivered, unless a configuration change occurs. In the case of a configuration change, every message that can be recovered must be recovered before the new configuration is installed. Some systems during partition won't continue to recover messages within the old configuration even though those messages can be recovered. Virtual synchrony makes that impossible, except for those members that are no longer part of a configuration. Finally, virtual synchrony takes advantage of hardware multicast to avoid duplicated packets and scale to large transmit rates. On a 100 Mbit network, corosync can approach wire speeds depending on the number of messages queued for a particular processor. What does all of this mean for the developer? * messages are delivered reliably * messages are delivered in the same order to all nodes * configuration and messages can both be used to make decisions The sketch after this list shows how agreed ordering turns into identical decisions on every processor.
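A toy delivery handler for the lock example above (hypothetical names, not the actual LCK service; the prototype follows the executive handler form shown later). Because agreed ordering delivers the same requests in the same order everywhere, each processor independently records the same lock owner:

    #define MAX_LOCKS 256

    struct req_lock {
        unsigned int nodeid;
        unsigned int lock_id;
    };

    static unsigned int lock_owner[MAX_LOCKS]; /* 0 means unowned */

    static void lock_request_deliver_fn (void *msg, unsigned int nodeid)
    {
        struct req_lock *req = (struct req_lock *)msg;

        /*
         * Executed on every processor with the same delivery order,
         * so every processor computes the same owner for lock_id.
         */
        if (lock_owner[req->lock_id] == 0) {
            lock_owner[req->lock_id] = req->nodeid; /* first request wins */
        }
        /* any later request sees the lock taken and is rejected */
    }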
------------------------------------------------------------------------------- adding libraries ------------------------------------------------------------------------------- The first stage in adding a library to the system is to develop the library. Library code should follow these guidelines: * use SA Forum coding style for SA Forum APIs to aid in debugging * use corosync coding guidelines for APIs that are not SA Forum that are to be merged into the corosync tree. * implement all library code within one file named after the API. Examples are ckpt.c, clm.c, amf.c. * use parallel structure as much as possible between different APIs * make use of utility services provided by util.c. * if something is needed that is generic and useful to all services, submit patches for other libraries to use these services. * use the reference counting handle manager for handle management. ------------------ Version checking ------------------ struct saVersionDatabase { int versionCount; SaVersionT *versionsSupported; }; The versionCount number describes how many entries are in the version database. The versionsSupported member is an array of SaVersionT describing the acceptable versions this API supports. An API developer specifies versions supported by adding the following C code to the library file: /* * Versions supported */ static SaVersionT clmVersionsSupported[] = { { 'B', 1, 1 }, { 'b', 1, 1 } }; static struct saVersionDatabase clmVersionDatabase = { sizeof (clmVersionsSupported) / sizeof (SaVersionT), clmVersionsSupported }; After this is specified, the following API is used to check versions: SaErrorT saVersionVerify ( struct saVersionDatabase *versionDatabase, const SaVersionT *version); An example usage of this is SaErrorT error; error = saVersionVerify (&clmVersionDatabase, version); where version is a pointer to an SaVersionT passed into the API. error will be SA_OK if the version is valid as specified in the version database. ------------------ Handle Instances ------------------ Every handle instance is stored in a handle database. The handle database stores instance information for every handle used by libraries. The system includes reference counting and is safe for use in threaded applications. The handle database structure is: struct saHandleDatabase { unsigned int handleCount; struct saHandle *handles; pthread_mutex_t mutex; void (*handleInstanceDestructor) (void *); }; handleCount is the number of handles. handles is an array of handles. mutex is a pthread mutex used to mutually exclude access to the handle db. handleInstanceDestructor is a callback that is called when the handle should be freed because its reference count has dropped to zero. The handle database is defined in a library as follows: static void clmHandleInstanceDestructor (void *); static struct saHandleDatabase clmHandleDatabase = { .handleCount = 0, .handles = 0, .mutex = PTHREAD_MUTEX_INITIALIZER, .handleInstanceDestructor = clmHandleInstanceDestructor }; There are several APIs to access the handle database: SaErrorT saHandleCreate ( struct saHandleDatabase *handleDatabase, int instanceSize, int *handleOut); Creates an instance of size instanceSize in the handleDatabase parameter, returning the handle number in handleOut. The handle instance reference count starts at the value 1. SaErrorT saHandleDestroy ( struct saHandleDatabase *handleDatabase, unsigned int handle); Destroys further access to the handle and decrements the handle instance reference count by 1. Once the handle reference count drops to zero, the database destructor is called for the handle. SaErrorT saHandleInstanceGet ( struct saHandleDatabase *handleDatabase, unsigned int handle, void **instance); Gets the instance specified by handle from the handleDatabase and returns it in the instance member. If the handle is valid SA_OK is returned, otherwise an error is returned. This is used to ensure a handle is valid. Every get call increases the reference count on a handle instance by one. SaErrorT saHandleInstancePut ( struct saHandleDatabase *handleDatabase, unsigned int handle); Decrements the reference count by 1. If the reference count indicates the handle has been destroyed, it will then be removed from the database and the destructor called on the instance data. The put call takes care of freeing the handle instance data. Create a data structure for the instance, and use it within the libraries to store state information about the instance. This information can be the handle, a mutex for protecting I/O, a queue for queueing async messages, or whatever is needed by the API.
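A typical library call brackets its work with a get/put pair. A minimal sketch, assuming a hypothetical struct clmInstance holding per-handle state:

    SaErrorT error;
    struct clmInstance *clmInstance;

    error = saHandleInstanceGet (&clmHandleDatabase, clmHandle,
        (void *)&clmInstance);
    if (error != SA_OK) {
        return (error); /* invalid or destroyed handle */
    }

    pthread_mutex_lock (&clmInstance->mutex);
    /* ... package the request and exchange messages with the executive ... */
    pthread_mutex_unlock (&clmInstance->mutex);

    saHandleInstancePut (&clmHandleDatabase, clmHandle); /* drop the get reference */
    return (error);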
----------------------------------- communicating with the executive ----------------------------------- A service connection is created with the following API: SaErrorT saServiceConnect ( int *responseOut, int *callbackOut, enum service_types service); The responseOut parameter specifies the file descriptor where response messages will be delivered. The callbackOut parameter specifies the file descriptor where callback messages are delivered. The service specifies the service to use. Messages are sent and received from the executive with the following functions: SaAisErrorT saSendMsgRetry ( int s, struct iovec *iov, - int iov_len); + unsigned int iov_len); the s member is the socket to use, retrieved with saServiceConnect. The iov is the iovector used to send a message. the iov_len is the number of elements in iov. This sends an IO-vectorized message. SaErrorT saSendRetry ( int s, const void *msg, size_t len, int flags); the s member is the socket to use, retrieved with saServiceConnect the msg member is a pointer to the message to send to the service the len member is the length of the message to send the flags parameter is the flags to use with the sendmsg system call This sends a data blob to the executive. A message is received from the executive with the function: SaErrorT saRecvRetry ( int s, void *msg, size_t len, int flags); the s member is the socket to use, retrieved with saServiceConnect the msg member is a pointer to the buffer where the received message is stored the len member is the length of the message to receive the flags parameter is the flags to use with the recvmsg system call A message may be sent and a reply waited for with the following function: SaAisErrorT saSendMsgReceiveReply ( int s, struct iovec *iov, - int iov_len, + unsigned int iov_len, void *responseMessage, int responseLen) s is the socket to send and receive the response. iov is the iovector to send. iov_len is the number of elements in iov. responseMessage is the data block used to store the response. responseLen is the length of the data block that is expected to be received. Waiting on a file descriptor using the poll system call is done with the API: SaErrorT saPollRetry ( struct pollfd *ufds, unsigned int nfds, int timeout); where the parameters are the standard poll parameters. Messages can be received out of order searching for a specific message id with: ---------- messages ---------- Please follow the style of the messages. It makes debugging much easier if parallel style is used. A service should be added to the service_types enumeration in ipc_gen or, in the case of an external project, a number should be registered with the project.
enum service_types { EVS_SERVICE = 0, CLM_SERVICE = 1, AMF_SERVICE = 2, CKPT_SERVICE = 3, EVT_SERVICE = 4, LCK_SERVICE = 5, MSG_SERVICE = 6, CFG_SERVICE = 7, CPG_SERVICE = 8 }; Each library should have an ipc_APINAME.h file in include. It should define request types and response types. These are the request CLM message identifiers: enum req_clm_types { MESSAGE_REQ_CLM_TRACKSTART = 0, MESSAGE_REQ_CLM_TRACKSTOP = 1, MESSAGE_REQ_CLM_NODEGET = 2, MESSAGE_REQ_CLM_NODEGETASYNC = 3 }; These are the response CLM message identifiers: enum res_clm_types { MESSAGE_RES_CLM_TRACKCALLBACK = 0, MESSAGE_RES_CLM_TRACKSTART = 1, MESSAGE_RES_CLM_TRACKSTOP = 2, MESSAGE_RES_CLM_NODEGET = 3, MESSAGE_RES_CLM_NODEGETASYNC = 4, MESSAGE_RES_CLM_NODEGETCALLBACK = 5 }; A request header should be placed at the front of every message sent by the library. typedef struct { int size __attribute__((aligned(8))); int id __attribute__((aligned(8))); } mar_req_header_t __attribute__((aligned(8))); There is also a response message header which should start every response message: typedef struct { int size __attribute__((aligned(8))); int id __attribute__((aligned(8))); SaAisErrorT error __attribute__((aligned(8))); } mar_res_header_t __attribute__((aligned(8))); the error parameter is used to pass errors from the executive to the library, including SA_ERR_TRY_AGAIN for flow control, which is described later. The mar_message_source_t structure, used later to defer responses, is: typedef struct { mar_uint32_t nodeid __attribute__((aligned(8))); void *conn __attribute__((aligned(8))); } mar_message_source_t __attribute__((aligned(8))); This is the message for the MESSAGE_REQ_CLM_TRACKSTART id above: struct req_clm_trackstart { mar_req_header_t header; SaUint8T trackFlags; SaClmClusterNotificationT *notificationBufferAddress; SaUint32T numberOfItems; }; The saClmClusterTrackStart API should create this message and send it to the executive. Responses should be of type struct res_clm_trackstart. ------------ some notes ------------ * Avoid doing anything tricky in the library itself. Let the executive handler do all of the work of the system. Minimize what the API does. * Once an API is developed, it must be added to the makefile. Just add a line for the file to the EXECOBJS build line. * protect I/O send/recv with a mutex. * always look at other libraries when there is a question about how to do something. It has likely been thought out in another library. ------------------------------------------------------------------------------- adding services ------------------------------------------------------------------------------- Services are defined by service handlers and messages described in include/ipc_SERVICE.h. These two pieces of information are used by the executive to dispatch the correct messages to the correct recipients. ------------------------------- the service handler structure ------------------------------- A service is added by defining a structure defined in exec/service.h. The structure is a little daunting: struct libais_handler { int (*libais_handler_fn) (void *conn, void *msg); int response_size; int response_id; enum corosync_flow_control flow_control; }; The response_size, response_id, and flow_control for a library handler are used for flow control. A response message will be sent to the library of the size response_size, with the header id of response_id if the totem message queue is full.
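For example, the CLM library handlers might be declared like this sketch (field values are illustrative; COROSYNC_FLOW_CONTROL_REQUIRED is assumed as the counterpart of the NOT_REQUIRED value described next):

    static struct libais_handler clm_libais_handlers[] = {
        { /* MESSAGE_REQ_CLM_TRACKSTART */
            .libais_handler_fn = message_handler_req_clm_trackstart,
            .response_size     = sizeof (struct res_clm_trackstart),
            .response_id       = MESSAGE_RES_CLM_TRACKSTART,
            .flow_control      = COROSYNC_FLOW_CONTROL_REQUIRED
        },
        { /* MESSAGE_REQ_CLM_TRACKSTOP */
            .libais_handler_fn = message_handler_req_clm_trackstop,
            .response_size     = sizeof (struct res_clm_trackstop),
            .response_id       = MESSAGE_RES_CLM_TRACKSTOP,
            .flow_control      = COROSYNC_FLOW_CONTROL_NOT_REQUIRED
        }
    };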
Some library APIs may not need to block in this condition (because they don't have to use totem), so they should specify COROSYNC_FLOW_CONTROL_NOT_REQUIRED in the flow control field. The libais_handler_fn is a function to be called when the library handler is requested to be executed. struct corosync_exec_handler { void (*exec_handler_fn) (void *msg, unsigned int nodeid); void (*exec_endian_convert_fn) (void *msg); }; The exec_handler_fn is a function to be called when the executive handler is requested to execute. The exec_endian_convert_fn is a function to be called to convert the endianness of the executive message. Note that messages are not converted to a canonical byte order before transmission. Instead they are transmitted in the native byte order of the transmitter and converted to the host machine order on receipt of the message. struct corosync_service_handler { unsigned char *name; unsigned short id; unsigned int private_data_size; int (*lib_init_fn) (void *conn); int (*lib_exit_fn) (void *conn); struct corosync_lib_handler *lib_service; int lib_service_count; struct corosync_exec_handler *exec_service; int (*exec_init_fn) (struct objdb_iface_ver0 *); int (*config_init_fn) (struct objdb_iface_ver0 *); void (*exec_dump_fn) (void); int exec_service_count; void (*confchg_fn) ( enum totem_configuration_type configuration_type, unsigned int *member_list, int member_list_entries, unsigned int *left_list, int left_list_entries, unsigned int *joined_list, int joined_list_entries, struct memb_ring_id *ring_id); void (*sync_init) (void); int (*sync_process) (void); void (*sync_activate) (void); void (*sync_abort) (void); }; name is the name of the service. id is the identifier of the service. private_data_size is the size of the private data used by the connection which the library and executive handlers can reference. lib_init_fn is the function executed when a library connection is made to the service handler. lib_exit_fn is the function executed when a library connection is exited either because the application closed the file descriptor, or the OS closed the file descriptor. lib_service is an array of corosync_lib_handler data structures which define the library service handler. lib_service_count is the number of elements in lib_service. exec_service is an array of corosync_exec_handler data structures which define the executive service handler. exec_init_fn is a function used to initialize the executive service. This is only called once. config_init_fn is called to parse config files and populate the object database. exec_dump_fn is called when SIGUSR2 is sent to the executive to dump the current state of the service. exec_service_count is the number of entries in the exec_service array. confchg_fn is called every time a configuration change occurs. sync_init is called when the service should begin synchronization. sync_process is called to process synchronization messages. sync_activate is called to activate the current service synchronization. sync_abort is called to abort the current service synchronization. -------------- flow control -------------- The totem protocol includes flow control so that it doesn't send too many messages when the network is completely full. But the library can still send messages to the executive much faster than the executive can send them over totem. So the library relies on the group messaging flow control to control flow of messages sent from the library. If the totem queues are full, no more messages may be sent, so the executive in ipc.c automatically detects this scenario and returns an SA_ERR_TRY_AGAIN error. When a library gets SA_ERR_TRY_AGAIN, the library may either retry, or return this error to the user if the error is allowed by the API definitions. The other information is critical to ensuring that the library reads the correct message and size of message. Make sure the libais_handler matches the messages used in the handler function.
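On the library side, handling SA_ERR_TRY_AGAIN typically looks like this fragment (hypothetical field names; whether to loop or return the error depends on the particular API's definition):

    struct res_clm_trackstart res_clm_trackstart;
    struct iovec iov;
    SaAisErrorT error;

    iov.iov_base = &req_lib_clm_trackstart;
    iov.iov_len = sizeof (struct req_clm_trackstart);

    do {
        error = saSendMsgReceiveReply (clmInstance->response_fd, &iov, 1,
            &res_clm_trackstart, sizeof (struct res_clm_trackstart));
        if (error != SA_OK) {
            break; /* transport failure, not flow control */
        }
        /* the executive signalled overload; retry the request */
    } while (res_clm_trackstart.header.error == SA_ERR_TRY_AGAIN);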
------------------------------------------------ dynamically linking the service handler plugin ------------------------------------------------ The service handler needs some special magic to be dynamically linked into corosync. /* * Dynamic loader definition */ static struct corosync_service_handler *clm_get_service_handler_ver0 (void); static struct corosync_service_handler_iface_ver0 clm_service_handler_iface = { .corosync_get_service_handler_ver0 = clm_get_service_handler_ver0 }; static struct lcr_iface corosync_clm_ver0[1] = { { .name = "corosync_clm", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = NULL, .destructor = NULL, .interfaces = NULL } }; static struct lcr_comp clm_comp_ver0 = { .iface_count = 1, .ifaces = corosync_clm_ver0 }; static struct corosync_service_handler *clm_get_service_handler_ver0 (void) { return (&clm_service_handler); } __attribute__ ((constructor)) static void clm_comp_register (void) { lcr_interfaces_set (&corosync_clm_ver0[0], &clm_service_handler_iface); lcr_component_register (&clm_comp_ver0); } Once this code is added (substitute clm for the service being implemented), the service will be loaded if it's in the default services list. The default service list is specified in service.c:default_services. If creating an external plugin, there are configuration parameters which may be used to add your plugin into the corosync scanning of plugins. --------------------------------- Connection specific information --------------------------------- Every connection may have specific connection information if private data is greater than zero for the service handler. This is used to allow each library connection to maintain private state for that connection. The private data for a connection can be retrieved with: struct service_pd *service_pd = (struct service_pd *)corosync_conn_private_data_get (conn); where service is the name of the service implemented and conn is the connection information likely passed into the library handler or stored in a message_source structure for later use by an executive handler. ------------------------------ sending responses to the api ------------------------------ A message is sent to the library from the executive message handler using the function: extern int corosync_conn_send_response (void *conn_info, void *msg, int mlen); conn_info is passed into the library message handler or stored in the executive message. This member describes the connection to send the response. msg is the message to send. mlen is the length of the message to send. Keep in mind that struct res_message should be at the beginning of the response message so that it follows the style used in the rest of corosync.
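Putting these pieces together, an executive handler's response path might look like the following sketch (hypothetical CLM-style names; the message source technique it relies on is described in the next section):

    struct res_clm_trackstart res_clm_trackstart;

    if (message_source_is_local (&req_exec_clm_trackstart->source)) {
        res_clm_trackstart.header.size = sizeof (struct res_clm_trackstart);
        res_clm_trackstart.header.id = MESSAGE_RES_CLM_TRACKSTART;
        res_clm_trackstart.header.error = SA_OK;

        /* respond only on the processor that received the library request */
        corosync_conn_send_response (req_exec_clm_trackstart->source.conn,
            &res_clm_trackstart, sizeof (struct res_clm_trackstart));
    }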
-------------------------------------------- deferring response to an executive message -------------------------------------------- The message source structure is used to store information about the source of a message so a later executive message can respond to a library request. In a library handler, the source field should be set up with: message_source_set (&req_exec_ZZZZZZZ.source, conn); gmi_mcast (req_exec_ZZZZZZZ); In this case conn is passed into the library message handler. Then the executive message handler determines if this processor is responsible for responding: if (message_source_is_local (&req_exec_ZZZZZZZ.source)) { corosync_conn_send_response (); } --------------- Using totempg --------------- To send a message to every processor, including the local processor for self-delivery according to virtual synchrony semantics, use the totempg interface. The totempg interface supports multiple users at one time; if you need to use the full totempg interface (defined in totempg.h), please ask for assistance on the mailing list. If you simply want to use multicast transmissions in corosync, do the following: assert (totempg_groups_mcast_joined (corosync_group_handle, &req_exec_clm_iovec, 1, TOTEMPG_AGREED) == 0); ----------------- library handler ----------------- Every library handler has the prototype: static int message_handler_req_clm_init (void *conn, void *msg); The start of the handler function should look something like this: int message_handler_req_clm_trackstart (void *conn, void *msg) { struct req_clm_trackstart *req_clm_trackstart = (struct req_clm_trackstart *)msg; { package up library handler message into executive message } { multicast message using totempg interface } } This assigns the void *msg to a structure that can be used by the library handler. The conn field is used to indicate where the response should be sent. Use the technique described in deferring response to an executive message to have the executive handler respond to the message. Avoid doing anything tricky in a library handler. Do all the work in the executive handler at first. If it is later possible to optimize, optimize away. ------------------- executive handler ------------------- Every executive handler has the prototype: static int message_handler_req_exec_clm_nodejoin (void *msg, unsigned int nodeid); The start of the handler function should look something like this: static int message_handler_req_exec_clm_nodejoin (void *msg, unsigned int nodeid) { struct req_exec_clm_nodejoin *req_exec_clm_nodejoin = (struct req_exec_clm_nodejoin *)msg; { do real work of executing request, this is done on every node } } The conn_info structure is not available. If it is needed, it can be stored in the message sent by the library message handler in a source structure. The msg field contains the message sent by the library handler. The nodeid is a unique node identifier of the node that originated the message. -------------------- the libais_init_fn -------------------- This should be used to initialize any state for the connection. -------------------- the libais_exit_fn -------------------- This function is called every time a service connection is disconnected by the executive. Free memory, change structures, or do whatever work needs to be done to clean up. If the exit_fn couldn't complete because it is waiting for some event, it may return -1, which will allow the executive to make some forward progress. Then exit_fn will be called again. Return 0 when the exit has completed. This is most useful when totem should be used to queue a message, but the queue is full. In this case, waiting a few more seconds may open up the queue, so return -1, and then the executive will try again to call exit_fn. Do NOT return -1 forever or the ais executive will spin. If -1 is returned, ENSURE that the state of the library hasn't changed so much that exit_fn cannot be called again. If exit_fn returns -1, it WILL be called again, so expect it in the code.
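A sketch of an exit function following these rules, assuming a hypothetical service that must multicast a leave message before the connection can be torn down, and assuming the multicast call fails with nonzero when the totem queue is full:

    static int my_service_exit_fn (void *conn)
    {
        struct req_exec_my_leave req_exec_my_leave;
        struct iovec iov;

        req_exec_my_leave.header.size = sizeof (struct req_exec_my_leave);
        req_exec_my_leave.header.id = MESSAGE_REQ_EXEC_MY_LEAVE;

        iov.iov_base = &req_exec_my_leave;
        iov.iov_len = sizeof (struct req_exec_my_leave);

        /*
         * Queue full: leave all state untouched and return -1 so the
         * executive calls exit_fn again later instead of spinning here.
         */
        if (totempg_groups_mcast_joined (corosync_group_handle,
            &iov, 1, TOTEMPG_AGREED) != 0) {
            return (-1);
        }
        return (0);
    }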
---------------- the confchg_fn ---------------- This function is called whenever a configuration change occurs. Some services may not need this function, while others may. This is a good way to sync up joining nodes with the current state of the information stored on a particular processor. ------------------------------------------------------------------------------- Final comments ------------------------------------------------------------------------------- GDB is your friend, especially the "where" command. But it stops execution. This has a nasty side effect of killing the current configuration. In this case GDB may become your enemy. printf is your friend when GDB is your enemy. If stuck, ask on the mailing list and send your patches. A lot of time has been spent designing corosync, and even more time debugging it. There are people that can help you debug problems, especially around things like message delivery. Submit patches early to get feedback, especially around things like parallel style. Parallel style is very important to ensure maintainability by the corosync community. If this document is wrong or incomplete, complain so we can get it fixed for other people. Have fun! diff --git a/exec/coroipcs.c b/exec/coroipcs.c index ddf3d4fe..f60fc4dd 100644 --- a/exec/coroipcs.c +++ b/exec/coroipcs.c @@ -1,1097 +1,1097 @@ /* * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE.
*/ #include #ifndef _GNU_SOURCE #define _GNU_SOURCE 1 #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #if defined(HAVE_GETPEERUCRED) #include #endif #include #include #include #include #include "coroipcs.h" #include #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif #define SERVER_BACKLOG 5 #define MSG_SEND_LOCKED 0 #define MSG_SEND_UNLOCKED 1 static struct coroipcs_init_state *api; DECLARE_LIST_INIT (conn_info_list_head); struct outq_item { void *msg; size_t mlen; struct list_head list; }; #if defined(_SEM_SEMUN_UNDEFINED) union semun { int val; struct semid_ds *buf; unsigned short int *array; struct seminfo *__buf; }; #endif enum conn_state { CONN_STATE_THREAD_INACTIVE = 0, CONN_STATE_THREAD_ACTIVE = 1, CONN_STATE_THREAD_REQUEST_EXIT = 2, CONN_STATE_THREAD_DESTROYED = 3, CONN_STATE_LIB_EXIT_CALLED = 4, CONN_STATE_DISCONNECT_INACTIVE = 5 }; struct conn_info { int fd; pthread_t thread; pthread_attr_t thread_attr; unsigned int service; enum conn_state state; int notify_flow_control_enabled; int refcount; key_t shmkey; key_t semkey; int shmid; int semid; unsigned int pending_semops; pthread_mutex_t mutex; struct shared_memory *mem; struct list_head outq_head; void *private_data; struct list_head list; char setup_msg[sizeof (mar_req_setup_t)]; unsigned int setup_bytes_read; char *sending_allowed_private_data[64]; }; static int shared_mem_dispatch_bytes_left (struct conn_info *conn_info); static void outq_flush (struct conn_info *conn_info); static int priv_change (struct conn_info *conn_info); static void ipc_disconnect (struct conn_info *conn_info); -static void msg_send (void *conn, const struct iovec *iov, int iov_len, +static void msg_send (void *conn, const struct iovec *iov, unsigned int iov_len, int locked); static int memcpy_dwrap (struct conn_info *conn_info, void *msg, int len); static int ipc_thread_active (void *conn) { struct conn_info *conn_info = (struct conn_info *)conn; int retval = 0; pthread_mutex_lock (&conn_info->mutex); if (conn_info->state == CONN_STATE_THREAD_ACTIVE) { retval = 1; } pthread_mutex_unlock (&conn_info->mutex); return (retval); } static int ipc_thread_exiting (void *conn) { struct conn_info *conn_info = (struct conn_info *)conn; int retval = 1; pthread_mutex_lock (&conn_info->mutex); if (conn_info->state == CONN_STATE_THREAD_INACTIVE) { retval = 0; } else if (conn_info->state == CONN_STATE_THREAD_ACTIVE) { retval = 0; } pthread_mutex_unlock (&conn_info->mutex); return (retval); } /* * returns 0 if should be called again, -1 if finished */ static inline int conn_info_destroy (struct conn_info *conn_info) { unsigned int res; void *retval; list_del (&conn_info->list); list_init (&conn_info->list); if (conn_info->state == CONN_STATE_THREAD_REQUEST_EXIT) { res = pthread_join (conn_info->thread, &retval); conn_info->state = CONN_STATE_THREAD_DESTROYED; return (0); } if (conn_info->state == CONN_STATE_THREAD_INACTIVE || conn_info->state == CONN_STATE_DISCONNECT_INACTIVE) { list_del (&conn_info->list); close (conn_info->fd); api->free (conn_info); return (-1); } if (conn_info->state == CONN_STATE_THREAD_ACTIVE) { pthread_kill (conn_info->thread, SIGUSR1); return (0); } api->serialize_lock (); /* * Retry library exit function if busy */ if (conn_info->state == CONN_STATE_THREAD_DESTROYED) { res = api->exit_fn_get (conn_info->service) (conn_info); if (res == -1) { api->serialize_unlock (); return (0); } 
else { conn_info->state = CONN_STATE_LIB_EXIT_CALLED; } } pthread_mutex_lock (&conn_info->mutex); if (conn_info->refcount > 0) { pthread_mutex_unlock (&conn_info->mutex); api->serialize_unlock (); return (0); } list_del (&conn_info->list); pthread_mutex_unlock (&conn_info->mutex); /* * Destroy shared memory segment and semaphore */ shmdt (conn_info->mem); res = shmctl (conn_info->shmid, IPC_RMID, NULL); semctl (conn_info->semid, 0, IPC_RMID); /* * Free allocated data needed to retry exiting library IPC connection */ if (conn_info->private_data) { api->free (conn_info->private_data); } close (conn_info->fd); api->free (conn_info); api->serialize_unlock (); return (-1); } struct res_overlay { mar_res_header_t header __attribute__((aligned(8))); char buf[4096]; }; static void *pthread_ipc_consumer (void *conn) { struct conn_info *conn_info = (struct conn_info *)conn; struct sembuf sop; int res; mar_req_header_t *header; struct res_overlay res_overlay; int send_ok; if (api->sched_priority != 0) { struct sched_param sched_param; sched_param.sched_priority = api->sched_priority; res = pthread_setschedparam (conn_info->thread, SCHED_RR, &sched_param); } for (;;) { sop.sem_num = 0; sop.sem_op = -1; sop.sem_flg = 0; retry_semop: if (ipc_thread_active (conn_info) == 0) { coroipcs_refcount_dec (conn_info); pthread_exit (0); } res = semop (conn_info->semid, &sop, 1); if ((res == -1) && (errno == EINTR || errno == EAGAIN)) { goto retry_semop; } else if ((res == -1) && (errno == EINVAL || errno == EIDRM)) { coroipcs_refcount_dec (conn_info); pthread_exit (0); } coroipcs_refcount_inc (conn_info); header = (mar_req_header_t *)conn_info->mem->req_buffer; send_ok = api->sending_allowed (conn_info->service, header->id, header, conn_info->sending_allowed_private_data); if (send_ok) { api->serialize_lock(); api->handler_fn_get (conn_info->service, header->id) (conn_info, header); api->serialize_unlock(); } else { /* * Overload, tell library to retry */ res_overlay.header.size = api->response_size_get (conn_info->service, header->id); res_overlay.header.id = api->response_id_get (conn_info->service, header->id); res_overlay.header.error = CS_ERR_TRY_AGAIN; coroipcs_response_send (conn_info, &res_overlay, res_overlay.header.size); } api->sending_allowed_release (conn_info->sending_allowed_private_data); coroipcs_refcount_dec (conn); } pthread_exit (0); } static int req_setup_send ( struct conn_info *conn_info, int error) { mar_res_setup_t res_setup; unsigned int res; res_setup.error = error; retry_send: res = send (conn_info->fd, &res_setup, sizeof (mar_res_setup_t), MSG_WAITALL); if (res == -1 && errno == EINTR) { goto retry_send; } else if (res == -1 && errno == EAGAIN) { goto retry_send; } return (0); } static int req_setup_recv ( struct conn_info *conn_info) { int res; struct msghdr msg_recv; struct iovec iov_recv; #ifdef COROSYNC_LINUX struct cmsghdr *cmsg; char cmsg_cred[CMSG_SPACE (sizeof (struct ucred))]; struct ucred *cred; int off = 0; int on = 1; #endif msg_recv.msg_iov = &iov_recv; msg_recv.msg_iovlen = 1; msg_recv.msg_name = 0; msg_recv.msg_namelen = 0; #ifdef COROSYNC_LINUX msg_recv.msg_control = (void *)cmsg_cred; msg_recv.msg_controllen = sizeof (cmsg_cred); #endif #ifdef PORTABILITY_WORK_TODO #ifdef COROSYNC_SOLARIS msg_recv.msg_flags = 0; uid_t euid; gid_t egid; euid = -1; egid = -1; if (getpeereid(conn_info->fd, &euid, &egid) != -1 && (api->security_valid (euid, egid)) { if (conn_info->state == CONN_IO_STATE_INITIALIZING) { api->log_printf ("Invalid security authentication\n"); return (-1); } 
} msg_recv.msg_accrights = 0; msg_recv.msg_accrightslen = 0; #else /* COROSYNC_SOLARIS */ #ifdef HAVE_GETPEERUCRED ucred_t *uc; uid_t euid = -1; gid_t egid = -1; if (getpeerucred (conn_info->fd, &uc) == 0) { euid = ucred_geteuid (uc); egid = ucred_getegid (uc); if (api->security_valid (euid, egid) { conn_info->authenticated = 1; } ucred_free(uc); } if (conn_info->authenticated == 0) { api->log_printf ("Invalid security authentication\n"); } #else /* HAVE_GETPEERUCRED */ api->log_printf (LOG_LEVEL_SECURITY, "Connection not authenticated " "because platform does not support " "authentication with sockets, continuing " "with a fake authentication\n"); #endif /* HAVE_GETPEERUCRED */ #endif /* COROSYNC_SOLARIS */ #endif iov_recv.iov_base = &conn_info->setup_msg[conn_info->setup_bytes_read]; iov_recv.iov_len = sizeof (mar_req_setup_t) - conn_info->setup_bytes_read; #ifdef COROSYNC_LINUX setsockopt(conn_info->fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof (on)); #endif retry_recv: res = recvmsg (conn_info->fd, &msg_recv, MSG_NOSIGNAL); if (res == -1 && errno == EINTR) { goto retry_recv; } else if (res == -1 && errno != EAGAIN) { return (0); } else if (res == 0) { #if defined(COROSYNC_SOLARIS) || defined(COROSYNC_BSD) || defined(COROSYNC_DARWIN) /* On many OS poll never return POLLHUP or POLLERR. * EOF is detected when recvmsg return 0. */ ipc_disconnect (conn_info); #endif return (-1); } conn_info->setup_bytes_read += res; #ifdef COROSYNC_LINUX cmsg = CMSG_FIRSTHDR (&msg_recv); assert (cmsg); cred = (struct ucred *)CMSG_DATA (cmsg); if (cred) { if (api->security_valid (cred->uid, cred->gid)) { } else { ipc_disconnect (conn_info); api->log_printf ("Invalid security authentication\n"); return (-1); } } #endif if (conn_info->setup_bytes_read == sizeof (mar_req_setup_t)) { #ifdef COROSYNC_LINUX setsockopt(conn_info->fd, SOL_SOCKET, SO_PASSCRED, &off, sizeof (off)); #endif return (1); } return (0); } static void ipc_disconnect (struct conn_info *conn_info) { if (conn_info->state == CONN_STATE_THREAD_INACTIVE) { conn_info->state = CONN_STATE_DISCONNECT_INACTIVE; return; } if (conn_info->state != CONN_STATE_THREAD_ACTIVE) { return; } pthread_mutex_lock (&conn_info->mutex); conn_info->state = CONN_STATE_THREAD_REQUEST_EXIT; pthread_mutex_unlock (&conn_info->mutex); pthread_kill (conn_info->thread, SIGUSR1); } static int conn_info_create (int fd) { struct conn_info *conn_info; conn_info = api->malloc (sizeof (struct conn_info)); if (conn_info == NULL) { return (-1); } memset (conn_info, 0, sizeof (struct conn_info)); conn_info->fd = fd; conn_info->service = SOCKET_SERVICE_INIT; conn_info->state = CONN_STATE_THREAD_INACTIVE; list_init (&conn_info->outq_head); list_init (&conn_info->list); list_add (&conn_info->list, &conn_info_list_head); api->poll_dispatch_add (fd, conn_info); return (0); } #if defined(COROSYNC_LINUX) || defined(COROSYNC_SOLARIS) /* SUN_LEN is broken for abstract namespace */ #define COROSYNC_SUN_LEN(a) sizeof(*(a)) #else #define COROSYNC_SUN_LEN(a) SUN_LEN(a) #endif /* * Exported functions */ extern void coroipcs_ipc_init ( struct coroipcs_init_state *init_state) { int server_fd; struct sockaddr_un un_addr; int res; api = init_state; /* * Create socket for IPC clients, name socket, listen for connections */ server_fd = socket (PF_UNIX, SOCK_STREAM, 0); if (server_fd == -1) { api->log_printf ("Cannot create client connections socket.\n"); api->fatal_error ("Can't create library listen socket"); }; res = fcntl (server_fd, F_SETFL, O_NONBLOCK); if (res == -1) { api->log_printf ("Could not set 
non-blocking operation on server socket: %s\n", strerror (errno)); api->fatal_error ("Could not set non-blocking operation on server socket"); } memset (&un_addr, 0, sizeof (struct sockaddr_un)); un_addr.sun_family = AF_UNIX; #if defined(COROSYNC_BSD) || defined(COROSYNC_DARWIN) un_addr.sun_len = sizeof(struct sockaddr_un); #endif #if defined(COROSYNC_LINUX) sprintf (un_addr.sun_path + 1, "%s", api->socket_name); #else sprintf (un_addr.sun_path, "%s/%s", SOCKETDIR, api->socket_name); unlink (un_addr.sun_path); #endif res = bind (server_fd, (struct sockaddr *)&un_addr, COROSYNC_SUN_LEN(&un_addr)); if (res) { api->log_printf ("Could not bind AF_UNIX: %s.\n", strerror (errno)); api->fatal_error ("Could not bind to AF_UNIX socket\n"); } listen (server_fd, SERVER_BACKLOG); /* * Setup connection dispatch routine */ api->poll_accept_add (server_fd); } void coroipcs_ipc_exit (void) { struct list_head *list; struct conn_info *conn_info; for (list = conn_info_list_head.next; list != &conn_info_list_head; list = list->next) { conn_info = list_entry (list, struct conn_info, list); shmdt (conn_info->mem); shmctl (conn_info->shmid, IPC_RMID, NULL); semctl (conn_info->semid, 0, IPC_RMID); pthread_kill (conn_info->thread, SIGUSR1); } } /* * Get the conn info private data */ void *coroipcs_private_data_get (void *conn) { struct conn_info *conn_info = (struct conn_info *)conn; return (conn_info->private_data); } int coroipcs_response_send (void *conn, const void *msg, int mlen) { struct conn_info *conn_info = (struct conn_info *)conn; struct sembuf sop; int res; memcpy (conn_info->mem->res_buffer, msg, mlen); sop.sem_num = 1; sop.sem_op = 1; sop.sem_flg = 0; retry_semop: res = semop (conn_info->semid, &sop, 1); if ((res == -1) && (errno == EINTR || errno == EAGAIN)) { goto retry_semop; } else if ((res == -1) && (errno == EINVAL || errno == EIDRM)) { return (0); } return (0); } -int coroipcs_response_iov_send (void *conn, const struct iovec *iov, int iov_len) +int coroipcs_response_iov_send (void *conn, const struct iovec *iov, unsigned int iov_len) { struct conn_info *conn_info = (struct conn_info *)conn; struct sembuf sop; int res; int write_idx = 0; int i; for (i = 0; i < iov_len; i++) { memcpy (&conn_info->mem->res_buffer[write_idx], iov[i].iov_base, iov[i].iov_len); write_idx += iov[i].iov_len; } sop.sem_num = 1; sop.sem_op = 1; sop.sem_flg = 0; retry_semop: res = semop (conn_info->semid, &sop, 1); if ((res == -1) && (errno == EINTR || errno == EAGAIN)) { goto retry_semop; } else if ((res == -1) && (errno == EINVAL || errno == EIDRM)) { return (0); } return (0); } static int shared_mem_dispatch_bytes_left (struct conn_info *conn_info) { unsigned int read; unsigned int write; unsigned int bytes_left; read = conn_info->mem->read; write = conn_info->mem->write; if (read <= write) { bytes_left = DISPATCH_SIZE - write + read; } else { bytes_left = read - write; } return (bytes_left); } static int memcpy_dwrap (struct conn_info *conn_info, void *msg, int len) { char *dest_char = (char *)conn_info->mem->dispatch_buffer; char *src_char = msg; unsigned int first_write; unsigned int second_write; first_write = len; second_write = 0; if (len + conn_info->mem->write >= DISPATCH_SIZE) { first_write = DISPATCH_SIZE - conn_info->mem->write; second_write = len - first_write; } memcpy (&dest_char[conn_info->mem->write], src_char, first_write); if (second_write) { memcpy (dest_char, &src_char[first_write], second_write); } conn_info->mem->write = (conn_info->mem->write + len) % DISPATCH_SIZE; return (0); } -static void 
msg_send (void *conn, const struct iovec *iov, int iov_len, +static void msg_send (void *conn, const struct iovec *iov, unsigned int iov_len, int locked) { struct conn_info *conn_info = (struct conn_info *)conn; struct sembuf sop; int res; int i; char buf; for (i = 0; i < iov_len; i++) { memcpy_dwrap (conn_info, iov[i].iov_base, iov[i].iov_len); } buf = !list_empty (&conn_info->outq_head); res = send (conn_info->fd, &buf, 1, MSG_NOSIGNAL); if (res == -1 && errno == EAGAIN) { if (locked == 0) { pthread_mutex_lock (&conn_info->mutex); } conn_info->pending_semops += 1; if (locked == 0) { pthread_mutex_unlock (&conn_info->mutex); } api->poll_dispatch_modify (conn_info->fd, POLLIN|POLLOUT|POLLNVAL); } else if (res == -1) { ipc_disconnect (conn_info); } sop.sem_num = 2; sop.sem_op = 1; sop.sem_flg = 0; retry_semop: res = semop (conn_info->semid, &sop, 1); if ((res == -1) && (errno == EINTR || errno == EAGAIN)) { goto retry_semop; } else if ((res == -1) && (errno == EINVAL || errno == EIDRM)) { return; } } static void outq_flush (struct conn_info *conn_info) { struct list_head *list, *list_next; struct outq_item *outq_item; unsigned int bytes_left; struct iovec iov; char buf; int res; pthread_mutex_lock (&conn_info->mutex); if (list_empty (&conn_info->outq_head)) { buf = 3; res = send (conn_info->fd, &buf, 1, MSG_NOSIGNAL); pthread_mutex_unlock (&conn_info->mutex); return; } for (list = conn_info->outq_head.next; list != &conn_info->outq_head; list = list_next) { list_next = list->next; outq_item = list_entry (list, struct outq_item, list); bytes_left = shared_mem_dispatch_bytes_left (conn_info); if (bytes_left > outq_item->mlen) { iov.iov_base = outq_item->msg; iov.iov_len = outq_item->mlen; msg_send (conn_info, &iov, 1, MSG_SEND_UNLOCKED); list_del (list); api->free (iov.iov_base); api->free (outq_item); } else { break; } } pthread_mutex_unlock (&conn_info->mutex); } static int priv_change (struct conn_info *conn_info) { mar_req_priv_change req_priv_change; unsigned int res; union semun semun; struct semid_ds ipc_set; int i; retry_recv: res = recv (conn_info->fd, &req_priv_change, sizeof (mar_req_priv_change), MSG_NOSIGNAL); if (res == -1 && errno == EINTR) { goto retry_recv; } if (res == -1 && errno == EAGAIN) { goto retry_recv; } if (res == -1 && errno != EAGAIN) { return (-1); } #if defined(COROSYNC_SOLARIS) || defined(COROSYNC_BSD) || defined(COROSYNC_DARWIN) /* Error on socket, EOF is detected when recv return 0 */ if (res == 0) { return (-1); } #endif ipc_set.sem_perm.uid = req_priv_change.euid; ipc_set.sem_perm.gid = req_priv_change.egid; ipc_set.sem_perm.mode = 0600; semun.buf = &ipc_set; for (i = 0; i < 3; i++) { res = semctl (conn_info->semid, 0, IPC_SET, semun); if (res == -1) { return (-1); } } return (0); } -static void msg_send_or_queue (void *conn, const struct iovec *iov, int iov_len) +static void msg_send_or_queue (void *conn, const struct iovec *iov, unsigned int iov_len) { struct conn_info *conn_info = (struct conn_info *)conn; unsigned int bytes_left; unsigned int bytes_msg = 0; int i; struct outq_item *outq_item; char *write_buf = 0; /* * Exit transmission if the connection is dead */ if (ipc_thread_active (conn) == 0) { return; } bytes_left = shared_mem_dispatch_bytes_left (conn_info); for (i = 0; i < iov_len; i++) { bytes_msg += iov[i].iov_len; } if (bytes_left < bytes_msg || list_empty (&conn_info->outq_head) == 0) { outq_item = api->malloc (sizeof (struct outq_item)); if (outq_item == NULL) { ipc_disconnect (conn); return; } outq_item->msg = api->malloc (bytes_msg); if 
(outq_item->msg == 0) { api->free (outq_item); ipc_disconnect (conn); return; } write_buf = outq_item->msg; for (i = 0; i < iov_len; i++) { memcpy (write_buf, iov[i].iov_base, iov[i].iov_len); write_buf += iov[i].iov_len; } outq_item->mlen = bytes_msg; list_init (&outq_item->list); pthread_mutex_lock (&conn_info->mutex); if (list_empty (&conn_info->outq_head)) { conn_info->notify_flow_control_enabled = 1; api->poll_dispatch_modify (conn_info->fd, POLLIN|POLLOUT|POLLNVAL); } list_add_tail (&outq_item->list, &conn_info->outq_head); pthread_mutex_unlock (&conn_info->mutex); return; } msg_send (conn, iov, iov_len, MSG_SEND_LOCKED); } void coroipcs_refcount_inc (void *conn) { struct conn_info *conn_info = (struct conn_info *)conn; pthread_mutex_lock (&conn_info->mutex); conn_info->refcount++; pthread_mutex_unlock (&conn_info->mutex); } void coroipcs_refcount_dec (void *conn) { struct conn_info *conn_info = (struct conn_info *)conn; pthread_mutex_lock (&conn_info->mutex); conn_info->refcount--; pthread_mutex_unlock (&conn_info->mutex); } int coroipcs_dispatch_send (void *conn, const void *msg, int mlen) { struct iovec iov; iov.iov_base = msg; iov.iov_len = mlen; msg_send_or_queue (conn, &iov, 1); return (0); } -int coroipcs_dispatch_iov_send (void *conn, const struct iovec *iov, int iov_len) +int coroipcs_dispatch_iov_send (void *conn, const struct iovec *iov, unsigned int iov_len) { msg_send_or_queue (conn, iov, iov_len); return (0); } int coroipcs_handler_accept ( int fd, int revent, void *data) { socklen_t addrlen; struct sockaddr_un un_addr; int new_fd; #ifdef COROSYNC_LINUX int on = 1; #endif int res; addrlen = sizeof (struct sockaddr_un); retry_accept: new_fd = accept (fd, (struct sockaddr *)&un_addr, &addrlen); if (new_fd == -1 && errno == EINTR) { goto retry_accept; } if (new_fd == -1) { api->log_printf ("Could not accept Library connection: %s\n", strerror (errno)); return (0); /* This is an error, but -1 would indicate disconnect from poll loop */ } res = fcntl (new_fd, F_SETFL, O_NONBLOCK); if (res == -1) { api->log_printf ("Could not set non-blocking operation on library connection: %s\n", strerror (errno)); close (new_fd); return (0); /* This is an error, but -1 would indicate disconnect from poll loop */ } /* * Valid accept */ /* * Request credentials of sender provided by kernel */ #ifdef COROSYNC_LINUX setsockopt(new_fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof (on)); #endif res = conn_info_create (new_fd); if (res != 0) { close (new_fd); } return (0); } int coroipcs_handler_dispatch ( int fd, int revent, void *context) { mar_req_setup_t *req_setup; struct conn_info *conn_info = (struct conn_info *)context; int res; char buf; if (ipc_thread_exiting (conn_info)) { return conn_info_destroy (conn_info); } /* * If an error occurs, request exit */ if (revent & (POLLERR|POLLHUP)) { ipc_disconnect (conn_info); return (0); } /* * Read the header and process it */ if (conn_info->service == SOCKET_SERVICE_INIT && (revent & POLLIN)) { /* * Receive in a nonblocking fashion the request * IF security invalid, send TRY_AGAIN, otherwise * send OK */ res = req_setup_recv (conn_info); if (res == -1) { req_setup_send (conn_info, CS_ERR_TRY_AGAIN); } if (res != 1) { return (0); } req_setup_send (conn_info, CS_OK); pthread_mutex_init (&conn_info->mutex, NULL); req_setup = (mar_req_setup_t *)conn_info->setup_msg; /* * Is the service registered ? 
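* (the setup request names the service id the library wants to use; the connection is dropped below when no such service engine is loaded)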
*/ if (api->service_available (req_setup->service) == 0) { ipc_disconnect (conn_info); return (0); } conn_info->shmkey = req_setup->shmkey; conn_info->semkey = req_setup->semkey; conn_info->service = req_setup->service; conn_info->refcount = 0; conn_info->notify_flow_control_enabled = 0; conn_info->setup_bytes_read = 0; conn_info->shmid = shmget (conn_info->shmkey, sizeof (struct shared_memory), 0600); conn_info->mem = shmat (conn_info->shmid, NULL, 0); conn_info->semid = semget (conn_info->semkey, 3, 0600); conn_info->pending_semops = 0; /* * ipc thread is the only reference at startup */ conn_info->refcount = 1; conn_info->state = CONN_STATE_THREAD_ACTIVE; conn_info->private_data = api->malloc (api->private_data_size_get (conn_info->service)); memset (conn_info->private_data, 0, api->private_data_size_get (conn_info->service)); api->init_fn_get (conn_info->service) (conn_info); pthread_attr_init (&conn_info->thread_attr); /* * IA64 needs more stack space than other arches */ #if defined(__ia64__) pthread_attr_setstacksize (&conn_info->thread_attr, 400000); #else pthread_attr_setstacksize (&conn_info->thread_attr, 200000); #endif pthread_attr_setdetachstate (&conn_info->thread_attr, PTHREAD_CREATE_JOINABLE); res = pthread_create (&conn_info->thread, &conn_info->thread_attr, pthread_ipc_consumer, conn_info); /* * Security check - disallow multiple configurations of * the ipc connection */ if (conn_info->service == SOCKET_SERVICE_INIT) { conn_info->service = -1; } } else if (revent & POLLIN) { coroipcs_refcount_inc (conn_info); res = recv (fd, &buf, 1, MSG_NOSIGNAL); if (res == 1) { switch (buf) { case MESSAGE_REQ_OUTQ_FLUSH: outq_flush (conn_info); break; case MESSAGE_REQ_CHANGE_EUID: if (priv_change (conn_info) == -1) { ipc_disconnect (conn_info); } break; default: res = 0; break; } coroipcs_refcount_dec (conn_info); } #if defined(COROSYNC_SOLARIS) || defined(COROSYNC_BSD) || defined(COROSYNC_DARWIN) /* On many OSes poll never returns POLLHUP or POLLERR. * EOF is detected when recvmsg returns 0. */ if (res == 0) { ipc_disconnect (conn_info); return (0); } #endif } coroipcs_refcount_inc (conn_info); pthread_mutex_lock (&conn_info->mutex); if ((conn_info->state == CONN_STATE_THREAD_ACTIVE) && (revent & POLLOUT)) { buf = !list_empty (&conn_info->outq_head); for (; conn_info->pending_semops;) { res = send (conn_info->fd, &buf, 1, MSG_NOSIGNAL); if (res == 1) { conn_info->pending_semops--; } else { break; } } if (conn_info->notify_flow_control_enabled) { buf = 2; res = send (conn_info->fd, &buf, 1, MSG_NOSIGNAL); if (res == 1) { conn_info->notify_flow_control_enabled = 0; } } if (conn_info->notify_flow_control_enabled == 0 && conn_info->pending_semops == 0) { api->poll_dispatch_modify (conn_info->fd, POLLIN|POLLNVAL); } } pthread_mutex_unlock (&conn_info->mutex); coroipcs_refcount_dec (conn_info); return (0); } diff --git a/exec/coroipcs.h b/exec/coroipcs.h index 2b60dbe1..c59870d3 100644 --- a/exec/coroipcs.h +++ b/exec/coroipcs.h @@ -1,100 +1,100 @@ /* * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef COROIPCS_H_DEFINED #define COROIPCS_H_DEFINED #include #define SOCKET_SERVICE_INIT 0xFFFFFFFF struct iovec; typedef int (*coroipcs_init_fn_lvalue) (void *conn); typedef int (*coroipcs_exit_fn_lvalue) (void *conn); typedef void (*coroipcs_handler_fn_lvalue) (void *conn, void *msg); struct coroipcs_init_state { const char *socket_name; int sched_priority; void *(*malloc) (size_t size); void (*free) (void *ptr); void (*log_printf) ( const char *format, ...) __attribute__((format(printf, 1, 2))); int (*service_available)(unsigned int service); int (*private_data_size_get)(unsigned int service); int (*security_valid)(int uid, int gid); void (*serialize_lock)(void); void (*serialize_unlock)(void); int (*response_size_get)(unsigned int service, unsigned int id); int (*response_id_get)(unsigned int service, unsigned int id); int (*sending_allowed)(unsigned int service, unsigned int id, void *msg, void *sending_allowed_private_data); void (*sending_allowed_release)(void *sending_allowed_private_data); void (*poll_accept_add)(int fd); void (*poll_dispatch_add)(int fd, void *context); void (*poll_dispatch_modify)(int fd, int events); void (*fatal_error)(const char *error_msg); coroipcs_init_fn_lvalue (*init_fn_get)(unsigned int service); coroipcs_exit_fn_lvalue (*exit_fn_get)(unsigned int service); coroipcs_handler_fn_lvalue (*handler_fn_get)(unsigned int service, unsigned int id); }; extern void coroipcs_ipc_init ( struct coroipcs_init_state *init_state); extern void *coroipcs_private_data_get (void *conn); extern int coroipcs_response_send (void *conn, const void *msg, int mlen); extern int coroipcs_response_iov_send (void *conn, - const struct iovec *iov, int iov_len); + const struct iovec *iov, unsigned int iov_len); extern int coroipcs_dispatch_send (void *conn, const void *msg, int mlen); extern int coroipcs_dispatch_iov_send (void *conn, - const struct iovec *iov, int iov_len); + const struct iovec *iov, unsigned int iov_len); extern void coroipcs_refcount_inc (void *conn); extern void coroipcs_refcount_dec (void *conn); extern void coroipcs_ipc_exit (void); extern int coroipcs_handler_accept (int fd, int revent, void *context); extern int coroipcs_handler_dispatch (int fd, int revent, void *context); #endif /* COROIPCS_H_DEFINED */ diff --git a/exec/main.c b/exec/main.c index cd680ecc..548f7095 100644 --- 
a/exec/main.c +++ b/exec/main.c @@ -1,924 +1,924 @@ /* * Copyright (c) 2002-2006 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "quorum.h" #include "totemsrp.h" #include "mempool.h" #include "mainconfig.h" #include "totemconfig.h" #include "main.h" #include "sync.h" #include "tlist.h" #include "coroipcs.h" #include "timer.h" #include "util.h" #include "apidef.h" #include "service.h" #include "version.h" LOGSYS_DECLARE_SYSTEM ("corosync", LOG_MODE_OUTPUT_STDERR | LOG_MODE_THREADED | LOG_MODE_FORK, NULL, LOG_DAEMON, NULL, 1000000); LOGSYS_DECLARE_SUBSYS ("MAIN", LOG_INFO); #define SERVER_BACKLOG 5 static int sched_priority = 0; static unsigned int service_count = 32; #if defined(HAVE_PTHREAD_SPIN_LOCK) static pthread_spinlock_t serialize_spin; #else static pthread_mutex_t serialize_mutex = PTHREAD_MUTEX_INITIALIZER; #endif static struct totem_logging_configuration totem_logging_configuration; static char delivery_data[MESSAGE_SIZE_MAX]; static int num_config_modules; static struct config_iface_ver0 *config_modules[MAX_DYNAMIC_SERVICES]; static struct objdb_iface_ver0 *objdb = NULL; static struct corosync_api_v1 *api = NULL; static struct main_config main_config; unsigned long long *(*main_clm_get_by_nodeid) (unsigned int node_id); hdb_handle_t corosync_poll_handle; static void sigusr2_handler (int num) { int i; for (i = 0; ais_service[i]; i++) { if (ais_service[i]->exec_dump_fn) { ais_service[i]->exec_dump_fn (); } } } static void *corosync_exit (void *arg) __attribute__((__noreturn__)); static void *corosync_exit (void *arg) { if (api) { corosync_service_unlink_all (api); } #ifdef DEBUG_MEMPOOL int 
stats_inuse[MEMPOOL_GROUP_SIZE]; int stats_avail[MEMPOOL_GROUP_SIZE]; int stats_memoryused[MEMPOOL_GROUP_SIZE]; int i; mempool_getstats (stats_inuse, stats_avail, stats_memoryused); log_printf (LOG_LEVEL_DEBUG, "Memory pools:\n"); for (i = 0; i < MEMPOOL_GROUP_SIZE; i++) { log_printf (LOG_LEVEL_DEBUG, "order %d size %d inuse %d avail %d memory used %d\n", i, 1<name = ais_service[ais_service_index]->name; callbacks->sync_init = ais_service[ais_service_index]->sync_init; callbacks->sync_process = ais_service[ais_service_index]->sync_process; callbacks->sync_activate = ais_service[ais_service_index]->sync_activate; callbacks->sync_abort = ais_service[ais_service_index]->sync_abort; return (0); } static struct memb_ring_id corosync_ring_id; static void confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { int i; serialize_lock (); memcpy (&corosync_ring_id, ring_id, sizeof (struct memb_ring_id)); /* * Call configuration change for all services */ for (i = 0; i < service_count; i++) { if (ais_service[i] && ais_service[i]->confchg_fn) { ais_service[i]->confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } } serialize_unlock (); } static void priv_drop (void) { return; /* TODO: we are still not dropping privs */ setuid (main_config.uid); setegid (main_config.gid); } static void corosync_mempool_init (void) { int res; res = mempool_init (pool_sizes); if (res == ENOMEM) { log_printf (LOG_LEVEL_ERROR, "Couldn't allocate memory pools, not enough memory"); corosync_exit_error (AIS_DONE_MEMPOOL_INIT); } } static void corosync_tty_detach (void) { int fd; /* * Disconnect from TTY if this is not a debug run */ switch (fork ()) { case -1: corosync_exit_error (AIS_DONE_FORK); break; case 0: /* * child which is disconnected, run this process */ /* setset(); close (0); close (1); close (2); */ break; default: exit (0); break; } /* Create new session */ (void)setsid(); /* * Map stdin/out/err to /dev/null. */ fd = open("/dev/null", O_RDWR); if (fd >= 0) { /* dup2 to 0 / 1 / 2 (stdin / stdout / stderr) */ dup2(fd, STDIN_FILENO); /* 0 */ dup2(fd, STDOUT_FILENO); /* 1 */ dup2(fd, STDERR_FILENO); /* 2 */ /* Should be 0, but just in case it isn't... */ if (fd > 2) close(fd); } } static void corosync_setscheduler (void) { #if ! 
defined(TS_CLASS) && (defined(COROSYNC_BSD) || defined(COROSYNC_LINUX) || defined(COROSYNC_SOLARIS)) struct sched_param sched_param; int res; sched_priority = sched_get_priority_max (SCHED_RR); if (sched_priority != -1) { sched_param.sched_priority = sched_priority; res = sched_setscheduler (0, SCHED_RR, &sched_param); if (res == -1) { log_printf (LOG_LEVEL_WARNING, "Could not set SCHED_RR at priority %d: %s\n", sched_param.sched_priority, strerror (errno)); } } else { log_printf (LOG_LEVEL_WARNING, "Could not get maximum scheduler priority: %s\n", strerror (errno)); sched_priority = 0; } #else log_printf(LOG_LEVEL_WARNING, "Scheduler priority left to default value (no OS support)\n"); #endif } static void corosync_mlockall (void) { #if !defined(COROSYNC_BSD) int res; #endif struct rlimit rlimit; rlimit.rlim_cur = RLIM_INFINITY; rlimit.rlim_max = RLIM_INFINITY; #ifndef COROSYNC_SOLARIS setrlimit (RLIMIT_MEMLOCK, &rlimit); #else setrlimit (RLIMIT_VMEM, &rlimit); #endif #if defined(COROSYNC_BSD) /* under FreeBSD a process with locked pages cannot call dlopen; * code is disabled until FreeBSD bug i386/93396 is solved */ log_printf (LOG_LEVEL_WARNING, "Could not lock memory of service to avoid page faults\n"); #else res = mlockall (MCL_CURRENT | MCL_FUTURE); if (res == -1) { log_printf (LOG_LEVEL_WARNING, "Could not lock memory of service to avoid page faults: %s\n", strerror (errno)); } #endif } static void deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required) { mar_req_header_t *header; int pos = 0; int i; int service; int fn_id; /* * Build buffer without iovecs to make processing easier. * This is only used for messages which are multicast with iovecs * and self-delivered. All other mechanisms avoid the copy.
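* (a single-element iovec is dispatched in place from iovec[0].iov_base, while multi-element messages are first assembled into delivery_data)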
*/ if (iov_len > 1) { for (i = 0; i < iov_len; i++) { memcpy (&delivery_data[pos], iovec[i].iov_base, iovec[i].iov_len); pos += iovec[i].iov_len; assert (pos < MESSAGE_SIZE_MAX); } header = (mar_req_header_t *)delivery_data; } else { header = (mar_req_header_t *)iovec[0].iov_base; } if (endian_conversion_required) { header->id = swab32 (header->id); header->size = swab32 (header->size); } // assert(iovec->iov_len == header->size); /* * Call the proper executive handler */ service = header->id >> 16; fn_id = header->id & 0xffff; if (!ais_service[service]) return; serialize_lock(); if (endian_conversion_required) { assert(ais_service[service]->exec_engine[fn_id].exec_endian_convert_fn != NULL); ais_service[service]->exec_engine[fn_id].exec_endian_convert_fn (header); } ais_service[service]->exec_engine[fn_id].exec_handler_fn (header, nodeid); serialize_unlock(); } void main_get_config_modules(struct config_iface_ver0 ***modules, int *num) { *modules = config_modules; *num = num_config_modules; } int main_mcast ( struct iovec *iovec, - int iov_len, + unsigned int iov_len, unsigned int guarantee) { return (totempg_groups_mcast_joined (corosync_group_handle, iovec, iov_len, guarantee)); } int message_source_is_local (const mar_message_source_t *source) { int ret = 0; assert (source != NULL); if (source->nodeid == totempg_my_nodeid_get ()) { ret = 1; } return ret; } void message_source_set ( mar_message_source_t *source, void *conn) { assert ((source != NULL) && (conn != NULL)); memset (source, 0, sizeof (mar_message_source_t)); source->nodeid = totempg_my_nodeid_get (); source->conn = conn; } /* * Provides the glue from corosync to the IPC Service */ static int corosync_private_data_size_get (unsigned int service) { return (ais_service[service]->private_data_size); } static coroipcs_init_fn_lvalue corosync_init_fn_get (unsigned int service) { return (ais_service[service]->lib_init_fn); } static coroipcs_exit_fn_lvalue corosync_exit_fn_get (unsigned int service) { return (ais_service[service]->lib_exit_fn); } static coroipcs_handler_fn_lvalue corosync_handler_fn_get (unsigned int service, unsigned int id) { return (ais_service[service]->lib_engine[id].lib_handler_fn); } static int corosync_security_valid (int euid, int egid) { if (euid == 0 || egid == 0) { return (1); } if (euid == main_config.uid || egid == main_config.gid) { return (1); } return (0); } static int corosync_service_available (unsigned int service) { return (ais_service[service]); } static int corosync_response_size_get (unsigned int service, unsigned int id) { return (ais_service[service]->lib_engine[id].response_size); } static int corosync_response_id_get (unsigned int service, unsigned int id) { return (ais_service[service]->lib_engine[id].response_id); } struct sending_allowed_private_data_struct { int reserved_msgs; }; static int corosync_sending_allowed ( unsigned int service, unsigned int id, void *msg, void *sending_allowed_private_data) { struct sending_allowed_private_data_struct *pd = (struct sending_allowed_private_data_struct *)sending_allowed_private_data; struct iovec reserve_iovec; mar_req_header_t *header = (mar_req_header_t *)msg; int sending_allowed; reserve_iovec.iov_base = (char *)header; reserve_iovec.iov_len = header->size; pd->reserved_msgs = totempg_groups_joined_reserve ( corosync_group_handle, &reserve_iovec, 1); sending_allowed = (corosync_quorum_is_quorate() == 1 || ais_service[service]->allow_inquorate == CS_LIB_ALLOW_INQUORATE) && ((ais_service[service]->lib_engine[id].flow_control == 
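/* a request is allowed when the node is quorate (or the service permits inquorate operation) and either the handler requires no flow control, or the totem reservation succeeded and no synchronization is in progress */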
CS_LIB_FLOW_CONTROL_NOT_REQUIRED) || ((ais_service[service]->lib_engine[id].flow_control == CS_LIB_FLOW_CONTROL_REQUIRED) && (pd->reserved_msgs) && (sync_in_process() == 0))); return (sending_allowed); } static void corosync_sending_allowed_release (void *sending_allowed_private_data) { struct sending_allowed_private_data_struct *pd = (struct sending_allowed_private_data_struct *)sending_allowed_private_data; totempg_groups_joined_release (pd->reserved_msgs); } static int ipc_subsys_id = -1; static void ipc_log_printf (const char *format, ...) { va_list ap; va_start (ap, format); _logsys_log_printf (ipc_subsys_id, __FUNCTION__, __FILE__, __LINE__, LOG_LEVEL_ERROR, format, ap); va_end (ap); } static int corosync_poll_handler_accept ( hdb_handle_t handle, int fd, int revent, void *context) { return (coroipcs_handler_accept (fd, revent, context)); } static int corosync_poll_handler_dispatch ( hdb_handle_t handle, int fd, int revent, void *context) { return (coroipcs_handler_dispatch (fd, revent, context)); } static void corosync_poll_accept_add ( int fd) { poll_dispatch_add (corosync_poll_handle, fd, POLLIN|POLLNVAL, 0, corosync_poll_handler_accept); } static void corosync_poll_dispatch_add ( int fd, void *context) { poll_dispatch_add (corosync_poll_handle, fd, POLLIN|POLLNVAL, context, corosync_poll_handler_dispatch); } static void corosync_poll_dispatch_modify ( int fd, int events) { poll_dispatch_modify (corosync_poll_handle, fd, events, corosync_poll_handler_dispatch); } struct coroipcs_init_state ipc_init_state = { .socket_name = IPC_SOCKET_NAME, .malloc = malloc, .free = free, .log_printf = ipc_log_printf, .security_valid = corosync_security_valid, .service_available = corosync_service_available, .private_data_size_get = corosync_private_data_size_get, .serialize_lock = serialize_lock, .serialize_unlock = serialize_unlock, .sending_allowed = corosync_sending_allowed, .sending_allowed_release = corosync_sending_allowed_release, .response_size_get = corosync_response_size_get, .response_id_get = corosync_response_id_get, .poll_accept_add = corosync_poll_accept_add, .poll_dispatch_add = corosync_poll_dispatch_add, .poll_dispatch_modify = corosync_poll_dispatch_modify, .init_fn_get = corosync_init_fn_get, .exit_fn_get = corosync_exit_fn_get, .handler_fn_get = corosync_handler_fn_get }; int main (int argc, char **argv) { const char *error_string; struct totem_config totem_config; hdb_handle_t objdb_handle; hdb_handle_t config_handle; unsigned int config_version = 0; void *objdb_p; struct config_iface_ver0 *config; void *config_p; const char *config_iface_init; char *config_iface; char *iface; int res, ch; int background, setprio; #if defined(HAVE_PTHREAD_SPIN_LOCK) pthread_spin_init (&serialize_spin, 0); #endif /* default configuration */ background = 1; setprio = 1; while ((ch = getopt (argc, argv, "fp")) != EOF) { switch (ch) { case 'f': background = 0; logsys_config_mode_set (LOG_MODE_OUTPUT_STDERR|LOG_MODE_THREADED|LOG_MODE_FORK); break; case 'p': setprio = 0; break; default: fprintf(stderr, \ "usage:\n"\ " -f : Start application in foreground.\n"\ " -p : Do not set process priority. 
\n"); return EXIT_FAILURE; } } if (background) corosync_tty_detach (); log_printf (LOG_LEVEL_NOTICE, "Corosync Executive Service RELEASE '%s'\n", RELEASE_VERSION); log_printf (LOG_LEVEL_NOTICE, "Copyright (C) 2002-2006 MontaVista Software, Inc and contributors.\n"); log_printf (LOG_LEVEL_NOTICE, "Copyright (C) 2006-2008 Red Hat, Inc.\n"); (void)signal (SIGINT, sigintr_handler); (void)signal (SIGUSR2, sigusr2_handler); (void)signal (SIGSEGV, sigsegv_handler); (void)signal (SIGABRT, sigabrt_handler); (void)signal (SIGQUIT, sigquit_handler); #if MSG_NOSIGNAL == 0 (void)signal (SIGPIPE, SIG_IGN); #endif corosync_timer_init ( serialize_lock, serialize_unlock, sched_priority); log_printf (LOG_LEVEL_NOTICE, "Corosync Executive Service: started and ready to provide service.\n"); corosync_poll_handle = poll_create (); /* * Load the object database interface */ res = lcr_ifact_reference ( &objdb_handle, "objdb", 0, &objdb_p, 0); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "Corosync Executive couldn't open configuration object database component.\n"); corosync_exit_error (AIS_DONE_OBJDB); } objdb = (struct objdb_iface_ver0 *)objdb_p; objdb->objdb_init (); /* * Initialize the corosync_api_v1 definition */ apidef_init (objdb); api = apidef_get (); num_config_modules = 0; /* * Bootstrap in the default configuration parser or use * the corosync default built in parser if the configuration parser * isn't overridden */ config_iface_init = getenv("COROSYNC_DEFAULT_CONFIG_IFACE"); if (!config_iface_init) { config_iface_init = "corosync_parser"; } /* Make a copy so we can deface it with strtok */ config_iface = strdup(config_iface_init); iface = strtok(config_iface, ":"); while (iface) { res = lcr_ifact_reference ( &config_handle, iface, config_version, &config_p, 0); config = (struct config_iface_ver0 *)config_p; if (res == -1) { log_printf (LOG_LEVEL_ERROR, "Corosync Executive couldn't open configuration component '%s'\n", iface); corosync_exit_error (AIS_DONE_MAINCONFIGREAD); } res = config->config_readconfig(objdb, &error_string); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "%s", error_string); corosync_exit_error (AIS_DONE_MAINCONFIGREAD); } log_printf (LOG_LEVEL_NOTICE, "%s", error_string); config_modules[num_config_modules++] = config; iface = strtok(NULL, ":"); } if (config_iface) free(config_iface); res = corosync_main_config_read (objdb, &error_string, &main_config); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "%s", error_string); corosync_exit_error (AIS_DONE_MAINCONFIGREAD); } res = totem_config_read (objdb, &totem_config, &error_string); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "%s", error_string); corosync_exit_error (AIS_DONE_MAINCONFIGREAD); } res = totem_config_keyread (objdb, &totem_config, &error_string); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "%s", error_string); corosync_exit_error (AIS_DONE_MAINCONFIGREAD); } res = totem_config_validate (&totem_config, &error_string); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "%s", error_string); corosync_exit_error (AIS_DONE_MAINCONFIGREAD); } /* * Set round robin realtime scheduling with priority 99 * Lock all memory to avoid page faults which may interrupt * application healthchecking */ if (setprio) corosync_setscheduler (); corosync_mlockall (); totem_config.totem_logging_configuration = totem_logging_configuration; totem_config.totem_logging_configuration.log_subsys_id = _logsys_subsys_create ("TOTEM", LOG_INFO); totem_config.totem_logging_configuration.log_level_security = LOG_LEVEL_SECURITY; 
totem_config.totem_logging_configuration.log_level_error = LOG_LEVEL_ERROR; totem_config.totem_logging_configuration.log_level_warning = LOG_LEVEL_WARNING; totem_config.totem_logging_configuration.log_level_notice = LOG_LEVEL_NOTICE; totem_config.totem_logging_configuration.log_level_debug = LOG_LEVEL_DEBUG; totem_config.totem_logging_configuration.log_printf = _logsys_log_printf; /* * Sleep for a while to let other nodes in the cluster * understand that this node has been away (if it was * a corosync restart). */ // TODO what is this hack for? usleep(totem_config.token_timeout * 2000); /* * If totempg_initialize doesn't have root privileges, it cannot * bind to a specific interface. This only matters if * there is more than one interface in a system, so * in this case, only a warning is printed */ /* * Join multicast group and set up delivery * and configuration change functions */ totempg_initialize ( corosync_poll_handle, &totem_config); totempg_groups_initialize ( &corosync_group_handle, deliver_fn, confchg_fn); totempg_groups_join ( corosync_group_handle, &corosync_group, 1); /* * This must occur after totempg is initialized because "this_ip" must be set */ res = corosync_service_defaults_link_and_init (api); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "Could not initialize default services\n"); corosync_exit_error (AIS_DONE_INIT_SERVICES); } sync_register (corosync_sync_callbacks_retrieve, corosync_sync_completed); /* * Drop root privileges to user 'ais' * TODO: Don't really need full root capabilities; * needed capabilities are: * CAP_NET_RAW (bindtodevice) * CAP_SYS_NICE (setscheduler) * CAP_IPC_LOCK (mlockall) */ priv_drop (); corosync_mempool_init (); ipc_subsys_id = _logsys_subsys_create ("IPC", LOG_INFO); ipc_init_state.sched_priority = sched_priority; coroipcs_ipc_init (&ipc_init_state); /* * Start main processing loop */ poll_run (corosync_poll_handle); return EXIT_SUCCESS; } diff --git a/exec/main.h b/exec/main.h index a0a5c968..55885134 100644 --- a/exec/main.h +++ b/exec/main.h @@ -1,67 +1,67 @@ /* * Copyright (c) 2002-2006 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef MAIN_H_DEFINED #define MAIN_H_DEFINED #define TRUE 1 #define FALSE 0 #include #include #include #include #include #include #include extern struct totempg_group corosync_group; extern hdb_handle_t corosync_group_handle; extern hdb_handle_t corosync_poll_handle; extern unsigned long long *(*main_clm_get_by_nodeid) (unsigned int node_id); extern void main_get_config_modules(struct config_iface_ver0 ***modules, int *num); extern int main_mcast ( struct iovec *iovec, - int iov_len, + unsigned int iov_len, unsigned int guarantee); extern void message_source_set (mar_message_source_t *source, void *conn); extern int message_source_is_local (const mar_message_source_t *source); #endif /* MAIN_H_DEFINED */ diff --git a/exec/sync.c b/exec/sync.c index 273dc19c..21a1a67a 100644 --- a/exec/sync.c +++ b/exec/sync.c @@ -1,455 +1,455 @@ /* * Copyright (c) 2005-2006 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "quorum.h" #include "sync.h" LOGSYS_DECLARE_SUBSYS ("SYNC", LOG_INFO); #define MESSAGE_REQ_SYNC_BARRIER 0 struct barrier_data { unsigned int nodeid; int completed; }; static const struct memb_ring_id *sync_ring_id; static int vsf_none = 0; static int (*sync_callbacks_retrieve) (int sync_id, struct sync_callbacks *callack); static struct sync_callbacks sync_callbacks; static int sync_processing = 0; static void (*sync_synchronization_completed) (void); static int sync_recovery_index = 0; static void *sync_callback_token_handle = 0; static struct barrier_data barrier_data_confchg[PROCESSOR_COUNT_MAX]; static size_t barrier_data_confchg_entries; static struct barrier_data barrier_data_process[PROCESSOR_COUNT_MAX]; static struct openais_vsf_iface_ver0 *vsf_iface; static int sync_barrier_send (const struct memb_ring_id *ring_id); static int sync_start_process (enum totem_callback_token_type type, const void *data); static void sync_service_init (struct memb_ring_id *ring_id); static int sync_service_process (enum totem_callback_token_type type, const void *data); static void sync_deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required); static void sync_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); static void sync_primary_callback_fn ( const unsigned int *view_list, size_t view_list_entries, int primary_designated, const struct memb_ring_id *ring_id); static struct totempg_group sync_group = { .group = "sync", .group_len = 4 }; static hdb_handle_t sync_group_handle; struct req_exec_sync_barrier_start { mar_req_header_t header; struct memb_ring_id ring_id; }; /* * Send a barrier data structure */ static int sync_barrier_send (const struct memb_ring_id *ring_id) { struct req_exec_sync_barrier_start req_exec_sync_barrier_start; struct iovec iovec; int res; req_exec_sync_barrier_start.header.size = sizeof (struct req_exec_sync_barrier_start); req_exec_sync_barrier_start.header.id = MESSAGE_REQ_SYNC_BARRIER; memcpy (&req_exec_sync_barrier_start.ring_id, ring_id, sizeof (struct memb_ring_id)); iovec.iov_base = (char *)&req_exec_sync_barrier_start; iovec.iov_len = sizeof (req_exec_sync_barrier_start); res = totempg_groups_mcast_joined (sync_group_handle, &iovec, 1, TOTEMPG_AGREED); return (res); } static void sync_start_init (const struct memb_ring_id *ring_id) { totempg_callback_token_create ( &sync_callback_token_handle, TOTEM_CALLBACK_TOKEN_SENT, 0, /* don't delete after callback */ sync_start_process, ring_id); } static void sync_service_init (struct memb_ring_id *ring_id) { sync_callbacks.sync_init (); totempg_callback_token_destroy (&sync_callback_token_handle); /* * Create the token callback for the processing */ totempg_callback_token_create ( &sync_callback_token_handle, TOTEM_CALLBACK_TOKEN_SENT, 0, /* don't delete after callback */ sync_service_process, ring_id); } static int sync_start_process (enum totem_callback_token_type type, const void *data) { int res; const struct memb_ring_id *ring_id = data; res = sync_barrier_send (ring_id); if (res == 0) { /* * Delete the 
token callback for the barrier */ totempg_callback_token_destroy (&sync_callback_token_handle); } return (0); } static void sync_callbacks_load (void) { int res; for (;;) { res = sync_callbacks_retrieve (sync_recovery_index, &sync_callbacks); /* * No more service handlers have sync callbacks at this time */ if (res == -1) { sync_processing = 0; break; } sync_recovery_index += 1; if (sync_callbacks.sync_init) { break; } } } static int sync_service_process (enum totem_callback_token_type type, const void *data) { int res; const struct memb_ring_id *ring_id = data; /* * If the process operation is not from this ring id, ignore it and stop * processing */ if (memcmp (ring_id, sync_ring_id, sizeof (struct memb_ring_id)) != 0) { return (0); } /* * If process returns 0, then it's time to activate * and start the next service's synchronization */ res = sync_callbacks.sync_process (); if (res != 0) { return (0); } totempg_callback_token_destroy (&sync_callback_token_handle); sync_start_init (ring_id); return (0); } int sync_register ( int (*callbacks_retrieve) (int sync_id, struct sync_callbacks *callack), void (*synchronization_completed) (void)) { unsigned int res; res = totempg_groups_initialize ( &sync_group_handle, sync_deliver_fn, sync_confchg_fn); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "Couldn't initialize groups interface.\n"); return (-1); } res = totempg_groups_join ( sync_group_handle, &sync_group, 1); if (res == -1) { log_printf (LOG_LEVEL_ERROR, "Couldn't join group.\n"); return (-1); } sync_callbacks_retrieve = callbacks_retrieve; sync_synchronization_completed = synchronization_completed; return (0); } static void sync_primary_callback_fn ( const unsigned int *view_list, size_t view_list_entries, int primary_designated, const struct memb_ring_id *ring_id) { int i; if (primary_designated) { log_printf (LOG_LEVEL_DEBUG, "This node is within the primary component and will provide service.\n"); } else { log_printf (LOG_LEVEL_DEBUG, "This node is within the non-primary component and will NOT provide any services.\n"); return; } /* * Execute configuration change for synchronization service */ sync_processing = 1; totempg_callback_token_destroy (&sync_callback_token_handle); sync_recovery_index = 0; memset (&barrier_data_confchg, 0, sizeof (barrier_data_confchg)); for (i = 0; i < view_list_entries; i++) { barrier_data_confchg[i].nodeid = view_list[i]; barrier_data_confchg[i].completed = 0; } memcpy (barrier_data_process, barrier_data_confchg, sizeof (barrier_data_confchg)); barrier_data_confchg_entries = view_list_entries; sync_start_init (sync_ring_id); } static struct memb_ring_id deliver_ring_id; static void sync_endian_convert (struct req_exec_sync_barrier_start *req_exec_sync_barrier_start) { totemip_copy_endian_convert(&req_exec_sync_barrier_start->ring_id.rep, &req_exec_sync_barrier_start->ring_id.rep); req_exec_sync_barrier_start->ring_id.seq = swab64 (req_exec_sync_barrier_start->ring_id.seq); } static void sync_deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required) { struct req_exec_sync_barrier_start *req_exec_sync_barrier_start = (struct req_exec_sync_barrier_start *)iovec[0].iov_base; unsigned int barrier_completed; int i; log_printf (LOG_LEVEL_DEBUG, "confchg entries %d\n", barrier_data_confchg_entries); if (endian_conversion_required) { sync_endian_convert (req_exec_sync_barrier_start); } barrier_completed = 1; memcpy (&deliver_ring_id, &req_exec_sync_barrier_start->ring_id, sizeof (struct
memb_ring_id)); /* * Is this barrier from this configuration? If not, ignore it */ if (memcmp (&req_exec_sync_barrier_start->ring_id, sync_ring_id, sizeof (struct memb_ring_id)) != 0) { return; } /* * Set completion for source_addr's address */ for (i = 0; i < barrier_data_confchg_entries; i++) { if (nodeid == barrier_data_process[i].nodeid) { barrier_data_process[i].completed = 1; log_printf (LOG_LEVEL_DEBUG, "Barrier Start Received From %d\n", barrier_data_process[i].nodeid); break; } } /* * Test if barrier is complete */ for (i = 0; i < barrier_data_confchg_entries; i++) { log_printf (LOG_LEVEL_DEBUG, "Barrier completion status for nodeid %d = %d.\n", barrier_data_process[i].nodeid, barrier_data_process[i].completed); if (barrier_data_process[i].completed == 0) { barrier_completed = 0; } } if (barrier_completed) { log_printf (LOG_LEVEL_DEBUG, "Synchronization barrier completed\n"); } /* * This sync is complete so activate and start next service sync */ if (barrier_completed && sync_callbacks.sync_activate) { sync_callbacks.sync_activate (); log_printf (LOG_LEVEL_DEBUG, "Committing synchronization for (%s)\n", sync_callbacks.name); } /* * Start synchronization if the barrier has completed */ if (barrier_completed) { memcpy (barrier_data_process, barrier_data_confchg, sizeof (barrier_data_confchg)); sync_callbacks_load(); /* * if sync service found, execute it */ if (sync_processing && sync_callbacks.sync_init) { log_printf (LOG_LEVEL_DEBUG, "Synchronization actions starting for (%s)\n", sync_callbacks.name); sync_service_init (&deliver_ring_id); } } return; } static void sync_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { sync_ring_id = ring_id; if (configuration_type != TOTEM_CONFIGURATION_REGULAR) { return; } if (sync_processing && sync_callbacks.sync_abort != NULL) { sync_callbacks.sync_abort (); sync_callbacks.sync_activate = NULL; } sync_primary_callback_fn ( member_list, member_list_entries, 1, ring_id); } int sync_in_process (void) { return (sync_processing); } int sync_primary_designated (void) { return (1); } diff --git a/exec/totemmrp.c b/exec/totemmrp.c index 0c022479..02635ac2 100644 --- a/exec/totemmrp.c +++ b/exec/totemmrp.c @@ -1,231 +1,231 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission.
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "totemmrp.h" #include "totemsrp.h" hdb_handle_t totemsrp_handle_in; void totemmrp_deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required); void totemmrp_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); void (*pg_deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required) = 0; void (*pg_confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) = 0; void totemmrp_deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required) { pg_deliver_fn (nodeid, iovec, iov_len, endian_conversion_required); } void totemmrp_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { pg_confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } /* * Initialize the totem multiple ring protocol */ int totemmrp_initialize ( hdb_handle_t poll_handle, struct totem_config *totem_config, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)) { int result; pg_deliver_fn = deliver_fn; pg_confchg_fn = confchg_fn; result = totemsrp_initialize ( poll_handle, &totemsrp_handle_in, totem_config, totemmrp_deliver_fn, totemmrp_confchg_fn); return (result); } void totemmrp_finalize (void) { totemsrp_finalize (totemsrp_handle_in); } /* * Multicast a message */ int totemmrp_mcast ( struct iovec *iovec, - int iov_len, + unsigned int 
iov_len, int priority) { return totemsrp_mcast (totemsrp_handle_in, iovec, iov_len, priority); } /* * Return number of available messages that can be queued */ int totemmrp_avail (void) { return (totemsrp_avail (totemsrp_handle_in)); } int totemmrp_callback_token_create ( void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { return totemsrp_callback_token_create (totemsrp_handle_in, handle_out, type, delete, callback_fn, data); } void totemmrp_callback_token_destroy ( void *handle_out) { totemsrp_callback_token_destroy (totemsrp_handle_in, handle_out); } void totemmrp_new_msg_signal (void) { totemsrp_new_msg_signal (totemsrp_handle_in); } int totemmrp_ifaces_get ( unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count) { int res; res = totemsrp_ifaces_get ( totemsrp_handle_in, nodeid, interfaces, status, iface_count); return (res); } unsigned int totemmrp_my_nodeid_get (void) { return (totemsrp_my_nodeid_get (totemsrp_handle_in)); } int totemmrp_my_family_get (void) { return (totemsrp_my_family_get (totemsrp_handle_in)); } extern int totemmrp_ring_reenable (void) { int res; res = totemsrp_ring_reenable ( totemsrp_handle_in); return (res); } diff --git a/exec/totemmrp.h b/exec/totemmrp.h index e6d6483e..e1e02bf2 100644 --- a/exec/totemmrp.h +++ b/exec/totemmrp.h @@ -1,113 +1,113 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef TOTEMMRP_H_DEFINED #define TOTEMMRP_H_DEFINED #include /* * Totem Single Ring Protocol * depends on poll abstraction, POSIX, IPV4 */ /* * Initialize the logger */ extern void totemmrp_log_printf_init ( void (*log_printf) (int , char *, ...), int log_level_security, int log_level_error, int log_level_warning, int log_level_notice, int log_level_debug); /* * Initialize the group messaging interface */ extern int totemmrp_initialize ( hdb_handle_t poll_handle, struct totem_config *totem_config, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)); extern void totemmrp_finalize (void); /* * Multicast a message */ extern int totemmrp_mcast ( struct iovec *iovec, - int iov_len, + unsigned int iov_len, int priority); /* * Return number of available messages that can be queued */ extern int totemmrp_avail (void); extern int totemmrp_callback_token_create ( void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data); extern void totemmrp_callback_token_destroy ( void *handle_out); extern void totemmrp_new_msg_signal (void); extern int totemmrp_ifaces_get ( unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count); extern unsigned int totemmrp_my_nodeid_get (void); extern int totemmrp_my_family_get (void); extern int totemmrp_ring_reenable (void); #endif /* TOTEMMRP_H_DEFINED */ diff --git a/exec/totemnet.c b/exec/totemnet.c index 3bcef726..b50752ad 100644 --- a/exec/totemnet.c +++ b/exec/totemnet.c @@ -1,1512 +1,1512 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2008 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "totemnet.h" #include "wthread.h" #include "crypto.h" #define MCAST_SOCKET_BUFFER_SIZE (TRANSMITS_ALLOWED * FRAME_SIZE_MAX) #define NETIF_STATE_REPORT_UP 1 #define NETIF_STATE_REPORT_DOWN 2 #define BIND_STATE_UNBOUND 0 #define BIND_STATE_REGULAR 1 #define BIND_STATE_LOOPBACK 2 #define HMAC_HASH_SIZE 20 struct security_header { unsigned char hash_digest[HMAC_HASH_SIZE]; /* The hash *MUST* be first in the data structure */ unsigned char salt[16]; /* random number */ char msg[0]; } __attribute__((packed)); struct totemnet_mcast_thread_state { unsigned char iobuf[FRAME_SIZE_MAX]; prng_state prng_state; }; struct totemnet_socket { int mcast_recv; int mcast_send; int token; }; struct totemnet_instance { hmac_state totemnet_hmac_state; prng_state totemnet_prng_state; unsigned char totemnet_private_key[1024]; unsigned int totemnet_private_key_len; hdb_handle_t totemnet_poll_handle; struct totem_interface *totem_interface; int netif_state_report; int netif_bind_state; struct worker_thread_group worker_thread_group; void *context; void (*totemnet_deliver_fn) ( void *context, void *msg, int msg_len); void (*totemnet_iface_change_fn) ( void *context, struct totem_ip_address *iface_address); /* * Function and data used to log messages */ int totemnet_log_level_security; int totemnet_log_level_error; int totemnet_log_level_warning; int totemnet_log_level_notice; int totemnet_log_level_debug; int totemnet_subsys_id; void (*totemnet_log_printf) (int subsys, const char *function, const char *file, int line, unsigned int level, const char *format, ...)__attribute__((format(printf, 6, 7))); hdb_handle_t handle; char iov_buffer[FRAME_SIZE_MAX]; char iov_buffer_flush[FRAME_SIZE_MAX]; struct iovec totemnet_iov_recv; struct iovec totemnet_iov_recv_flush; struct totemnet_socket totemnet_sockets; struct totem_ip_address mcast_address; int stats_sent; int stats_recv; int stats_delv; int stats_remcasts; int stats_orf_token; struct timeval stats_tv_start; struct totem_ip_address my_id; int firstrun; poll_timer_handle timer_netif_check_timeout; unsigned int my_memb_entries; int flushing; struct totem_config *totem_config; struct totem_ip_address token_target; }; struct work_item { struct iovec iovec[20]; - int iov_len; + unsigned int iov_len; struct totemnet_instance *instance; }; static void netif_down_check (struct totemnet_instance *instance); static int totemnet_build_sockets ( struct totemnet_instance *instance, struct totem_ip_address *bindnet_address, struct totem_ip_address *mcastaddress, struct totemnet_socket *sockets, struct totem_ip_address *bound_to); static struct totem_ip_address localhost; /* * All instances in one database */ static struct hdb_handle_database totemnet_instance_database = { .handle_count = 0, 
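/* the instance database starts out empty; entries are added as totemnet instances are created */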
.handles = 0, .iterator = 0, .mutex = PTHREAD_MUTEX_INITIALIZER }; static void totemnet_instance_initialize (struct totemnet_instance *instance) { memset (instance, 0, sizeof (struct totemnet_instance)); instance->netif_state_report = NETIF_STATE_REPORT_UP | NETIF_STATE_REPORT_DOWN; instance->totemnet_iov_recv.iov_base = instance->iov_buffer; instance->totemnet_iov_recv.iov_len = FRAME_SIZE_MAX; //sizeof (instance->iov_buffer); instance->totemnet_iov_recv_flush.iov_base = instance->iov_buffer_flush; instance->totemnet_iov_recv_flush.iov_len = FRAME_SIZE_MAX; //sizeof (instance->iov_buffer); /* * There is always at least 1 processor */ instance->my_memb_entries = 1; } #define log_printf(level, format, args...) \ do { \ instance->totemnet_log_printf (instance->totemnet_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, \ level, (const char *)format, ##args); \ } while (0); static int authenticate_and_decrypt ( struct totemnet_instance *instance, struct iovec *iov) { unsigned char keys[48]; struct security_header *header = iov[0].iov_base; prng_state keygen_prng_state; prng_state stream_prng_state; unsigned char *hmac_key = &keys[32]; unsigned char *cipher_key = &keys[16]; unsigned char *initial_vector = &keys[0]; unsigned char digest_comparison[HMAC_HASH_SIZE]; unsigned long len; /* * Generate MAC, CIPHER, IV keys from private key */ memset (keys, 0, sizeof (keys)); sober128_start (&keygen_prng_state); sober128_add_entropy (instance->totemnet_private_key, instance->totemnet_private_key_len, &keygen_prng_state); sober128_add_entropy (header->salt, sizeof (header->salt), &keygen_prng_state); sober128_read (keys, sizeof (keys), &keygen_prng_state); /* * Setup stream cipher */ sober128_start (&stream_prng_state); sober128_add_entropy (cipher_key, 16, &stream_prng_state); sober128_add_entropy (initial_vector, 16, &stream_prng_state); /* * Authenticate contents of message */ hmac_init (&instance->totemnet_hmac_state, DIGEST_SHA1, hmac_key, 16); hmac_process (&instance->totemnet_hmac_state, (unsigned char *)iov->iov_base + HMAC_HASH_SIZE, iov->iov_len - HMAC_HASH_SIZE); len = hash_descriptor[DIGEST_SHA1]->hashsize; assert (HMAC_HASH_SIZE >= len); hmac_done (&instance->totemnet_hmac_state, digest_comparison, &len); if (memcmp (digest_comparison, header->hash_digest, len) != 0) { log_printf (instance->totemnet_log_level_security, "Received message has invalid digest...
ignoring.\n"); return (-1); } /* * Decrypt the contents of the message with the cipher key */ sober128_read ((unsigned char*)iov->iov_base + sizeof (struct security_header), iov->iov_len - sizeof (struct security_header), &stream_prng_state); return (0); } static void encrypt_and_sign_worker ( struct totemnet_instance *instance, unsigned char *buf, int *buf_len, struct iovec *iovec, - int iov_len, + unsigned int iov_len, prng_state *prng_state_in) { int i; unsigned char *addr; unsigned char keys[48]; struct security_header *header; unsigned char *hmac_key = &keys[32]; unsigned char *cipher_key = &keys[16]; unsigned char *initial_vector = &keys[0]; unsigned long len; int outlen = 0; hmac_state hmac_state; prng_state keygen_prng_state; prng_state stream_prng_state; header = (struct security_header *)buf; addr = buf + sizeof (struct security_header); memset (keys, 0, sizeof (keys)); memset (header->salt, 0, sizeof (header->salt)); /* * Generate MAC, CIPHER, IV keys from private key */ sober128_read (header->salt, sizeof (header->salt), prng_state_in); sober128_start (&keygen_prng_state); sober128_add_entropy (instance->totemnet_private_key, instance->totemnet_private_key_len, &keygen_prng_state); sober128_add_entropy (header->salt, sizeof (header->salt), &keygen_prng_state); sober128_read (keys, sizeof (keys), &keygen_prng_state); /* * Setup stream cipher */ sober128_start (&stream_prng_state); sober128_add_entropy (cipher_key, 16, &stream_prng_state); sober128_add_entropy (initial_vector, 16, &stream_prng_state); outlen = sizeof (struct security_header); /* * Copy remainder of message, then encrypt it */ for (i = 1; i < iov_len; i++) { memcpy (addr, iovec[i].iov_base, iovec[i].iov_len); addr += iovec[i].iov_len; outlen += iovec[i].iov_len; } /* * Encrypt message by XORing stream cipher data */ sober128_read (buf + sizeof (struct security_header), outlen - sizeof (struct security_header), &stream_prng_state); memset (&hmac_state, 0, sizeof (hmac_state)); /* * Sign the contents of the message with the hmac key and store signature in message */ hmac_init (&hmac_state, DIGEST_SHA1, hmac_key, 16); hmac_process (&hmac_state, buf + HMAC_HASH_SIZE, outlen - HMAC_HASH_SIZE); len = hash_descriptor[DIGEST_SHA1]->hashsize; hmac_done (&hmac_state, header->hash_digest, &len); *buf_len = outlen; } static inline void ucast_sendmsg ( struct totemnet_instance *instance, struct totem_ip_address *system_to, struct iovec *iovec_in, - int iov_len_in) + unsigned int iov_len_in) { struct msghdr msg_ucast; int res = 0; int buf_len; unsigned char sheader[sizeof (struct security_header)]; unsigned char encrypt_data[FRAME_SIZE_MAX]; struct iovec iovec_encrypt[20]; struct iovec *iovec_sendmsg; struct sockaddr_storage sockaddr; - int iov_len; + unsigned int iov_len; int addrlen; if (instance->totem_config->secauth == 1) { iovec_encrypt[0].iov_base = sheader; iovec_encrypt[0].iov_len = sizeof (struct security_header); memcpy (&iovec_encrypt[1], &iovec_in[0], sizeof (struct iovec) * iov_len_in); /* * Encrypt and digest the message */ encrypt_and_sign_worker ( instance, encrypt_data, &buf_len, iovec_encrypt, iov_len_in + 1, &instance->totemnet_prng_state); iovec_encrypt[0].iov_base = encrypt_data; iovec_encrypt[0].iov_len = buf_len; iovec_sendmsg = &iovec_encrypt[0]; iov_len = 1; } else { iovec_sendmsg = iovec_in; iov_len = iov_len_in; } /* * Build unicast message */ totemip_totemip_to_sockaddr_convert(system_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); msg_ucast.msg_name = &sockaddr; msg_ucast.msg_namelen 
= addrlen; msg_ucast.msg_iov = iovec_sendmsg; msg_ucast.msg_iovlen = iov_len; msg_ucast.msg_control = 0; msg_ucast.msg_controllen = 0; msg_ucast.msg_flags = 0; /* * Transmit unicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->totemnet_sockets.mcast_send, &msg_ucast, MSG_NOSIGNAL); } static inline void mcast_sendmsg ( struct totemnet_instance *instance, struct iovec *iovec_in, - int iov_len_in) + unsigned int iov_len_in) { struct msghdr msg_mcast; int res = 0; int buf_len; unsigned char sheader[sizeof (struct security_header)]; unsigned char encrypt_data[FRAME_SIZE_MAX]; struct iovec iovec_encrypt[20]; struct iovec *iovec_sendmsg; struct sockaddr_storage sockaddr; - int iov_len; + unsigned int iov_len; int addrlen; if (instance->totem_config->secauth == 1) { iovec_encrypt[0].iov_base = sheader; iovec_encrypt[0].iov_len = sizeof (struct security_header); memcpy (&iovec_encrypt[1], &iovec_in[0], sizeof (struct iovec) * iov_len_in); /* * Encrypt and digest the message */ encrypt_and_sign_worker ( instance, encrypt_data, &buf_len, iovec_encrypt, iov_len_in + 1, &instance->totemnet_prng_state); iovec_encrypt[0].iov_base = encrypt_data; iovec_encrypt[0].iov_len = buf_len; iovec_sendmsg = &iovec_encrypt[0]; iov_len = 1; } else { iovec_sendmsg = iovec_in; iov_len = iov_len_in; } /* * Build multicast message */ totemip_totemip_to_sockaddr_convert(&instance->mcast_address, instance->totem_interface->ip_port, &sockaddr, &addrlen); msg_mcast.msg_name = &sockaddr; msg_mcast.msg_namelen = addrlen; msg_mcast.msg_iov = iovec_sendmsg; msg_mcast.msg_iovlen = iov_len; msg_mcast.msg_control = 0; msg_mcast.msg_controllen = 0; msg_mcast.msg_flags = 0; /* * Transmit multicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->totemnet_sockets.mcast_send, &msg_mcast, MSG_NOSIGNAL); } static void totemnet_mcast_thread_state_constructor ( void *totemnet_mcast_thread_state_in) { struct totemnet_mcast_thread_state *totemnet_mcast_thread_state = (struct totemnet_mcast_thread_state *)totemnet_mcast_thread_state_in; memset (totemnet_mcast_thread_state, 0, sizeof (struct totemnet_mcast_thread_state)); rng_make_prng (128, PRNG_SOBER, &totemnet_mcast_thread_state->prng_state, NULL); } static void totemnet_mcast_worker_fn (void *thread_state, void *work_item_in) { struct work_item *work_item = (struct work_item *)work_item_in; struct totemnet_mcast_thread_state *totemnet_mcast_thread_state = (struct totemnet_mcast_thread_state *)thread_state; struct totemnet_instance *instance = work_item->instance; struct msghdr msg_mcast; unsigned char sheader[sizeof (struct security_header)]; int res = 0; int buf_len; struct iovec iovec_encrypted; struct iovec *iovec_sendmsg; struct sockaddr_storage sockaddr; unsigned int iovs; int addrlen; if (instance->totem_config->secauth == 1) { memmove (&work_item->iovec[1], &work_item->iovec[0], work_item->iov_len * sizeof (struct iovec)); work_item->iovec[0].iov_base = sheader; work_item->iovec[0].iov_len = sizeof (struct security_header); /* * Encrypt and digest the message */ encrypt_and_sign_worker ( instance, totemnet_mcast_thread_state->iobuf, &buf_len, work_item->iovec, work_item->iov_len + 1, &totemnet_mcast_thread_state->prng_state); iovec_sendmsg = &iovec_encrypted; iovec_sendmsg->iov_base = totemnet_mcast_thread_state->iobuf; iovec_sendmsg->iov_len = buf_len; iovs = 1; } else { iovec_sendmsg = work_item->iovec; iovs = work_item->iov_len; } totemip_totemip_to_sockaddr_convert(&instance->mcast_address,
instance->totem_interface->ip_port, &sockaddr, &addrlen); msg_mcast.msg_name = &sockaddr; msg_mcast.msg_namelen = addrlen; msg_mcast.msg_iov = iovec_sendmsg; msg_mcast.msg_iovlen = iovs; msg_mcast.msg_control = 0; msg_mcast.msg_controllen = 0; msg_mcast.msg_flags = 0; /* * Transmit multicast message * An error here is recovered by totemsrp */ res = sendmsg (instance->totemnet_sockets.mcast_send, &msg_mcast, MSG_NOSIGNAL); if (res > 0) { instance->stats_sent += res; } } int totemnet_finalize ( hdb_handle_t handle) { struct totemnet_instance *instance; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } worker_thread_group_exit (&instance->worker_thread_group); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } /* * Only designed to work with a message with one iov */ static int net_deliver_fn ( hdb_handle_t handle, int fd, int revents, void *data) { struct totemnet_instance *instance = (struct totemnet_instance *)data; struct msghdr msg_recv; struct iovec *iovec; struct security_header *security_header; struct sockaddr_storage system_from; int bytes_received; int res = 0; unsigned char *msg_offset; unsigned int size_delv; if (instance->flushing == 1) { iovec = &instance->totemnet_iov_recv_flush; } else { iovec = &instance->totemnet_iov_recv; } /* * Receive datagram */ msg_recv.msg_name = &system_from; msg_recv.msg_namelen = sizeof (struct sockaddr_storage); msg_recv.msg_iov = iovec; msg_recv.msg_iovlen = 1; msg_recv.msg_control = 0; msg_recv.msg_controllen = 0; msg_recv.msg_flags = 0; bytes_received = recvmsg (fd, &msg_recv, MSG_NOSIGNAL | MSG_DONTWAIT); if (bytes_received == -1) { return (0); } else { instance->stats_recv += bytes_received; } if ((instance->totem_config->secauth == 1) && (bytes_received < sizeof (struct security_header))) { log_printf (instance->totemnet_log_level_security, "Received message is too short... ignoring %d.\n", bytes_received); return (0); } security_header = (struct security_header *)iovec->iov_base; iovec->iov_len = bytes_received; if (instance->totem_config->secauth == 1) { /* * Authenticate and if authenticated, decrypt datagram */ res = authenticate_and_decrypt (instance, iovec); if (res == -1) { log_printf (instance->totemnet_log_level_security, "Invalid packet data\n"); iovec->iov_len = FRAME_SIZE_MAX; return 0; } msg_offset = (unsigned char *)iovec->iov_base + sizeof (struct security_header); size_delv = bytes_received - sizeof (struct security_header); } else { msg_offset = iovec->iov_base; size_delv = bytes_received; } /* * Handle incoming message */ instance->totemnet_deliver_fn ( instance->context, msg_offset, size_delv); iovec->iov_len = FRAME_SIZE_MAX; return (0); } static int netif_determine ( struct totemnet_instance *instance, struct totem_ip_address *bindnet, struct totem_ip_address *bound_to, int *interface_up, int *interface_num) { int res; res = totemip_iface_check (bindnet, bound_to, interface_up, interface_num, - instance->totem_config->clear_node_high_bit); + 0); // TODO andrew can address this return (res); } /* * If the interface is up, the sockets for totem are built. If the interface is down * this function is requeued in the timer list to retry building the sockets later.
*/ static void timer_function_netif_check_timeout ( void *data) { struct totemnet_instance *instance = (struct totemnet_instance *)data; int res; int interface_up; int interface_num; struct totem_ip_address *bind_address; /* * Build sockets for every interface */ netif_determine (instance, &instance->totem_interface->bindnet, &instance->totem_interface->boundto, &interface_up, &interface_num); /* * If the network interface isn't back up and we are already * in loopback mode, add timer to check again and return */ if ((instance->netif_bind_state == BIND_STATE_LOOPBACK && interface_up == 0) || (instance->my_memb_entries == 1 && instance->netif_bind_state == BIND_STATE_REGULAR && interface_up == 1)) { poll_timer_add (instance->totemnet_poll_handle, instance->totem_config->downcheck_timeout, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); /* * Add a timer to check for a downed regular interface */ return; } if (instance->totemnet_sockets.mcast_recv > 0) { close (instance->totemnet_sockets.mcast_recv); poll_dispatch_delete (instance->totemnet_poll_handle, instance->totemnet_sockets.mcast_recv); } if (instance->totemnet_sockets.mcast_send > 0) { close (instance->totemnet_sockets.mcast_send); } if (instance->totemnet_sockets.token > 0) { close (instance->totemnet_sockets.token); poll_dispatch_delete (instance->totemnet_poll_handle, instance->totemnet_sockets.token); } if (interface_up == 0) { /* * Interface is not up */ instance->netif_bind_state = BIND_STATE_LOOPBACK; bind_address = &localhost; /* * Add a timer to retry building interfaces and request memb_gather_enter */ poll_timer_add (instance->totemnet_poll_handle, instance->totem_config->downcheck_timeout, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } else { /* * Interface is up */ instance->netif_bind_state = BIND_STATE_REGULAR; bind_address = &instance->totem_interface->bindnet; } /* * Create and bind the multicast and unicast sockets */ res = totemnet_build_sockets (instance, &instance->mcast_address, bind_address, &instance->totemnet_sockets, &instance->totem_interface->boundto); poll_dispatch_add ( instance->totemnet_poll_handle, instance->totemnet_sockets.mcast_recv, POLLIN, instance, net_deliver_fn); poll_dispatch_add ( instance->totemnet_poll_handle, instance->totemnet_sockets.token, POLLIN, instance, net_deliver_fn); totemip_copy (&instance->my_id, &instance->totem_interface->boundto); /* * This reports changes in the interface to the user and totemsrp */ if (instance->netif_bind_state == BIND_STATE_REGULAR) { if (instance->netif_state_report & NETIF_STATE_REPORT_UP) { log_printf (instance->totemnet_log_level_notice, "The network interface [%s] is now up.\n", totemip_print (&instance->totem_interface->boundto)); instance->netif_state_report = NETIF_STATE_REPORT_DOWN; instance->totemnet_iface_change_fn (instance->context, &instance->my_id); } /* * Add a timer to check for interface going down in single membership */ if (instance->my_memb_entries == 1) { poll_timer_add (instance->totemnet_poll_handle, instance->totem_config->downcheck_timeout, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } } else { if (instance->netif_state_report & NETIF_STATE_REPORT_DOWN) { log_printf (instance->totemnet_log_level_notice, "The network interface is down.\n"); instance->totemnet_iface_change_fn (instance->context, &instance->my_id); } instance->netif_state_report = NETIF_STATE_REPORT_UP; } } /* * Check if an 
interface is down and reconfigure * totemnet waiting for it to come back up */ static void netif_down_check (struct totemnet_instance *instance) { timer_function_netif_check_timeout (instance); } /* Set the socket priority to INTERACTIVE to ensure that our messages don't get queued behind anything else */ static void totemnet_traffic_control_set(struct totemnet_instance *instance, int sock) { #ifdef SO_PRIORITY int prio = 6; /* TC_PRIO_INTERACTIVE */ if (setsockopt(sock, SOL_SOCKET, SO_PRIORITY, &prio, sizeof(int))) log_printf (instance->totemnet_log_level_warning, "Could not set traffic priority. (%s)\n", strerror (errno)); #endif } static int totemnet_build_sockets_ip ( struct totemnet_instance *instance, struct totem_ip_address *mcast_address, struct totem_ip_address *bindnet_address, struct totemnet_socket *sockets, struct totem_ip_address *bound_to, int interface_num) { struct sockaddr_storage sockaddr; struct ipv6_mreq mreq6; struct ip_mreq mreq; struct sockaddr_storage mcast_ss, boundto_ss; struct sockaddr_in6 *mcast_sin6 = (struct sockaddr_in6 *)&mcast_ss; struct sockaddr_in *mcast_sin = (struct sockaddr_in *)&mcast_ss; struct sockaddr_in *boundto_sin = (struct sockaddr_in *)&boundto_ss; unsigned int sendbuf_size; unsigned int recvbuf_size; unsigned int optlen = sizeof (sendbuf_size); int addrlen; int res; int flag; /* * Create multicast recv socket */ sockets->mcast_recv = socket (bindnet_address->family, SOCK_DGRAM, 0); if (sockets->mcast_recv == -1) { perror ("socket"); return (-1); } totemip_nosigpipe (sockets->mcast_recv); res = fcntl (sockets->mcast_recv, F_SETFL, O_NONBLOCK); if (res == -1) { log_printf (instance->totemnet_log_level_warning, "Could not set non-blocking operation on multicast socket: %s\n", strerror (errno)); return (-1); } /* * Force reuse */ flag = 1; if ( setsockopt(sockets->mcast_recv, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof (flag)) < 0) { perror("setsockopt reuseaddr"); return (-1); } /* * Bind to multicast socket used for multicast receives */ totemip_totemip_to_sockaddr_convert(mcast_address, instance->totem_interface->ip_port, &sockaddr, &addrlen); res = bind (sockets->mcast_recv, (struct sockaddr *)&sockaddr, addrlen); if (res == -1) { perror ("bind mcast recv socket failed"); return (-1); } /* * Setup mcast send socket */ sockets->mcast_send = socket (bindnet_address->family, SOCK_DGRAM, 0); if (sockets->mcast_send == -1) { perror ("socket"); return (-1); } totemip_nosigpipe (sockets->mcast_send); res = fcntl (sockets->mcast_send, F_SETFL, O_NONBLOCK); if (res == -1) { log_printf (instance->totemnet_log_level_warning, "Could not set non-blocking operation on multicast socket: %s\n", strerror (errno)); return (-1); } /* * Force reuse */ flag = 1; if ( setsockopt(sockets->mcast_send, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof (flag)) < 0) { perror("setsockopt reuseaddr"); return (-1); } totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port - 1, &sockaddr, &addrlen); res = bind (sockets->mcast_send, (struct sockaddr *)&sockaddr, addrlen); if (res == -1) { perror ("bind mcast send socket failed"); return (-1); } /* * Setup unicast socket */ sockets->token = socket (bindnet_address->family, SOCK_DGRAM, 0); if (sockets->token == -1) { perror ("socket2"); return (-1); } totemip_nosigpipe (sockets->token); res = fcntl (sockets->token, F_SETFL, O_NONBLOCK); if (res == -1) { log_printf (instance->totemnet_log_level_warning, "Could not set non-blocking operation on token socket: %s\n", strerror (errno)); return (-1); } 
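/* * The token socket is configured like the two multicast sockets above: forced address reuse, then a bind () to the bound-to address, which also selects the interface that token unicasts leave from. */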
/* * Force reuse */ flag = 1; if ( setsockopt(sockets->token, SOL_SOCKET, SO_REUSEADDR, (char *)&flag, sizeof (flag)) < 0) { perror("setsockopt reuseaddr"); return (-1); } /* * Bind to unicast socket used for token send/receives * This has the side effect of binding to the correct interface */ totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port, &sockaddr, &addrlen); res = bind (sockets->token, (struct sockaddr *)&sockaddr, addrlen); if (res == -1) { perror ("bind token socket failed"); return (-1); } recvbuf_size = MCAST_SOCKET_BUFFER_SIZE; sendbuf_size = MCAST_SOCKET_BUFFER_SIZE; /* * Set buffer sizes to avoid overruns */ res = setsockopt (sockets->mcast_recv, SOL_SOCKET, SO_RCVBUF, &recvbuf_size, optlen); res = setsockopt (sockets->mcast_send, SOL_SOCKET, SO_SNDBUF, &sendbuf_size, optlen); res = getsockopt (sockets->mcast_recv, SOL_SOCKET, SO_RCVBUF, &recvbuf_size, &optlen); if (res == 0) { log_printf (instance->totemnet_log_level_notice, "Receive multicast socket recv buffer size (%d bytes).\n", recvbuf_size); } res = getsockopt (sockets->mcast_send, SOL_SOCKET, SO_SNDBUF, &sendbuf_size, &optlen); if (res == 0) { log_printf (instance->totemnet_log_level_notice, "Transmit multicast socket send buffer size (%d bytes).\n", sendbuf_size); } /* * Join group membership on socket */ totemip_totemip_to_sockaddr_convert(mcast_address, instance->totem_interface->ip_port, &mcast_ss, &addrlen); totemip_totemip_to_sockaddr_convert(bound_to, instance->totem_interface->ip_port, &boundto_ss, &addrlen); switch ( bindnet_address->family ) { case AF_INET: memset(&mreq, 0, sizeof(mreq)); mreq.imr_multiaddr.s_addr = mcast_sin->sin_addr.s_addr; mreq.imr_interface.s_addr = boundto_sin->sin_addr.s_addr; res = setsockopt (sockets->mcast_recv, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof (mreq)); if (res == -1) { perror ("join ipv4 multicast group failed"); return (-1); } break; case AF_INET6: memset(&mreq6, 0, sizeof(mreq6)); memcpy(&mreq6.ipv6mr_multiaddr, &mcast_sin6->sin6_addr, sizeof(struct in6_addr)); mreq6.ipv6mr_interface = interface_num; res = setsockopt (sockets->mcast_recv, IPPROTO_IPV6, IPV6_JOIN_GROUP, &mreq6, sizeof (mreq6)); if (res == -1) { perror ("join ipv6 multicast group failed"); return (-1); } break; } /* * Turn on multicast loopback */ flag = 1; switch ( bindnet_address->family ) { case AF_INET: res = setsockopt (sockets->mcast_send, IPPROTO_IP, IP_MULTICAST_LOOP, &flag, sizeof (flag)); break; case AF_INET6: res = setsockopt (sockets->mcast_send, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, &flag, sizeof (flag)); } if (res == -1) { perror ("turn on multicast loopback failed"); return (-1); } /* * Set multicast packets TTL */ if ( bindnet_address->family == AF_INET6 ) { flag = 255; res = setsockopt (sockets->mcast_send, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, &flag, sizeof (flag)); if (res == -1) { perror ("set mcast hops"); return (-1); } } /* * Bind to a specific interface for multicast send and receive */ switch ( bindnet_address->family ) { case AF_INET: if (setsockopt (sockets->mcast_send, IPPROTO_IP, IP_MULTICAST_IF, &boundto_sin->sin_addr, sizeof (boundto_sin->sin_addr)) < 0) { perror ("cannot select interface"); return (-1); } if (setsockopt (sockets->mcast_recv, IPPROTO_IP, IP_MULTICAST_IF, &boundto_sin->sin_addr, sizeof (boundto_sin->sin_addr)) < 0) { perror ("cannot select interface"); return (-1); } break; case AF_INET6: if (setsockopt (sockets->mcast_send, IPPROTO_IPV6, IPV6_MULTICAST_IF, &interface_num, sizeof (interface_num)) < 0) { perror ("cannot select interface"); return
(-1); } if (setsockopt (sockets->mcast_recv, IPPROTO_IPV6, IPV6_MULTICAST_IF, &interface_num, sizeof (interface_num)) < 0) { perror ("cannot select interface"); return (-1); } break; } return 0; } static int totemnet_build_sockets ( struct totemnet_instance *instance, struct totem_ip_address *mcast_address, struct totem_ip_address *bindnet_address, struct totemnet_socket *sockets, struct totem_ip_address *bound_to) { int interface_num; int interface_up; int res; /* * Determine the ip address bound to and the interface name */ res = netif_determine (instance, bindnet_address, bound_to, &interface_up, &interface_num); if (res == -1) { return (-1); } totemip_copy(&instance->my_id, bound_to); res = totemnet_build_sockets_ip (instance, mcast_address, bindnet_address, sockets, bound_to, interface_num); /* We only send out of the token socket */ totemnet_traffic_control_set(instance, sockets->token); return res; } /* * Totem Network interface - also does encryption/decryption * depends on poll abstraction, POSIX, IPV4 */ /* * Create an instance */ int totemnet_initialize ( hdb_handle_t poll_handle, hdb_handle_t *handle, struct totem_config *totem_config, int interface_no, void *context, void (*deliver_fn) ( void *context, void *msg, int msg_len), void (*iface_change_fn) ( void *context, struct totem_ip_address *iface_address)) { struct totemnet_instance *instance; unsigned int res; res = hdb_handle_create (&totemnet_instance_database, sizeof (struct totemnet_instance), handle); if (res != 0) { goto error_exit; } res = hdb_handle_get (&totemnet_instance_database, *handle, (void *)&instance); if (res != 0) { goto error_destroy; } totemnet_instance_initialize (instance); instance->totem_config = totem_config; /* * Configure logging */ instance->totemnet_log_level_security = 1; //totem_config->totem_logging_configuration.log_level_security; instance->totemnet_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemnet_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemnet_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemnet_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemnet_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemnet_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Initialize random number generator for later use to generate salt */ memcpy (instance->totemnet_private_key, totem_config->private_key, totem_config->private_key_len); instance->totemnet_private_key_len = totem_config->private_key_len; rng_make_prng (128, PRNG_SOBER, &instance->totemnet_prng_state, NULL); /* * Initialize local variables for totemnet */ instance->totem_interface = &totem_config->interfaces[interface_no]; totemip_copy (&instance->mcast_address, &instance->totem_interface->mcast_addr); memset (instance->iov_buffer, 0, FRAME_SIZE_MAX); /* * If threaded send requested, initialize thread group data structure */ if (totem_config->threads) { worker_thread_group_init ( &instance->worker_thread_group, totem_config->threads, 128, sizeof (struct work_item), sizeof (struct totemnet_mcast_thread_state), totemnet_mcast_thread_state_constructor, totemnet_mcast_worker_fn); } instance->totemnet_poll_handle = poll_handle; instance->totem_interface->bindnet.nodeid = instance->totem_config->node_id; instance->context = context; instance->totemnet_deliver_fn = deliver_fn; instance->totemnet_iface_change_fn = 
iface_change_fn; instance->handle = *handle; rng_make_prng (128, PRNG_SOBER, &instance->totemnet_prng_state, NULL); totemip_localhost (instance->mcast_address.family, &localhost); netif_down_check (instance); error_exit: hdb_handle_put (&totemnet_instance_database, *handle); return (0); error_destroy: hdb_handle_destroy (&totemnet_instance_database, *handle); return (-1); } int totemnet_processor_count_set ( hdb_handle_t handle, int processor_count) { struct totemnet_instance *instance; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } instance->my_memb_entries = processor_count; poll_timer_delete (instance->totemnet_poll_handle, instance->timer_netif_check_timeout); if (processor_count == 1) { poll_timer_add (instance->totemnet_poll_handle, instance->totem_config->downcheck_timeout, (void *)instance, timer_function_netif_check_timeout, &instance->timer_netif_check_timeout); } hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } int totemnet_recv_flush (hdb_handle_t handle) { struct totemnet_instance *instance; struct pollfd ufd; int nfds; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } instance->flushing = 1; do { ufd.fd = instance->totemnet_sockets.mcast_recv; ufd.events = POLLIN; nfds = poll (&ufd, 1, 0); if (nfds == 1 && ufd.revents & POLLIN) { net_deliver_fn (0, instance->totemnet_sockets.mcast_recv, ufd.revents, instance); } } while (nfds == 1); instance->flushing = 0; hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } int totemnet_send_flush (hdb_handle_t handle) { struct totemnet_instance *instance; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } worker_thread_group_wait (&instance->worker_thread_group); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } int totemnet_token_send ( hdb_handle_t handle, struct iovec *iovec, - int iov_len) + unsigned int iov_len) { struct totemnet_instance *instance; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } ucast_sendmsg (instance, &instance->token_target, iovec, iov_len); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } int totemnet_mcast_flush_send ( hdb_handle_t handle, struct iovec *iovec, unsigned int iov_len) { struct totemnet_instance *instance; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } mcast_sendmsg (instance, iovec, iov_len); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } int totemnet_mcast_noflush_send ( hdb_handle_t handle, struct iovec *iovec, unsigned int iov_len) { struct totemnet_instance *instance; struct work_item work_item; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } if (instance->totem_config->threads) { memcpy (&work_item.iovec[0], iovec, iov_len * sizeof (struct iovec)); work_item.iov_len = iov_len; work_item.instance = instance; worker_thread_group_work_add (&instance->worker_thread_group, &work_item); } else { mcast_sendmsg (instance, iovec, iov_len); } hdb_handle_put (&totemnet_instance_database, handle); 
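/* * Note: worker_thread_group_work_add copies the work item, and with it the iovec array, but not the buffers the iovecs point at; the worker thread reads those later when it encrypts and transmits, so they must remain valid until the queued multicast is processed. */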
error_exit: return (res); } extern int totemnet_iface_check (hdb_handle_t handle) { struct totemnet_instance *instance; int res = 0; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { res = ENOENT; goto error_exit; } timer_function_netif_check_timeout (instance); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } extern void totemnet_net_mtu_adjust (struct totem_config *totem_config) { #define UDPIP_HEADER_SIZE (20 + 8) /* 20 bytes for ip 8 bytes for udp */ if (totem_config->secauth == 1) { totem_config->net_mtu -= sizeof (struct security_header) + UDPIP_HEADER_SIZE; } else { totem_config->net_mtu -= UDPIP_HEADER_SIZE; } } const char *totemnet_iface_print (hdb_handle_t handle) { struct totemnet_instance *instance; int res = 0; const char *ret_char; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { ret_char = "Invalid totemnet handle"; goto error_exit; } ret_char = totemip_print (&instance->my_id); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (ret_char); } int totemnet_iface_get ( hdb_handle_t handle, struct totem_ip_address *addr) { struct totemnet_instance *instance; unsigned int res; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } memcpy (addr, &instance->my_id, sizeof (struct totem_ip_address)); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } int totemnet_token_target_set ( hdb_handle_t handle, struct totem_ip_address *token_target) { struct totemnet_instance *instance; unsigned int res; res = hdb_handle_get (&totemnet_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } memcpy (&instance->token_target, token_target, sizeof (struct totem_ip_address)); hdb_handle_put (&totemnet_instance_database, handle); error_exit: return (res); } diff --git a/exec/totemnet.h b/exec/totemnet.h index 606b40f4..e7a3b8c9 100644 --- a/exec/totemnet.h +++ b/exec/totemnet.h @@ -1,108 +1,108 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2007 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMNET_H_DEFINED #define TOTEMNET_H_DEFINED #include #include #include #define TOTEMNET_NOFLUSH 0 #define TOTEMNET_FLUSH 1 /* * Totem Network interface - also does encryption/decryption * depends on poll abstraction, POSIX, IPV4 */ /* * Create an instance */ extern int totemnet_initialize ( hdb_handle_t poll_handle, hdb_handle_t *handle, struct totem_config *totem_config, int interface_no, void *context, void (*deliver_fn) ( void *context, void *msg, int msg_len), void (*iface_change_fn) ( void *context, struct totem_ip_address *iface_address)); extern int totemnet_processor_count_set ( hdb_handle_t handle, int processor_count); extern int totemnet_token_send ( hdb_handle_t handle, struct iovec *iovec, - int iov_len); + unsigned int iov_len); extern int totemnet_mcast_flush_send ( hdb_handle_t handle, struct iovec *iovec, unsigned int iov_len); extern int totemnet_mcast_noflush_send ( hdb_handle_t handle, struct iovec *iovec, unsigned int iov_len); extern int totemnet_recv_flush (hdb_handle_t handle); extern int totemnet_send_flush (hdb_handle_t handle); extern int totemnet_iface_check (hdb_handle_t handle); extern int totemnet_finalize (hdb_handle_t handle); extern void totemnet_net_mtu_adjust (struct totem_config *totem_config); extern const char *totemnet_iface_print (hdb_handle_t handle); extern int totemnet_iface_get ( hdb_handle_t handle, struct totem_ip_address *addr); extern int totemnet_token_target_set ( hdb_handle_t handle, struct totem_ip_address *token_target); #endif /* TOTEMNET_H_DEFINED */ diff --git a/exec/totempg.c b/exec/totempg.c index d81943e8..e2ce17f0 100644 --- a/exec/totempg.c +++ b/exec/totempg.c @@ -1,1338 +1,1338 @@ /* * Copyright (c) 2003-2005 MontaVista Software, Inc. * Copyright (c) 2005 OSDL. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * Author: Mark Haverkamp (markh@osdl.org) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * FRAGMENTATION AND PACKING ALGORITHM: * * Assemble the entire message into one buffer * if full fragment * store fragment into lengths list * for each full fragment * multicast fragment * set length and fragment fields of pg message * store remaining multicast into head of fragmentation data and set lens field * * If a message exceeds the maximum packet size allowed by the totem * single ring protocol, the protocol could lose forward progress. * Statically calculating the allowed data amount doesn't work because * the amount of data allowed depends on the number of fragments in * each message. In this implementation, the maximum fragment size * is dynamically calculated for each fragment added to the message. * It is possible for a message to be two bytes short of the maximum * packet size. This occurs when a message or collection of * messages + the mcast header + the lens are two bytes short of the * end of the packet. Since another len field consumes two bytes, the * len field would consume the rest of the packet without room for data. * * One optimization would be to forgo the final len field and determine * it from the size of the udp datagram. Then this condition would no * longer occur. */ /* * ASSEMBLY AND UNPACKING ALGORITHM: * * copy incoming packet into assembly data buffer indexed by current * location of end of fragment * * if not fragmented * deliver all messages in assembly data buffer * else * if msg_count > 1 and fragmented * deliver all messages except last message in assembly data buffer * copy last fragmented section to start of assembly data buffer * else * if msg_count = 1 and fragmented * do nothing * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "totemmrp.h" #include "totemsrp.h" #define min(a,b) (((a) < (b)) ? (a) : (b)) struct totempg_mcast_header { short version; short type; }; /* * totempg_mcast structure * * header: Identify the mcast. * fragmented: Set if this message continues into next message * continuation: Set if this message is a continuation from last message * msg_count Indicates how many packed messages are contained * in the mcast. * Also, the size of each packed message and the messages themselves are * appended to the end of this structure when sent.
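* * On the wire a packed frame therefore looks like (sketch): [totempg_mcast][msg_len[0] .. msg_len[msg_count - 1]][msg 0][msg 1]... where each msg_len is an unsigned short, matching the three iovecs (header, length array, packed data) that the send paths below build.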
*/ struct totempg_mcast { struct totempg_mcast_header header; unsigned char fragmented; unsigned char continuation; unsigned short msg_count; /* * short msg_len[msg_count]; */ /* * data for messages */ }; /* * Maximum packet size for totem pg messages */ #define TOTEMPG_PACKET_SIZE (totempg_totem_config->net_mtu - \ sizeof (struct totempg_mcast)) /* * Local variables used for packing small messages */ static unsigned short mcast_packed_msg_lens[FRAME_SIZE_MAX]; static int mcast_packed_msg_count = 0; static int totempg_reserved = 0; /* * Function and data used to log messages */ static int totempg_log_level_security; static int totempg_log_level_error; static int totempg_log_level_warning; static int totempg_log_level_notice; static int totempg_log_level_debug; static int totempg_subsys_id; static void (*totempg_log_printf) (int subsys_id, const char *function, const char *file, int line, unsigned int level, const char *format, ...) __attribute__((format(printf, 6, 7))); struct totem_config *totempg_totem_config; struct assembly { unsigned int nodeid; unsigned char data[MESSAGE_SIZE_MAX]; int index; unsigned char last_frag_num; struct list_head list; }; static void assembly_deref (struct assembly *assembly); static int callback_token_received_fn (enum totem_callback_token_type type, const void *data); enum throw_away_mode_t { THROW_AWAY_INACTIVE, THROW_AWAY_ACTIVE }; static enum throw_away_mode_t throw_away_mode = THROW_AWAY_INACTIVE; DECLARE_LIST_INIT(assembly_list_inuse); DECLARE_LIST_INIT(assembly_list_free); /* * Staging buffer for packed messages. Messages are staged in this buffer * before sending. Multiple messages may fit which cuts down on the * number of mcasts sent. If a message doesn't completely fit, then * the mcast header has a fragment bit set that says that there are more * data to follow. fragment_size is an index into the buffer. It indicates * the size of message data and where to place new message data. * fragment_continuation indicates whether the first packed message in * the buffer is a continuation of a previously packed fragment. */ static unsigned char *fragmentation_data; static int fragment_size = 0; static int fragment_continuation = 0; static struct iovec iov_delv; static unsigned int totempg_max_handle = 0; struct totempg_group_instance { void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required); void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); struct totempg_group *groups; int groups_cnt; }; static struct hdb_handle_database totempg_groups_instance_database = { .handle_count = 0, .handles = 0, .iterator = 0, .mutex = PTHREAD_MUTEX_INITIALIZER }; static unsigned char next_fragment = 1; static pthread_mutex_t totempg_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t callback_token_mutex = PTHREAD_MUTEX_INITIALIZER; static pthread_mutex_t mcast_msg_mutex = PTHREAD_MUTEX_INITIALIZER; #define log_printf(level, format, args...)
\ do { \ totempg_log_printf (totempg_subsys_id, __FUNCTION__, \ __FILE__, __LINE__, level, format, ##args); \ } while (0); static int msg_count_send_ok (int msg_count); static int byte_count_send_ok (int byte_count); static struct assembly *assembly_ref (unsigned int nodeid) { struct assembly *assembly; struct list_head *list; /* * Search inuse list for node id and return assembly buffer if found */ for (list = assembly_list_inuse.next; list != &assembly_list_inuse; list = list->next) { assembly = list_entry (list, struct assembly, list); if (nodeid == assembly->nodeid) { return (assembly); } } /* * Nothing found in inuse list, get one from free list if available */ if (list_empty (&assembly_list_free) == 0) { assembly = list_entry (assembly_list_free.next, struct assembly, list); list_del (&assembly->list); list_add (&assembly->list, &assembly_list_inuse); assembly->nodeid = nodeid; return (assembly); } /* * Nothing available in inuse or free list, so allocate a new one */ assembly = malloc (sizeof (struct assembly)); /* * TODO handle memory allocation failure here */ assert (assembly); memset (assembly, 0, sizeof (struct assembly)); assembly->nodeid = nodeid; list_init (&assembly->list); list_add (&assembly->list, &assembly_list_inuse); return (assembly); } static void assembly_deref (struct assembly *assembly) { list_del (&assembly->list); list_add (&assembly->list, &assembly_list_free); } static inline void app_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { int i; struct totempg_group_instance *instance; unsigned int res; for (i = 0; i <= totempg_max_handle; i++) { res = hdb_handle_get (&totempg_groups_instance_database, hdb_nocheck_convert (i), (void *)&instance); if (res == 0) { if (instance->confchg_fn) { instance->confchg_fn ( configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } hdb_handle_put (&totempg_groups_instance_database, hdb_nocheck_convert (i)); } } } static inline void group_endian_convert ( struct iovec *iovec) { unsigned short *group_len; int i; struct iovec iovec_aligned = { NULL, 0 }; struct iovec *iovec_swab; /* * Align data structure for sparc and ia64 */ if ((size_t)iovec->iov_base % 4 != 0) { iovec_aligned.iov_base = alloca(iovec->iov_len); memcpy(iovec_aligned.iov_base, iovec->iov_base, iovec->iov_len); iovec_aligned.iov_len = iovec->iov_len; iovec_swab = &iovec_aligned; } else { iovec_swab = iovec; } group_len = (unsigned short *)iovec_swab->iov_base; group_len[0] = swab16(group_len[0]); for (i = 1; i < group_len[0] + 1; i++) { group_len[i] = swab16(group_len[i]); } if (iovec_swab == &iovec_aligned) { memcpy(iovec->iov_base, iovec_aligned.iov_base, iovec->iov_len); } } static inline int group_matches ( struct iovec *iovec, unsigned int iov_len, struct totempg_group *groups_b, unsigned int group_b_cnt, unsigned int *adjust_iovec) { unsigned short *group_len; char *group_name; int i; int j; struct iovec iovec_aligned = { NULL, 0 }; assert (iov_len == 1); /* * Align data structure for sparc and ia64 */ if ((size_t)iovec->iov_base % 4 != 0) { iovec_aligned.iov_base = alloca(iovec->iov_len); memcpy(iovec_aligned.iov_base, iovec->iov_base, iovec->iov_len); iovec_aligned.iov_len = iovec->iov_len; iovec = &iovec_aligned; } group_len = (unsigned short
*)iovec->iov_base; group_name = ((char *)iovec->iov_base) + sizeof (unsigned short) * (group_len[0] + 1); /* * Calculate amount to adjust the iovec by before delivering to app */ *adjust_iovec = sizeof (unsigned short) * (group_len[0] + 1); for (i = 1; i < group_len[0] + 1; i++) { *adjust_iovec += group_len[i]; } /* * Determine if this message should be delivered to this instance */ for (i = 1; i < group_len[0] + 1; i++) { for (j = 0; j < group_b_cnt; j++) { if ((group_len[i] == groups_b[j].group_len) && (memcmp (groups_b[j].group, group_name, group_len[i]) == 0)) { return (1); } } group_name += group_len[i]; } return (0); } static inline void app_deliver_fn ( unsigned int nodeid, struct iovec *iovec, unsigned int iov_len, int endian_conversion_required) { int i; struct totempg_group_instance *instance; struct iovec stripped_iovec; unsigned int adjust_iovec; unsigned int res; struct iovec aligned_iovec = { NULL, 0 }; if (endian_conversion_required) { group_endian_convert (iovec); } /* * Align data structure for sparc and ia64 */ aligned_iovec.iov_base = alloca(iovec->iov_len); aligned_iovec.iov_len = iovec->iov_len; memcpy(aligned_iovec.iov_base, iovec->iov_base, iovec->iov_len); iovec = &aligned_iovec; for (i = 0; i <= totempg_max_handle; i++) { res = hdb_handle_get (&totempg_groups_instance_database, hdb_nocheck_convert (i), (void *)&instance); if (res == 0) { assert (iov_len == 1); if (group_matches (iovec, iov_len, instance->groups, instance->groups_cnt, &adjust_iovec)) { stripped_iovec.iov_len = iovec->iov_len - adjust_iovec; stripped_iovec.iov_base = (char *)iovec->iov_base + adjust_iovec; /* * Align data structure for sparc and ia64 */ if (((size_t)iovec->iov_base + adjust_iovec) % 4 != 0) { /* * Deal with misalignment */ stripped_iovec.iov_base = alloca (stripped_iovec.iov_len); memcpy (stripped_iovec.iov_base, (char *)iovec->iov_base + adjust_iovec, stripped_iovec.iov_len); } instance->deliver_fn ( nodeid, &stripped_iovec, iov_len, endian_conversion_required); } hdb_handle_put (&totempg_groups_instance_database, hdb_nocheck_convert(i)); } } } static void totempg_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { // TODO optimize this app_confchg_fn (configuration_type, member_list, member_list_entries, left_list, left_list_entries, joined_list, joined_list_entries, ring_id); } static void totempg_deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required) { struct totempg_mcast *mcast; unsigned short *msg_lens; int i; struct assembly *assembly; char header[FRAME_SIZE_MAX]; int h_index; int a_i = 0; int msg_count; int continuation; int start; assembly = assembly_ref (nodeid); assert (assembly); /* * Assemble the header into one block of data and * assemble the packet contents into one block of data to simplify delivery */ if (iov_len == 1) { /* * This message originated from external processor * because there is only one iovec for the full msg.
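* Locally originated messages instead arrive with the original iovec list intact (header, length array, then the packed data), which the else branch below merges into the same assembly-buffer layout.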
*/ char *data; int datasize; mcast = (struct totempg_mcast *)iovec[0].iov_base; if (endian_conversion_required) { mcast->msg_count = swab16 (mcast->msg_count); } msg_count = mcast->msg_count; datasize = sizeof (struct totempg_mcast) + msg_count * sizeof (unsigned short); memcpy (header, iovec[0].iov_base, datasize); assert(iovec); data = iovec[0].iov_base; msg_lens = (unsigned short *) (header + sizeof (struct totempg_mcast)); if (endian_conversion_required) { for (i = 0; i < mcast->msg_count; i++) { msg_lens[i] = swab16 (msg_lens[i]); } } memcpy (&assembly->data[assembly->index], &data[datasize], iovec[0].iov_len - datasize); } else { /* * The message originated from local processor * because there is more than one iovec for the full msg. */ h_index = 0; for (i = 0; i < 2; i++) { memcpy (&header[h_index], iovec[i].iov_base, iovec[i].iov_len); h_index += iovec[i].iov_len; } mcast = (struct totempg_mcast *)header; // TODO make sure we are using a copy of mcast not the actual data itself msg_lens = (unsigned short *) (header + sizeof (struct totempg_mcast)); for (i = 2; i < iov_len; i++) { a_i = assembly->index; assert (iovec[i].iov_len + a_i <= MESSAGE_SIZE_MAX); memcpy (&assembly->data[a_i], iovec[i].iov_base, iovec[i].iov_len); a_i += msg_lens[i - 2]; } iov_len -= 2; } /* * If the last message in the buffer is a fragment, then we * can't deliver it. We'll first deliver the full messages * then adjust the assembly buffer so we can add the rest of the * fragment when it arrives. */ msg_count = mcast->fragmented ? mcast->msg_count - 1 : mcast->msg_count; continuation = mcast->continuation; iov_delv.iov_base = &assembly->data[0]; iov_delv.iov_len = assembly->index + msg_lens[0]; /* * Make sure that if this message is a continuation, that it * matches the sequence number of the previous fragment. * Also, if the first packed message is a continuation * of a previous message, but the assembly buffer * is empty, then we need to discard it since we can't * assemble a complete message. Likewise, if this message isn't a * continuation and the assembly buffer is empty, we have to discard * the continued message.
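* The throw_away_mode state below implements this: once a mismatch is seen, packed blocks are discarded until the next fragment boundary, after which normal delivery resumes.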
*/ start = 0; if (throw_away_mode == THROW_AWAY_ACTIVE) { /* Throw away the first msg block */ if (mcast->fragmented == 0 || mcast->fragmented == 1) { throw_away_mode = THROW_AWAY_INACTIVE; assembly->index += msg_lens[0]; iov_delv.iov_base = &assembly->data[assembly->index]; iov_delv.iov_len = msg_lens[1]; start = 1; } } else if (throw_away_mode == THROW_AWAY_INACTIVE) { if (continuation == assembly->last_frag_num) { assembly->last_frag_num = mcast->fragmented; for (i = start; i < msg_count; i++) { app_deliver_fn(nodeid, &iov_delv, 1, endian_conversion_required); assembly->index += msg_lens[i]; iov_delv.iov_base = &assembly->data[assembly->index]; if (i < (msg_count - 1)) { iov_delv.iov_len = msg_lens[i + 1]; } } } else { throw_away_mode = THROW_AWAY_ACTIVE; } } if (mcast->fragmented == 0) { /* * End of messages, dereference assembly struct */ assembly->last_frag_num = 0; assembly->index = 0; assembly_deref (assembly); } else { /* * Message is fragmented, keep around assembly list */ if (mcast->msg_count > 1) { memmove (&assembly->data[0], &assembly->data[assembly->index], msg_lens[msg_count]); assembly->index = 0; } assembly->index += msg_lens[msg_count]; } } /* * Totem Process Group Abstraction * depends on poll abstraction, POSIX, IPV4 */ void *callback_token_received_handle; int callback_token_received_fn (enum totem_callback_token_type type, const void *data) { struct totempg_mcast mcast; struct iovec iovecs[3]; int res; pthread_mutex_lock (&mcast_msg_mutex); if (mcast_packed_msg_count == 0) { pthread_mutex_unlock (&mcast_msg_mutex); return (0); } if (totemmrp_avail() == 0) { pthread_mutex_unlock (&mcast_msg_mutex); return (0); } mcast.fragmented = 0; /* * Was the first message in this buffer a continuation of a * fragmented message? */ mcast.continuation = fragment_continuation; fragment_continuation = 0; mcast.msg_count = mcast_packed_msg_count; iovecs[0].iov_base = &mcast; iovecs[0].iov_len = sizeof (struct totempg_mcast); iovecs[1].iov_base = mcast_packed_msg_lens; iovecs[1].iov_len = mcast_packed_msg_count * sizeof (unsigned short); iovecs[2].iov_base = &fragmentation_data[0]; iovecs[2].iov_len = fragment_size; res = totemmrp_mcast (iovecs, 3, 0); mcast_packed_msg_count = 0; fragment_size = 0; pthread_mutex_unlock (&mcast_msg_mutex); return (0); } /* * Initialize the totem process group abstraction */ int totempg_initialize ( hdb_handle_t poll_handle, struct totem_config *totem_config) { int res; totempg_totem_config = totem_config; totempg_log_level_security = totem_config->totem_logging_configuration.log_level_security; totempg_log_level_error = totem_config->totem_logging_configuration.log_level_error; totempg_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; totempg_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; totempg_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; totempg_log_printf = totem_config->totem_logging_configuration.log_printf; totempg_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; fragmentation_data = malloc (TOTEMPG_PACKET_SIZE); if (fragmentation_data == 0) { return (-1); } res = totemmrp_initialize ( poll_handle, totem_config, totempg_deliver_fn, totempg_confchg_fn); totemmrp_callback_token_create ( &callback_token_received_handle, TOTEM_CALLBACK_TOKEN_RECEIVED, 0, callback_token_received_fn, 0); totemsrp_net_mtu_adjust (totem_config); return (res); } void totempg_finalize (void) { pthread_mutex_lock (&totempg_mutex); totemmrp_finalize (); 
pthread_mutex_unlock (&totempg_mutex); } /* * Multicast a message */ static int mcast_msg ( struct iovec *iovec_in, - int iov_len, + unsigned int iov_len, int guarantee) { int res = 0; struct totempg_mcast mcast; struct iovec iovecs[3]; struct iovec iovec[64]; int i; int dest, src; int max_packet_size = 0; int copy_len = 0; int copy_base = 0; int total_size = 0; pthread_mutex_lock (&mcast_msg_mutex); totemmrp_new_msg_signal (); /* * Remove zero length iovectors from the list */ assert (iov_len < 64); for (dest = 0, src = 0; src < iov_len; src++) { if (iovec_in[src].iov_len) { memcpy (&iovec[dest++], &iovec_in[src], sizeof (struct iovec)); } } iov_len = dest; max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof (unsigned short) * (mcast_packed_msg_count + 1)); mcast_packed_msg_lens[mcast_packed_msg_count] = 0; /* * Check if we would overwrite new message queue */ for (i = 0; i < iov_len; i++) { total_size += iovec[i].iov_len; } if (byte_count_send_ok (total_size + sizeof(unsigned short) * (mcast_packed_msg_count+1)) == 0) { pthread_mutex_unlock (&mcast_msg_mutex); return(-1); } for (i = 0; i < iov_len; ) { mcast.fragmented = 0; mcast.continuation = fragment_continuation; copy_len = iovec[i].iov_len - copy_base; /* * If it all fits with room left over, copy it in. * We need to leave at least sizeof(short) + 1 bytes in the * fragment_buffer on exit so that max_packet_size + fragment_size * doesn't exceed the size of the fragment_buffer on the next call. */ if ((copy_len + fragment_size) < (max_packet_size - sizeof (unsigned short))) { memcpy (&fragmentation_data[fragment_size], (char *)iovec[i].iov_base + copy_base, copy_len); fragment_size += copy_len; mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len; next_fragment = 1; copy_len = 0; copy_base = 0; i++; continue; /* * If it just fits or is too big, then send out what fits. */ } else { unsigned char *data_ptr; copy_len = min(copy_len, max_packet_size - fragment_size); if( copy_len == max_packet_size ) data_ptr = (unsigned char *)iovec[i].iov_base + copy_base; else { data_ptr = fragmentation_data; memcpy (&fragmentation_data[fragment_size], (unsigned char *)iovec[i].iov_base + copy_base, copy_len); } memcpy (&fragmentation_data[fragment_size], (unsigned char *)iovec[i].iov_base + copy_base, copy_len); mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len; /* * if we're not on the last iovec or the iovec is too large to * fit, then indicate a fragment. This also means that the next * message will have the continuation of this one. */ if ((i < (iov_len - 1)) || ((copy_base + copy_len) < iovec[i].iov_len)) { if (!next_fragment) { next_fragment++; } fragment_continuation = next_fragment; mcast.fragmented = next_fragment++; assert(fragment_continuation != 0); assert(mcast.fragmented != 0); } else { fragment_continuation = 0; } /* * assemble the message and send it */ mcast.msg_count = ++mcast_packed_msg_count; iovecs[0].iov_base = &mcast; iovecs[0].iov_len = sizeof(struct totempg_mcast); iovecs[1].iov_base = mcast_packed_msg_lens; iovecs[1].iov_len = mcast_packed_msg_count * sizeof(unsigned short); iovecs[2].iov_base = data_ptr; iovecs[2].iov_len = max_packet_size; assert (totemmrp_avail() > 0); res = totemmrp_mcast (iovecs, 3, guarantee); /* * Recalculate counts and indexes for the next. 
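* After a frame is sent, the packed-length array, the packed message count, and the fragmentation-buffer offset all restart from zero, and max_packet_size is recomputed leaving room for a single length field.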
*/ mcast_packed_msg_lens[0] = 0; mcast_packed_msg_count = 0; fragment_size = 0; max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof(unsigned short)); /* * If the iovec all fit, go to the next iovec */ if ((copy_base + copy_len) == iovec[i].iov_len) { copy_len = 0; copy_base = 0; i++; /* * Continue with the rest of the current iovec. */ } else { copy_base += copy_len; } } } /* * Bump only if we added message data. This may be zero if * the last buffer just fit into the fragmentation_data buffer * and we were at the last iovec. */ if (mcast_packed_msg_lens[mcast_packed_msg_count]) { mcast_packed_msg_count++; } pthread_mutex_unlock (&mcast_msg_mutex); return (res); } /* * Determine if a message of msg_size could be queued */ static int msg_count_send_ok ( int msg_count) { int avail = 0; avail = totemmrp_avail () - totempg_reserved - 1; return (avail > msg_count); } static int byte_count_send_ok ( int byte_count) { unsigned int msg_count = 0; int avail = 0; avail = totemmrp_avail () - 1; msg_count = (byte_count / (totempg_totem_config->net_mtu - 25)) + 1; return (avail > msg_count); } static int send_reserve ( int msg_size) { unsigned int msg_count = 0; msg_count = (msg_size / (totempg_totem_config->net_mtu - 25)) + 1; totempg_reserved += msg_count; return (msg_count); } static void send_release ( int msg_count) { totempg_reserved -= msg_count; } int totempg_callback_token_create ( void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { unsigned int res; pthread_mutex_lock (&callback_token_mutex); res = totemmrp_callback_token_create (handle_out, type, delete, callback_fn, data); pthread_mutex_unlock (&callback_token_mutex); return (res); } void totempg_callback_token_destroy ( void *handle_out) { pthread_mutex_lock (&callback_token_mutex); totemmrp_callback_token_destroy (handle_out); pthread_mutex_unlock (&callback_token_mutex); } /* * vi: set autoindent tabstop=4 shiftwidth=4 : */ int totempg_groups_initialize ( hdb_handle_t *handle, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)) { struct totempg_group_instance *instance; unsigned int res; pthread_mutex_lock (&totempg_mutex); res = hdb_handle_create (&totempg_groups_instance_database, sizeof (struct totempg_group_instance), handle); if (res != 0) { goto error_exit; } if (*handle > totempg_max_handle) { totempg_max_handle = *handle; } res = hdb_handle_get (&totempg_groups_instance_database, *handle, (void *)&instance); if (res != 0) { goto error_destroy; } instance->deliver_fn = deliver_fn; instance->confchg_fn = confchg_fn; instance->groups = 0; instance->groups_cnt = 0; hdb_handle_put (&totempg_groups_instance_database, *handle); pthread_mutex_unlock (&totempg_mutex); return (0); error_destroy: hdb_handle_destroy (&totempg_groups_instance_database, *handle); error_exit: pthread_mutex_unlock (&totempg_mutex); return (-1); } int totempg_groups_join ( hdb_handle_t handle, const struct totempg_group *groups, size_t group_cnt) { struct totempg_group_instance *instance; struct totempg_group *new_groups; unsigned int res; pthread_mutex_lock (&totempg_mutex); res = hdb_handle_get 
(&totempg_groups_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } new_groups = realloc (instance->groups, sizeof (struct totempg_group) * (instance->groups_cnt + group_cnt)); if (new_groups == 0) { res = ENOMEM; goto error_exit; } memcpy (&new_groups[instance->groups_cnt], groups, group_cnt * sizeof (struct totempg_group)); instance->groups = new_groups; instance->groups_cnt = instance->groups_cnt + group_cnt; hdb_handle_put (&totempg_groups_instance_database, handle); error_exit: pthread_mutex_unlock (&totempg_mutex); return (res); } int totempg_groups_leave ( hdb_handle_t handle, const struct totempg_group *groups, size_t group_cnt) { struct totempg_group_instance *instance; unsigned int res; pthread_mutex_lock (&totempg_mutex); res = hdb_handle_get (&totempg_groups_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } hdb_handle_put (&totempg_groups_instance_database, handle); error_exit: pthread_mutex_unlock (&totempg_mutex); return (res); } #define MAX_IOVECS_FROM_APP 32 #define MAX_GROUPS_PER_MSG 32 int totempg_groups_mcast_joined ( hdb_handle_t handle, const struct iovec *iovec, - int iov_len, + unsigned int iov_len, int guarantee) { struct totempg_group_instance *instance; unsigned short group_len[MAX_GROUPS_PER_MSG + 1]; struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP]; int i; unsigned int res; pthread_mutex_lock (&totempg_mutex); res = hdb_handle_get (&totempg_groups_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } /* * Build group_len structure and the iovec_mcast structure */ group_len[0] = instance->groups_cnt; for (i = 0; i < instance->groups_cnt; i++) { group_len[i + 1] = instance->groups[i].group_len; iovec_mcast[i + 1].iov_len = instance->groups[i].group_len; iovec_mcast[i + 1].iov_base = (void *) instance->groups[i].group; } iovec_mcast[0].iov_len = (instance->groups_cnt + 1) * sizeof (unsigned short); iovec_mcast[0].iov_base = group_len; for (i = 0; i < iov_len; i++) { iovec_mcast[i + instance->groups_cnt + 1].iov_len = iovec[i].iov_len; iovec_mcast[i + instance->groups_cnt + 1].iov_base = iovec[i].iov_base; } res = mcast_msg (iovec_mcast, iov_len + instance->groups_cnt + 1, guarantee); hdb_handle_put (&totempg_groups_instance_database, handle); error_exit: pthread_mutex_unlock (&totempg_mutex); return (res); } int totempg_groups_joined_reserve ( hdb_handle_t handle, const struct iovec *iovec, - int iov_len) + unsigned int iov_len) { struct totempg_group_instance *instance; unsigned int size = 0; unsigned int i; unsigned int res; unsigned int reserved = 0; pthread_mutex_lock (&totempg_mutex); pthread_mutex_lock (&mcast_msg_mutex); res = hdb_handle_get (&totempg_groups_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } for (i = 0; i < instance->groups_cnt; i++) { size += instance->groups[i].group_len; } for (i = 0; i < iov_len; i++) { size += iovec[i].iov_len; } reserved = send_reserve (size); if (msg_count_send_ok (reserved) == 0) { send_release (reserved); reserved = 0; } hdb_handle_put (&totempg_groups_instance_database, handle); error_exit: pthread_mutex_unlock (&mcast_msg_mutex); pthread_mutex_unlock (&totempg_mutex); return (reserved); } int totempg_groups_joined_release (int msg_count) { pthread_mutex_lock (&totempg_mutex); pthread_mutex_lock (&mcast_msg_mutex); send_release (msg_count); pthread_mutex_unlock (&mcast_msg_mutex); pthread_mutex_unlock (&totempg_mutex); return 0; } int totempg_groups_mcast_groups (
hdb_handle_t handle, int guarantee, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, size_t iov_len) { struct totempg_group_instance *instance; unsigned short group_len[MAX_GROUPS_PER_MSG + 1]; struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP]; int i; unsigned int res; pthread_mutex_lock (&totempg_mutex); res = hdb_handle_get (&totempg_groups_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } /* * Build group_len structure and the iovec_mcast structure */ group_len[0] = groups_cnt; for (i = 0; i < groups_cnt; i++) { group_len[i + 1] = groups[i].group_len; iovec_mcast[i + 1].iov_len = groups[i].group_len; iovec_mcast[i + 1].iov_base = (void *) groups[i].group; } iovec_mcast[0].iov_len = (groups_cnt + 1) * sizeof (unsigned short); iovec_mcast[0].iov_base = group_len; for (i = 0; i < iov_len; i++) { iovec_mcast[i + groups_cnt + 1].iov_len = iovec[i].iov_len; iovec_mcast[i + groups_cnt + 1].iov_base = iovec[i].iov_base; } res = mcast_msg (iovec_mcast, iov_len + groups_cnt + 1, guarantee); hdb_handle_put (&totempg_groups_instance_database, handle); error_exit: pthread_mutex_unlock (&totempg_mutex); return (res); } /* * Returns -1 if error, 0 if can't send, 1 if can send the message */ int totempg_groups_send_ok_groups ( hdb_handle_t handle, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, size_t iov_len) { struct totempg_group_instance *instance; unsigned int size = 0; unsigned int i; unsigned int res; pthread_mutex_lock (&totempg_mutex); res = hdb_handle_get (&totempg_groups_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } for (i = 0; i < groups_cnt; i++) { size += groups[i].group_len; } for (i = 0; i < iov_len; i++) { size += iovec[i].iov_len; } res = msg_count_send_ok (size); hdb_handle_put (&totempg_groups_instance_database, handle); error_exit: pthread_mutex_unlock (&totempg_mutex); return (res); } int totempg_ifaces_get ( unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count) { int res; res = totemmrp_ifaces_get ( nodeid, interfaces, status, iface_count); return (res); } int totempg_ring_reenable (void) { int res; res = totemmrp_ring_reenable (); return (res); } const char *totempg_ifaces_print (unsigned int nodeid) { static char iface_string[256 * INTERFACE_MAX]; char one_iface[64]; struct totem_ip_address interfaces[INTERFACE_MAX]; char **status; unsigned int iface_count; unsigned int i; int res; iface_string[0] = '\0'; res = totempg_ifaces_get (nodeid, interfaces, &status, &iface_count); if (res == -1) { return ("no interface found for nodeid"); } for (i = 0; i < iface_count; i++) { sprintf (one_iface, "r(%d) ip(%s) ", i, totemip_print (&interfaces[i])); strcat (iface_string, one_iface); } return (iface_string); } unsigned int totempg_my_nodeid_get (void) { return (totemmrp_my_nodeid_get()); } int totempg_my_family_get (void) { return (totemmrp_my_family_get()); } diff --git a/exec/totemsrp.c b/exec/totemsrp.c index 4c6acddf..db08edba 100644 --- a/exec/totemsrp.c +++ b/exec/totemsrp.c @@ -1,4210 +1,4210 @@ /* * Copyright (c) 2003-2006 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. 
* * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * The first version of this code was based upon Yair Amir's PhD thesis: * http://www.cs.jhu.edu/~yairamir/phd.ps (ch4,5). * * The current version of totemsrp implements the Totem protocol specified in: * http://citeseer.ist.psu.edu/amir95totem.html * * The deviations from the above published protocols are: * - encryption of message contents with SOBER128 * - authentication of message contents with SHA1/HMAC * - token hold mode where token doesn't rotate on unused ring - reduces cpu * usage on 1.6ghz xeon from 35% to less than .1% as measured by top */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "totemsrp.h" #include "totemrrp.h" #include "wthread.h" #include "crypto.h" #define LOCALHOST_IP inet_addr("127.0.0.1") #define QUEUE_RTR_ITEMS_SIZE_MAX 256 /* allow 256 retransmit items */ #define RETRANS_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 messages to be queued */ #define RECEIVED_MESSAGE_QUEUE_SIZE_MAX 500 /* allow 500 messages to be queued */ #define MAXIOVS 5 #define RETRANSMIT_ENTRIES_MAX 30 #define TOKEN_SIZE_MAX 64000 /* bytes */ /* * Rollover handling: * SEQNO_START_MSG is the starting sequence number after a new configuration * This should remain zero, unless testing overflow in which case * 0x7ffff000 and 0xfffff000 are good starting values. * * SEQNO_START_TOKEN is the starting sequence number after a new configuration * for a token. This should remain zero, unless testing overflow in which * case 0x7fffff00 or 0xffffff00 are good starting values.
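* 
* A worked example of the rollover these values exercise: starting
* message sequence numbers at 0xfffff000 means the 32-bit counter
* wraps after roughly 4096 multicasts, so ordering with a plain '<'
* would sort post-wrap sequence numbers before pre-wrap ones.  The
* sort queue code therefore compares sequence numbers circularly
* (sq_lt_compare), treating a as older than b roughly when the
* unsigned difference b - a is less than half of the 32-bit space.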
*/ #define SEQNO_START_MSG 0x0 #define SEQNO_START_TOKEN 0x0 /* * These can be used to test different rollover points * #define SEQNO_START_MSG 0xfffffe00 * #define SEQNO_START_TOKEN 0xfffffe00 */ /* * These can be used to test the error recovery algorithms * #define TEST_DROP_ORF_TOKEN_PERCENTAGE 30 * #define TEST_DROP_COMMIT_TOKEN_PERCENTAGE 30 * #define TEST_DROP_MCAST_PERCENTAGE 50 * #define TEST_RECOVERY_MSG_COUNT 300 */ /* * we compare incoming messages to determine if their endian is * different - if so convert them * * do not change */ #define ENDIAN_LOCAL 0xff22 enum message_type { MESSAGE_TYPE_ORF_TOKEN = 0, /* Ordering, Reliability, Flow (ORF) control Token */ MESSAGE_TYPE_MCAST = 1, /* ring ordered multicast message */ MESSAGE_TYPE_MEMB_MERGE_DETECT = 2, /* merge rings if there are available rings */ MESSAGE_TYPE_MEMB_JOIN = 3, /* membership join message */ MESSAGE_TYPE_MEMB_COMMIT_TOKEN = 4, /* membership commit token */ MESSAGE_TYPE_TOKEN_HOLD_CANCEL = 5, /* cancel the holding of the token */ }; enum encapsulation_type { MESSAGE_ENCAPSULATED = 1, MESSAGE_NOT_ENCAPSULATED = 2 }; /* * New membership algorithm local variables */ struct srp_addr { struct totem_ip_address addr[INTERFACE_MAX]; }; struct consensus_list_item { struct srp_addr addr; int set; }; struct token_callback_instance { struct list_head list; int (*callback_fn) (enum totem_callback_token_type type, const void *); enum totem_callback_token_type callback_type; int delete; void *data; }; struct totemsrp_socket { int mcast; int token; }; struct message_header { char type; char encapsulated; unsigned short endian_detector; unsigned int nodeid; } __attribute__((packed)); struct mcast { struct message_header header; struct srp_addr system_from; unsigned int seq; int this_seqno; struct memb_ring_id ring_id; unsigned int node_id; int guarantee; } __attribute__((packed)); /* * MTU - multicast message header - IP header - UDP header * * On lossy switches, making use of the DF UDP flag can lead to loss of * forward progress. So the packets must be fragmented by a higher layer. * * This layer can only handle packets of MTU size.
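* 
* A worked example, assuming a typical ethernet FRAME_SIZE_MAX of
* 1500 bytes: FRAGMENT_SIZE = 1500 - sizeof (struct mcast) - 20 - 8,
* where 20 bytes is the IPv4 header and 8 bytes the UDP header,
* leaving a bit under 1.4 kilobytes of payload per datagram; anything
* larger must arrive here already fragmented by totempg above.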
*/ #define FRAGMENT_SIZE (FRAME_SIZE_MAX - sizeof (struct mcast) - 20 - 8) struct rtr_item { struct memb_ring_id ring_id; unsigned int seq; }__attribute__((packed)); struct orf_token { struct message_header header; unsigned int seq; unsigned int token_seq; unsigned int aru; unsigned int aru_addr; struct memb_ring_id ring_id; unsigned int backlog; unsigned int fcc; int retrans_flg; int rtr_list_entries; struct rtr_item rtr_list[0]; }__attribute__((packed)); struct memb_join { struct message_header header; struct srp_addr system_from; unsigned int proc_list_entries; unsigned int failed_list_entries; unsigned long long ring_seq; unsigned char end_of_memb_join[0]; /* * These parts of the data structure are dynamic: * struct srp_addr proc_list[]; * struct srp_addr failed_list[]; */ } __attribute__((packed)); struct memb_merge_detect { struct message_header header; struct srp_addr system_from; struct memb_ring_id ring_id; } __attribute__((packed)); struct token_hold_cancel { struct message_header header; struct memb_ring_id ring_id; } __attribute__((packed)); struct memb_commit_token_memb_entry { struct memb_ring_id ring_id; unsigned int aru; unsigned int high_delivered; unsigned int received_flg; }__attribute__((packed)); struct memb_commit_token { struct message_header header; unsigned int token_seq; struct memb_ring_id ring_id; unsigned int retrans_flg; int memb_index; int addr_entries; unsigned char end_of_commit_token[0]; /* * These parts of the data structure are dynamic: * * struct srp_addr addr[PROCESSOR_COUNT_MAX]; * struct memb_commit_token_memb_entry memb_list[PROCESSOR_COUNT_MAX]; */ }__attribute__((packed)); struct message_item { struct mcast *mcast; struct iovec iovec[MAXIOVS]; - int iov_len; + unsigned int iov_len; }; struct sort_queue_item { struct iovec iovec[MAXIOVS]; - int iov_len; + unsigned int iov_len; }; struct orf_token_mcast_thread_state { char iobuf[9000]; prng_state prng_state; }; enum memb_state { MEMB_STATE_OPERATIONAL = 1, MEMB_STATE_GATHER = 2, MEMB_STATE_COMMIT = 3, MEMB_STATE_RECOVERY = 4 }; struct totemsrp_instance { int iface_changes; /* * Flow control mcasts and remcasts on last and current orf_token */ int fcc_remcast_last; int fcc_mcast_last; int fcc_remcast_current; struct consensus_list_item consensus_list[PROCESSOR_COUNT_MAX]; int consensus_list_entries; struct srp_addr my_id; struct srp_addr my_proc_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_failed_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_new_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_trans_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_deliver_memb_list[PROCESSOR_COUNT_MAX]; struct srp_addr my_left_memb_list[PROCESSOR_COUNT_MAX]; int my_proc_list_entries; int my_failed_list_entries; int my_new_memb_entries; int my_trans_memb_entries; int my_memb_entries; int my_deliver_memb_entries; int my_left_memb_entries; struct memb_ring_id my_ring_id; struct memb_ring_id my_old_ring_id; int my_aru_count; int my_merge_detect_timeout_outstanding; unsigned int my_last_aru; int my_seq_unchanged; int my_received_flg; unsigned int my_high_seq_received; unsigned int my_install_seq; int my_rotation_counter; int my_set_retrans_flg; int my_retrans_flg_count; unsigned int my_high_ring_delivered; int heartbeat_timeout; /* * Queues used to order, deliver, and recover messages */ struct queue new_message_queue; struct queue retrans_message_queue; struct sq regular_sort_queue; struct sq recovery_sort_queue; /* * Received up to and including */ unsigned int 
my_aru; unsigned int my_high_delivered; struct list_head token_callback_received_listhead; struct list_head token_callback_sent_listhead; char *orf_token_retransmit[TOKEN_SIZE_MAX]; int orf_token_retransmit_size; unsigned int my_token_seq; /* * Timers */ poll_timer_handle timer_orf_token_timeout; poll_timer_handle timer_orf_token_retransmit_timeout; poll_timer_handle timer_orf_token_hold_retransmit_timeout; poll_timer_handle timer_merge_detect_timeout; poll_timer_handle memb_timer_state_gather_join_timeout; poll_timer_handle memb_timer_state_gather_consensus_timeout; poll_timer_handle memb_timer_state_commit_timeout; poll_timer_handle timer_heartbeat_timeout; /* * Function and data used to log messages */ int totemsrp_log_level_security; int totemsrp_log_level_error; int totemsrp_log_level_warning; int totemsrp_log_level_notice; int totemsrp_log_level_debug; int totemsrp_subsys_id; void (*totemsrp_log_printf) (int subsys, const char *function, const char *file, int line, unsigned int level, const char *format, ...)__attribute__((format(printf, 6, 7)));; enum memb_state memb_state; //TODO struct srp_addr next_memb; char iov_buffer[FRAME_SIZE_MAX]; struct iovec totemsrp_iov_recv; hdb_handle_t totemsrp_poll_handle; /* * Function called when new message received */ - int (*totemsrp_recv) (char *group, struct iovec *iovec, int iov_len); + int (*totemsrp_recv) (char *group, struct iovec *iovec, unsigned int iov_len); struct totem_ip_address mcast_address; void (*totemsrp_deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required); void (*totemsrp_confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); int global_seqno; int my_token_held; unsigned long long token_ring_id_seq; unsigned int last_released; unsigned int set_aru; int old_ring_state_saved; int old_ring_state_aru; unsigned int old_ring_state_high_seq_received; int ring_saved; unsigned int my_last_seq; struct timeval tv_old; hdb_handle_t totemrrp_handle; struct totem_config *totem_config; unsigned int use_heartbeat; unsigned int my_trc; unsigned int my_pbl; unsigned int my_cbl; }; struct message_handlers { int count; int (*handler_functions[6]) ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); }; /* * forward decls */ static int message_handler_orf_token ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); static int message_handler_mcast ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); static int message_handler_memb_join ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); static int message_handler_token_hold_cancel ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed); static void totemsrp_instance_initialize (struct totemsrp_instance *instance); static unsigned int main_msgs_missing (void); static void main_token_seqid_get ( void *msg, unsigned int *seqid, unsigned int 
*token_is); static void srp_addr_copy (struct srp_addr *dest, struct srp_addr *src); static void srp_addr_to_nodeid ( unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries); static int srp_addr_equal (struct srp_addr *a, struct srp_addr *b); static void memb_ring_id_create_or_load (struct totemsrp_instance *, struct memb_ring_id *); static void token_callbacks_execute (struct totemsrp_instance *instance, enum totem_callback_token_type type); static void memb_state_gather_enter (struct totemsrp_instance *instance, int gather_from); static void messages_deliver_to_app (struct totemsrp_instance *instance, int skip, unsigned int end_point); static int orf_token_mcast (struct totemsrp_instance *instance, struct orf_token *oken, int fcc_mcasts_allowed); static void messages_free (struct totemsrp_instance *instance, unsigned int token_aru); static void memb_ring_id_set_and_store (struct totemsrp_instance *instance, struct memb_ring_id *ring_id); static void memb_state_commit_token_update (struct totemsrp_instance *instance, struct memb_commit_token *commit_token); static void memb_state_commit_token_target_set (struct totemsrp_instance *instance, struct memb_commit_token *commit_token); static int memb_state_commit_token_send (struct totemsrp_instance *instance, struct memb_commit_token *memb_commit_token); static void memb_state_commit_token_create (struct totemsrp_instance *instance, struct memb_commit_token *commit_token); static int token_hold_cancel_send (struct totemsrp_instance *instance); static void orf_token_endian_convert (struct orf_token *in, struct orf_token *out); static void memb_commit_token_endian_convert (struct memb_commit_token *in, struct memb_commit_token *out); static void memb_join_endian_convert (struct memb_join *in, struct memb_join *out); static void mcast_endian_convert (struct mcast *in, struct mcast *out); static void memb_merge_detect_endian_convert ( struct memb_merge_detect *in, struct memb_merge_detect *out); static void srp_addr_copy_endian_convert (struct srp_addr *out, struct srp_addr *in); static void timer_function_orf_token_timeout (void *data); static void timer_function_heartbeat_timeout (void *data); static void timer_function_token_retransmit_timeout (void *data); static void timer_function_token_hold_retransmit_timeout (void *data); static void timer_function_merge_detect_timeout (void *data); void main_deliver_fn ( void *context, void *msg, int msg_len); void main_iface_change_fn ( void *context, struct totem_ip_address *iface_address, unsigned int iface_no); /* * All instances in one database */ static struct hdb_handle_database totemsrp_instance_database = { .handle_count = 0, .handles = 0, .iterator = 0, .mutex = PTHREAD_MUTEX_INITIALIZER }; struct message_handlers totemsrp_message_handlers = { 6, { message_handler_orf_token, message_handler_mcast, message_handler_memb_merge_detect, message_handler_memb_join, message_handler_memb_commit_token, message_handler_token_hold_cancel } }; static const char *rundir = NULL; #define log_printf(level, format, args...) 
\ do { \ instance->totemsrp_log_printf (instance->totemsrp_subsys_id, \ __FUNCTION__, __FILE__, __LINE__, level, \ format, ##args); \ } while (0); static void totemsrp_instance_initialize (struct totemsrp_instance *instance) { memset (instance, 0, sizeof (struct totemsrp_instance)); list_init (&instance->token_callback_received_listhead); list_init (&instance->token_callback_sent_listhead); instance->my_received_flg = 1; instance->my_token_seq = SEQNO_START_TOKEN - 1; instance->memb_state = MEMB_STATE_OPERATIONAL; instance->set_aru = -1; instance->my_aru = SEQNO_START_MSG; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_high_delivered = SEQNO_START_MSG; } static void main_token_seqid_get ( void *msg, unsigned int *seqid, unsigned int *token_is) { struct orf_token *token = (struct orf_token *)msg; *seqid = 0; *token_is = 0; if (token->header.type == MESSAGE_TYPE_ORF_TOKEN) { *seqid = token->token_seq; *token_is = 1; } } static unsigned int main_msgs_missing (void) { // TODO return (0); } /* * Exported interfaces */ int totemsrp_initialize ( hdb_handle_t poll_handle, hdb_handle_t *handle, struct totem_config *totem_config, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)) { struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_create (&totemsrp_instance_database, sizeof (struct totemsrp_instance), handle); if (res != 0) { goto error_exit; } res = hdb_handle_get (&totemsrp_instance_database, *handle, (void *)&instance); if (res != 0) { goto error_destroy; } rundir = getenv ("COROSYNC_RUN_DIR"); if (rundir == NULL) { rundir = LOCALSTATEDIR "/lib/corosync"; } res = mkdir (rundir, 0700); if (res == -1 && errno != EEXIST) { goto error_put; } res = chdir (rundir); if (res == -1) { goto error_put; } totemsrp_instance_initialize (instance); instance->totem_config = totem_config; /* * Configure logging */ instance->totemsrp_log_level_security = totem_config->totem_logging_configuration.log_level_security; instance->totemsrp_log_level_error = totem_config->totem_logging_configuration.log_level_error; instance->totemsrp_log_level_warning = totem_config->totem_logging_configuration.log_level_warning; instance->totemsrp_log_level_notice = totem_config->totem_logging_configuration.log_level_notice; instance->totemsrp_log_level_debug = totem_config->totem_logging_configuration.log_level_debug; instance->totemsrp_subsys_id = totem_config->totem_logging_configuration.log_subsys_id; instance->totemsrp_log_printf = totem_config->totem_logging_configuration.log_printf; /* * Initialize local variables for totemsrp */ totemip_copy (&instance->mcast_address, &totem_config->interfaces[0].mcast_addr); memset (instance->iov_buffer, 0, FRAME_SIZE_MAX); /* * Display totem configuration */ log_printf (instance->totemsrp_log_level_notice, "Token Timeout (%d ms) retransmit timeout (%d ms)\n", totem_config->token_timeout, totem_config->token_retransmit_timeout); log_printf (instance->totemsrp_log_level_notice, "token hold (%d ms) retransmits before loss (%d retrans)\n", totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const); log_printf (instance->totemsrp_log_level_notice, "join (%d ms) send_join 
(%d ms) consensus (%d ms) merge (%d ms)\n", totem_config->join_timeout, totem_config->send_join_timeout, totem_config->consensus_timeout, totem_config->merge_timeout); log_printf (instance->totemsrp_log_level_notice, "downcheck (%d ms) fail to recv const (%d msgs)\n", totem_config->downcheck_timeout, totem_config->fail_to_recv_const); log_printf (instance->totemsrp_log_level_notice, "seqno unchanged const (%d rotations) Maximum network MTU %d\n", totem_config->seqno_unchanged_const, totem_config->net_mtu); log_printf (instance->totemsrp_log_level_notice, "window size per rotation (%d messages) maximum messages per rotation (%d messages)\n", totem_config->window_size, totem_config->max_messages); log_printf (instance->totemsrp_log_level_notice, "send threads (%d threads)\n", totem_config->threads); log_printf (instance->totemsrp_log_level_notice, "RRP token expired timeout (%d ms)\n", totem_config->rrp_token_expired_timeout); log_printf (instance->totemsrp_log_level_notice, "RRP token problem counter (%d ms)\n", totem_config->rrp_problem_count_timeout); log_printf (instance->totemsrp_log_level_notice, "RRP threshold (%d problem count)\n", totem_config->rrp_problem_count_threshold); log_printf (instance->totemsrp_log_level_notice, "RRP mode set to %s.\n", instance->totem_config->rrp_mode); log_printf (instance->totemsrp_log_level_notice, "heartbeat_failures_allowed (%d)\n", totem_config->heartbeat_failures_allowed); log_printf (instance->totemsrp_log_level_notice, "max_network_delay (%d ms)\n", totem_config->max_network_delay); queue_init (&instance->retrans_message_queue, RETRANS_MESSAGE_QUEUE_SIZE_MAX, sizeof (struct message_item)); sq_init (&instance->regular_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); sq_init (&instance->recovery_sort_queue, QUEUE_RTR_ITEMS_SIZE_MAX, sizeof (struct sort_queue_item), 0); instance->totemsrp_poll_handle = poll_handle; instance->totemsrp_deliver_fn = deliver_fn; instance->totemsrp_confchg_fn = confchg_fn; instance->use_heartbeat = 1; if ( totem_config->heartbeat_failures_allowed == 0 ) { log_printf (instance->totemsrp_log_level_notice, "HeartBeat is Disabled. To enable set heartbeat_failures_allowed > 0\n"); instance->use_heartbeat = 0; } if (instance->use_heartbeat) { instance->heartbeat_timeout = (totem_config->heartbeat_failures_allowed) * totem_config->token_retransmit_timeout + totem_config->max_network_delay; if (instance->heartbeat_timeout >= totem_config->token_timeout) { log_printf (instance->totemsrp_log_level_notice, "total heartbeat_timeout (%d ms) is not less than token timeout (%d ms)\n", instance->heartbeat_timeout, totem_config->token_timeout); log_printf (instance->totemsrp_log_level_notice, "heartbeat_timeout = heartbeat_failures_allowed * token_retransmit_timeout + max_network_delay\n"); log_printf (instance->totemsrp_log_level_notice, "heartbeat timeout should be less than the token timeout. 
HeartBeat is Disabled !!\n"); instance->use_heartbeat = 0; } else { log_printf (instance->totemsrp_log_level_notice, "total heartbeat_timeout (%d ms)\n", instance->heartbeat_timeout); } } totemrrp_initialize ( poll_handle, &instance->totemrrp_handle, totem_config, instance, main_deliver_fn, main_iface_change_fn, main_token_seqid_get, main_msgs_missing); /* * Must have net_mtu adjusted by totemrrp_initialize first */ queue_init (&instance->new_message_queue, MESSAGE_QUEUE_MAX, sizeof (struct message_item)); hdb_handle_put (&totemsrp_instance_database, *handle); return (0); error_put: hdb_handle_put (&totemsrp_instance_database, *handle); error_destroy: hdb_handle_destroy (&totemsrp_instance_database, *handle); error_exit: return (-1); } void totemsrp_finalize ( hdb_handle_t handle) { struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { return; } hdb_handle_put (&totemsrp_instance_database, handle); } int totemsrp_ifaces_get ( hdb_handle_t handle, unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count) { struct totemsrp_instance *instance; int res; unsigned int found = 0; unsigned int i; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } for (i = 0; i < instance->my_memb_entries; i++) { if (instance->my_memb_list[i].addr[0].nodeid == nodeid) { found = 1; break; } } if (found) { memcpy (interfaces, &instance->my_memb_list[i], sizeof (struct srp_addr)); *iface_count = instance->totem_config->interface_count; goto finish; } for (i = 0; i < instance->my_left_memb_entries; i++) { if (instance->my_left_memb_list[i].addr[0].nodeid == nodeid) { found = 1; break; } } if (found) { memcpy (interfaces, &instance->my_left_memb_list[i], sizeof (struct srp_addr)); *iface_count = instance->totem_config->interface_count; } else { res = -1; } finish: totemrrp_ifaces_get (instance->totemrrp_handle, status, NULL); hdb_handle_put (&totemsrp_instance_database, handle); error_exit: return (res); } unsigned int totemsrp_my_nodeid_get ( hdb_handle_t handle) { struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { return (0); } res = instance->totem_config->interfaces[0].boundto.nodeid; hdb_handle_put (&totemsrp_instance_database, handle); return (res); } int totemsrp_my_family_get ( hdb_handle_t handle) { struct totemsrp_instance *instance; int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { return (0); } res = instance->totem_config->interfaces[0].boundto.family; hdb_handle_put (&totemsrp_instance_database, handle); return (res); } int totemsrp_ring_reenable ( hdb_handle_t handle) { struct totemsrp_instance *instance; int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } totemrrp_ring_reenable (instance->totemrrp_handle); hdb_handle_put (&totemsrp_instance_database, handle); error_exit: return (res); } /* * Set operations for use by the membership algorithm */ static int srp_addr_equal (struct srp_addr *a, struct srp_addr *b) { unsigned int i; unsigned int res; for (i = 0; i < 1; i++) { res = totemip_equal (&a->addr[i], &b->addr[i]); if (res == 0) { return (0); } } return (1); } static void srp_addr_copy (struct srp_addr *dest, struct srp_addr *src) { unsigned int i; for (i = 0; i < INTERFACE_MAX; i++) {
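/* copy the address bound to each redundant-ring interface; an srp_addr carries one totem_ip_address per configured interface */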
totemip_copy (&dest->addr[i], &src->addr[i]); } } static void srp_addr_to_nodeid ( unsigned int *nodeid_out, struct srp_addr *srp_addr_in, unsigned int entries) { unsigned int i; for (i = 0; i < entries; i++) { nodeid_out[i] = srp_addr_in[i].addr[0].nodeid; } } static void srp_addr_copy_endian_convert (struct srp_addr *out, struct srp_addr *in) { int i; for (i = 0; i < INTERFACE_MAX; i++) { totemip_copy_endian_convert (&out->addr[i], &in->addr[i]); } } static void memb_consensus_reset (struct totemsrp_instance *instance) { instance->consensus_list_entries = 0; } static void memb_set_subtract ( struct srp_addr *out_list, int *out_list_entries, struct srp_addr *one_list, int one_list_entries, struct srp_addr *two_list, int two_list_entries) { int found = 0; int i; int j; *out_list_entries = 0; for (i = 0; i < one_list_entries; i++) { for (j = 0; j < two_list_entries; j++) { if (srp_addr_equal (&one_list[i], &two_list[j])) { found = 1; break; } } if (found == 0) { srp_addr_copy (&out_list[*out_list_entries], &one_list[i]); *out_list_entries = *out_list_entries + 1; } found = 0; } } /* * Set consensus for a specific processor */ static void memb_consensus_set ( struct totemsrp_instance *instance, struct srp_addr *addr) { int found = 0; int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal(addr, &instance->consensus_list[i].addr)) { found = 1; break; /* found entry */ } } srp_addr_copy (&instance->consensus_list[i].addr, addr); instance->consensus_list[i].set = 1; if (found == 0) { instance->consensus_list_entries++; } return; } /* * Is consensus set for a specific processor */ static int memb_consensus_isset ( struct totemsrp_instance *instance, struct srp_addr *addr) { int i; for (i = 0; i < instance->consensus_list_entries; i++) { if (srp_addr_equal (addr, &instance->consensus_list[i].addr)) { return (instance->consensus_list[i].set); } } return (0); } /* * Is consensus agreed upon based upon consensus database */ static int memb_consensus_agreed ( struct totemsrp_instance *instance) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; int token_memb_entries = 0; int agreed = 1; int i; memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); for (i = 0; i < token_memb_entries; i++) { if (memb_consensus_isset (instance, &token_memb[i]) == 0) { agreed = 0; break; } } assert (token_memb_entries >= 1); return (agreed); } static void memb_consensus_notset ( struct totemsrp_instance *instance, struct srp_addr *no_consensus_list, int *no_consensus_list_entries, struct srp_addr *comparison_list, int comparison_list_entries) { int i; *no_consensus_list_entries = 0; for (i = 0; i < instance->my_proc_list_entries; i++) { if (memb_consensus_isset (instance, &instance->my_proc_list[i]) == 0) { srp_addr_copy (&no_consensus_list[*no_consensus_list_entries], &instance->my_proc_list[i]); *no_consensus_list_entries = *no_consensus_list_entries + 1; } } } /* * Is set1 equal to set2 Entries can be in different orders */ static int memb_set_equal ( struct srp_addr *set1, int set1_entries, struct srp_addr *set2, int set2_entries) { int i; int j; int found = 0; if (set1_entries != set2_entries) { return (0); } for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { found = 1; break; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * Is subset fully contained in fullset */ static int memb_set_subset ( 
struct srp_addr *subset, int subset_entries, struct srp_addr *fullset, int fullset_entries) { int i; int j; int found = 0; if (subset_entries > fullset_entries) { return (0); } for (i = 0; i < subset_entries; i++) { for (j = 0; j < fullset_entries; j++) { if (srp_addr_equal (&subset[i], &fullset[j])) { found = 1; } } if (found == 0) { return (0); } found = 0; } return (1); } /* * merge subset into fullset taking care not to add duplicates */ static void memb_set_merge ( struct srp_addr *subset, int subset_entries, struct srp_addr *fullset, int *fullset_entries) { int found = 0; int i; int j; for (i = 0; i < subset_entries; i++) { for (j = 0; j < *fullset_entries; j++) { if (srp_addr_equal (&fullset[j], &subset[i])) { found = 1; break; } } if (found == 0) { srp_addr_copy (&fullset[*fullset_entries], &subset[i]); *fullset_entries = *fullset_entries + 1; } found = 0; } return; } static void memb_set_and ( struct srp_addr *set1, int set1_entries, struct srp_addr *set2, int set2_entries, struct srp_addr *and, int *and_entries) { int i; int j; int found = 0; *and_entries = 0; for (i = 0; i < set2_entries; i++) { for (j = 0; j < set1_entries; j++) { if (srp_addr_equal (&set1[j], &set2[i])) { found = 1; break; } } if (found) { srp_addr_copy (&and[*and_entries], &set1[j]); *and_entries = *and_entries + 1; } found = 0; } return; } #ifdef CODE_COVERAGE static void memb_set_print ( char *string, struct srp_addr *list, int list_entries) { int i; int j; printf ("List '%s' contains %d entries:\n", string, list_entries); for (i = 0; i < list_entries; i++) { for (j = 0; j < INTERFACE_MAX; j++) { printf ("Address %d\n", i); printf ("\tiface %d %s\n", j, totemip_print (&list[i].addr[j])); printf ("family %d\n", list[i].addr[j].family); } } } #endif static void reset_token_retransmit_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->token_retransmit_timeout, (void *)instance, timer_function_token_retransmit_timeout, &instance->timer_orf_token_retransmit_timeout); } static void start_merge_detect_timeout (struct totemsrp_instance *instance) { if (instance->my_merge_detect_timeout_outstanding == 0) { poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->merge_timeout, (void *)instance, timer_function_merge_detect_timeout, &instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 1; } } static void cancel_merge_detect_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_merge_detect_timeout); instance->my_merge_detect_timeout_outstanding = 0; } /* * ring_state_* is used to save and restore the sort queue * state when a recovery operation fails (and enters gather) */ static void old_ring_state_save (struct totemsrp_instance *instance) { if (instance->old_ring_state_saved == 0) { instance->old_ring_state_saved = 1; instance->old_ring_state_aru = instance->my_aru; instance->old_ring_state_high_seq_received = instance->my_high_seq_received; log_printf (instance->totemsrp_log_level_notice, "Saving state aru %x high seq received %x\n", instance->my_aru, instance->my_high_seq_received); } } static void ring_save (struct totemsrp_instance *instance) { if (instance->ring_saved == 0) { instance->ring_saved = 1; memcpy (&instance->my_old_ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); } } static void ring_reset (struct totemsrp_instance 
*instance) { instance->ring_saved = 0; } static void ring_state_restore (struct totemsrp_instance *instance) { if (instance->old_ring_state_saved) { totemip_zero_set(&instance->my_ring_id.rep); instance->my_aru = instance->old_ring_state_aru; instance->my_high_seq_received = instance->old_ring_state_high_seq_received; log_printf (instance->totemsrp_log_level_notice, "Restoring instance->my_aru %x my high seq received %x\n", instance->my_aru, instance->my_high_seq_received); } } static void old_ring_state_reset (struct totemsrp_instance *instance) { instance->old_ring_state_saved = 0; } static void reset_token_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->token_timeout, (void *)instance, timer_function_orf_token_timeout, &instance->timer_orf_token_timeout); } static void reset_heartbeat_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); poll_timer_add (instance->totemsrp_poll_handle, instance->heartbeat_timeout, (void *)instance, timer_function_heartbeat_timeout, &instance->timer_heartbeat_timeout); } static void cancel_token_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout); } static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_heartbeat_timeout); } static void cancel_token_retransmit_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_retransmit_timeout); } static void start_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->token_hold_timeout, (void *)instance, timer_function_token_hold_retransmit_timeout, &instance->timer_orf_token_hold_retransmit_timeout); } static void cancel_token_hold_retransmit_timeout (struct totemsrp_instance *instance) { poll_timer_delete (instance->totemsrp_poll_handle, instance->timer_orf_token_hold_retransmit_timeout); } static void memb_state_consensus_timeout_expired ( struct totemsrp_instance *instance) { struct srp_addr no_consensus_list[PROCESSOR_COUNT_MAX]; int no_consensus_list_entries; if (memb_consensus_agreed (instance)) { memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); reset_token_timeout (instance); // REVIEWED } else { memb_consensus_notset ( instance, no_consensus_list, &no_consensus_list_entries, instance->my_proc_list, instance->my_proc_list_entries); memb_set_merge (no_consensus_list, no_consensus_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); memb_state_gather_enter (instance, 0); } } static void memb_join_message_send (struct totemsrp_instance *instance); static void memb_merge_detect_transmit (struct totemsrp_instance *instance); /* * Timers used for various states of the membership algorithm */ static void timer_function_orf_token_timeout (void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: log_printf (instance->totemsrp_log_level_notice, "The token was lost in the OPERATIONAL state.\n"); totemrrp_iface_check (instance->totemrrp_handle); memb_state_gather_enter (instance, 2); break; case MEMB_STATE_GATHER: log_printf 
(instance->totemsrp_log_level_notice, "The consensus timeout expired.\n"); memb_state_consensus_timeout_expired (instance); memb_state_gather_enter (instance, 3); break; case MEMB_STATE_COMMIT: log_printf (instance->totemsrp_log_level_notice, "The token was lost in the COMMIT state.\n"); memb_state_gather_enter (instance, 4); break; case MEMB_STATE_RECOVERY: log_printf (instance->totemsrp_log_level_notice, "The token was lost in the RECOVERY state.\n"); ring_state_restore (instance); memb_state_gather_enter (instance, 5); break; } } static void timer_function_heartbeat_timeout (void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; log_printf (instance->totemsrp_log_level_notice, "HeartBeat timer expired. Invoking token loss mechanism in state %d\n", instance->memb_state); timer_function_orf_token_timeout(data); } static void memb_timer_function_state_gather (void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: assert (0); /* this should never happen */ break; case MEMB_STATE_GATHER: case MEMB_STATE_COMMIT: memb_join_message_send (instance); /* * Restart the join timeout */ poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->join_timeout, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); break; } } static void memb_timer_function_gather_consensus_timeout (void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; memb_state_consensus_timeout_expired (instance); } static void deliver_messages_from_recovery_to_regular (struct totemsrp_instance *instance) { unsigned int i; struct sort_queue_item *recovery_message_item; struct sort_queue_item regular_message_item; unsigned int range = 0; int res; void *ptr; struct mcast *mcast; log_printf (instance->totemsrp_log_level_debug, "recovery to regular %x-%x\n", SEQNO_START_MSG + 1, instance->my_aru); range = instance->my_aru - SEQNO_START_MSG; /* * Move messages from recovery to regular sort queue */ // todo should i be initialized to 0 or 1 ?
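/*
 * A sketch of what the loop below undoes.  During recovery, messages
 * from the old ring are re-originated encapsulated inside a new-ring
 * mcast, so the datagram carries two headers back to back:
 *
 *   struct mcast header;      new ring id, header.encapsulated set
 *   struct mcast old_header;  start of original message, old ring id
 *   ...original message body...
 *
 * Stripping sizeof (struct mcast) bytes off the front of the iovec
 * recovers the original message, which can then be re-sorted and
 * delivered under its old-ring sequence number.
 */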
for (i = 1; i <= range; i++) { res = sq_item_get (&instance->recovery_sort_queue, i + SEQNO_START_MSG, &ptr); if (res != 0) { continue; } recovery_message_item = (struct sort_queue_item *)ptr; /* * Convert recovery message into regular message */ if (recovery_message_item->iov_len > 1) { mcast = recovery_message_item->iovec[1].iov_base; memcpy (®ular_message_item.iovec[0], &recovery_message_item->iovec[1], sizeof (struct iovec) * recovery_message_item->iov_len); } else { mcast = recovery_message_item->iovec[0].iov_base; if (mcast->header.encapsulated == MESSAGE_ENCAPSULATED) { /* * Message is a recovery message encapsulated * in a new ring message */ regular_message_item.iovec[0].iov_base = (char *)recovery_message_item->iovec[0].iov_base + sizeof (struct mcast); regular_message_item.iovec[0].iov_len = recovery_message_item->iovec[0].iov_len - sizeof (struct mcast); regular_message_item.iov_len = 1; mcast = regular_message_item.iovec[0].iov_base; } else { continue; /* TODO this case shouldn't happen */ /* * Message is originated on new ring and not * encapsulated */ regular_message_item.iovec[0].iov_base = recovery_message_item->iovec[0].iov_base; regular_message_item.iovec[0].iov_len = recovery_message_item->iovec[0].iov_len; } } log_printf (instance->totemsrp_log_level_debug, "comparing if ring id is for this processors old ring seqno %d\n", mcast->seq); /* * Only add this message to the regular sort * queue if it was originated with the same ring * id as the previous ring */ if (memcmp (&instance->my_old_ring_id, &mcast->ring_id, sizeof (struct memb_ring_id)) == 0) { regular_message_item.iov_len = recovery_message_item->iov_len; res = sq_item_inuse (&instance->regular_sort_queue, mcast->seq); if (res == 0) { sq_item_add (&instance->regular_sort_queue, ®ular_message_item, mcast->seq); if (sq_lt_compare (instance->old_ring_state_high_seq_received, mcast->seq)) { instance->old_ring_state_high_seq_received = mcast->seq; } } } else { log_printf (instance->totemsrp_log_level_notice, "-not adding msg with seq no %x\n", mcast->seq); } } } /* * Change states in the state machine of the membership algorithm */ static void memb_state_operational_enter (struct totemsrp_instance *instance) { struct srp_addr joined_list[PROCESSOR_COUNT_MAX]; int joined_list_entries = 0; unsigned int aru_save; unsigned int joined_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int trans_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int new_memb_list_totemip[PROCESSOR_COUNT_MAX]; unsigned int left_list[PROCESSOR_COUNT_MAX]; memb_consensus_reset (instance); old_ring_state_reset (instance); ring_reset (instance); deliver_messages_from_recovery_to_regular (instance); log_printf (instance->totemsrp_log_level_debug, "Delivering to app %x to %x\n", instance->my_high_delivered + 1, instance->old_ring_state_high_seq_received); aru_save = instance->my_aru; instance->my_aru = instance->old_ring_state_aru; messages_deliver_to_app (instance, 0, instance->old_ring_state_high_seq_received); /* * Calculate joined and left list */ memb_set_subtract (instance->my_left_memb_list, &instance->my_left_memb_entries, instance->my_memb_list, instance->my_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); memb_set_subtract (joined_list, &joined_list_entries, instance->my_new_memb_list, instance->my_new_memb_entries, instance->my_trans_memb_list, instance->my_trans_memb_entries); /* * Install new membership */ instance->my_memb_entries = instance->my_new_memb_entries; memcpy (&instance->my_memb_list, 
instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->last_released = 0; instance->my_set_retrans_flg = 0; /* * Deliver transitional configuration to application */ srp_addr_to_nodeid (left_list, instance->my_left_memb_list, instance->my_left_memb_entries); srp_addr_to_nodeid (trans_memb_list_totemip, instance->my_trans_memb_list, instance->my_trans_memb_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_TRANSITIONAL, trans_memb_list_totemip, instance->my_trans_memb_entries, left_list, instance->my_left_memb_entries, 0, 0, &instance->my_ring_id); // TODO we need to filter to ensure we only deliver those // messages which are part of instance->my_deliver_memb messages_deliver_to_app (instance, 1, instance->old_ring_state_high_seq_received); instance->my_aru = aru_save; /* * Deliver regular configuration to application */ srp_addr_to_nodeid (new_memb_list_totemip, instance->my_new_memb_list, instance->my_new_memb_entries); srp_addr_to_nodeid (joined_list_totemip, joined_list, joined_list_entries); instance->totemsrp_confchg_fn (TOTEM_CONFIGURATION_REGULAR, new_memb_list_totemip, instance->my_new_memb_entries, 0, 0, joined_list_totemip, joined_list_entries, &instance->my_ring_id); /* * The recovery sort queue now becomes the regular * sort queue. It is necessary to copy the state * into the regular sort queue. */ sq_copy (&instance->regular_sort_queue, &instance->recovery_sort_queue); instance->my_last_aru = SEQNO_START_MSG; sq_items_release (&instance->regular_sort_queue, SEQNO_START_MSG - 1); /* When making my_proc_list smaller, ensure that the * now non-used entries are zero-ed out. There are some suspect * assert's that assume that there is always 2 entries in the list. * These fail when my_proc_list is reduced to 1 entry (and the * valid [0] entry is the same as the 'unused' [1] entry). 
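* 
* Concretely: if the membership shrinks from {A, B} to {A},
* my_proc_list_entries drops to 1, but without the memset below
* entry [1] would still hold stale data; if that stale entry
* happened to equal entry [0], the assert in memb_state_gather_enter ()
* that proc_list[0] and proc_list[1] differ would trip on a perfectly
* valid one-node membership.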
*/ memset(instance->my_proc_list, 0, sizeof (struct srp_addr) * instance->my_proc_list_entries); instance->my_proc_list_entries = instance->my_new_memb_entries; memcpy (instance->my_proc_list, instance->my_new_memb_list, sizeof (struct srp_addr) * instance->my_memb_entries); instance->my_failed_list_entries = 0; instance->my_high_delivered = instance->my_aru; // TODO the recovery messages are leaked log_printf (instance->totemsrp_log_level_notice, "entering OPERATIONAL state.\n"); instance->memb_state = MEMB_STATE_OPERATIONAL; instance->my_received_flg = 1; return; } static void memb_state_gather_enter ( struct totemsrp_instance *instance, int gather_from) { memb_set_merge ( &instance->my_id, 1, instance->my_proc_list, &instance->my_proc_list_entries); assert (srp_addr_equal (&instance->my_proc_list[0], &instance->my_proc_list[1]) == 0); memb_join_message_send (instance); /* * Restart the join timeout */ poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->join_timeout, (void *)instance, memb_timer_function_state_gather, &instance->memb_timer_state_gather_join_timeout); /* * Restart the consensus timeout */ poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); poll_timer_add (instance->totemsrp_poll_handle, instance->totem_config->consensus_timeout, (void *)instance, memb_timer_function_gather_consensus_timeout, &instance->memb_timer_state_gather_consensus_timeout); /* * Cancel the token loss and token retransmission timeouts */ cancel_token_retransmit_timeout (instance); // REVIEWED cancel_token_timeout (instance); // REVIEWED cancel_merge_detect_timeout (instance); memb_consensus_reset (instance); memb_consensus_set (instance, &instance->my_id); log_printf (instance->totemsrp_log_level_notice, "entering GATHER state from %d.\n", gather_from); instance->memb_state = MEMB_STATE_GATHER; return; } static void timer_function_token_retransmit_timeout (void *data); static void memb_state_commit_enter ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { ring_save (instance); old_ring_state_save (instance); memb_state_commit_token_update (instance, commit_token); memb_state_commit_token_target_set (instance, commit_token); memb_ring_id_set_and_store (instance, &commit_token->ring_id); memb_state_commit_token_send (instance, commit_token); instance->token_ring_id_seq = instance->my_ring_id.seq; poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_join_timeout); instance->memb_timer_state_gather_join_timeout = 0; poll_timer_delete (instance->totemsrp_poll_handle, instance->memb_timer_state_gather_consensus_timeout); instance->memb_timer_state_gather_consensus_timeout = 0; reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED log_printf (instance->totemsrp_log_level_notice, "entering COMMIT state.\n"); instance->memb_state = MEMB_STATE_COMMIT; /* * reset all flow control variables since we are starting a new ring */ instance->my_trc = 0; instance->my_pbl = 0; instance->my_cbl = 0; return; } static void memb_state_recovery_enter ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { int i; int local_received_flg = 1; unsigned int low_ring_aru; unsigned int range = 0; unsigned int messages_originated = 0; char is_originated[4096]; char not_originated[4096]; char seqno_string_hex[10]; struct srp_addr *addr; 
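/*
 * Layout of the trailing, variable-length part of the commit token
 * that the two pointers set up below index (see the dynamic members
 * documented in struct memb_commit_token):
 *
 *   struct srp_addr addr[addr_entries];
 *   struct memb_commit_token_memb_entry memb_list[addr_entries];
 *
 * addr[] names every processor in the new ring; memb_list[] carries
 * each processor's old ring id, aru, high_delivered and received_flg,
 * which drive the message copy-back during recovery.
 */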
struct memb_commit_token_memb_entry *memb_list; addr = (struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); log_printf (instance->totemsrp_log_level_notice, "entering RECOVERY state.\n"); instance->my_high_ring_delivered = 0; sq_reinit (&instance->recovery_sort_queue, SEQNO_START_MSG); queue_reinit (&instance->retrans_message_queue); low_ring_aru = instance->old_ring_state_high_seq_received; memb_state_commit_token_send (instance, commit_token); instance->my_token_seq = SEQNO_START_TOKEN - 1; /* * Build regular configuration */ totemrrp_processor_count_set ( instance->totemrrp_handle, commit_token->addr_entries); /* * Build transitional configuration */ memb_set_and (instance->my_new_memb_list, instance->my_new_memb_entries, instance->my_memb_list, instance->my_memb_entries, instance->my_trans_memb_list, &instance->my_trans_memb_entries); for (i = 0; i < instance->my_new_memb_entries; i++) { log_printf (instance->totemsrp_log_level_notice, "position [%d] member %s:\n", i, totemip_print (&addr[i].addr[0])); log_printf (instance->totemsrp_log_level_notice, "previous ring seq %lld rep %s\n", memb_list[i].ring_id.seq, totemip_print (&memb_list[i].ring_id.rep)); log_printf (instance->totemsrp_log_level_notice, "aru %x high delivered %x received flag %d\n", memb_list[i].aru, memb_list[i].high_delivered, memb_list[i].received_flg); // assert (totemip_print (&memb_list[i].ring_id.rep) != 0); } /* * Determine if any received flag is false */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_trans_memb_list, instance->my_trans_memb_entries) && memb_list[i].received_flg == 0) { instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct srp_addr) * instance->my_trans_memb_entries); local_received_flg = 0; break; } } if (local_received_flg == 1) { goto no_originate; } /* Else originate messages if we should */ /* * Calculate my_low_ring_aru, instance->my_high_ring_delivered for the transitional membership */ for (i = 0; i < commit_token->addr_entries; i++) { if (memb_set_subset (&instance->my_new_memb_list[i], 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) && memcmp (&instance->my_old_ring_id, &memb_list[i].ring_id, sizeof (struct memb_ring_id)) == 0) { if (sq_lt_compare (memb_list[i].aru, low_ring_aru)) { low_ring_aru = memb_list[i].aru; } if (sq_lt_compare (instance->my_high_ring_delivered, memb_list[i].high_delivered)) { instance->my_high_ring_delivered = memb_list[i].high_delivered; } } } /* * Copy all old ring messages to instance->retrans_message_queue */ range = instance->old_ring_state_high_seq_received - low_ring_aru; if (range == 0) { /* * No messages to copy */ goto no_originate; } assert (range < 1024); log_printf (instance->totemsrp_log_level_notice, "copying all old ring messages from %x-%x.\n", low_ring_aru + 1, instance->old_ring_state_high_seq_received); strcpy (not_originated, "Not Originated for recovery: "); strcpy (is_originated, "Originated for recovery: "); for (i = 1; i <= range; i++) { struct sort_queue_item *sort_queue_item; struct message_item message_item; void *ptr; int res; sprintf (seqno_string_hex, "%x ", low_ring_aru + i); res = sq_item_get (&instance->regular_sort_queue, low_ring_aru + i, &ptr); if (res != 0) { strcat (not_originated, seqno_string_hex); continue; } strcat (is_originated, 
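The low_ring_aru and my_high_ring_delivered updates above use sq_lt_compare rather than a plain <, because sequence numbers wrap around 32 bits. A runnable illustration of the wraparound-safe ("serial number") ordering that sq_lt_compare is assumed to implement; this is an editorial sketch, not the totem code:

#include <assert.h>
#include <stdint.h>

/* Wraparound-safe "a < b": interpret the distance (b - a) as signed,
 * so the ordering keeps working when the counter rolls over
 * 0xffffffff -> 0. */
static int seq_lt (uint32_t a, uint32_t b)
{
        return ((int32_t)(b - a)) > 0;
}

int main (void)
{
        assert (seq_lt (5, 9));              /* ordinary case */
        assert (!seq_lt (9, 5));
        assert (seq_lt (0xfffffffe, 3));     /* still "less" across the wrap */
        assert (!seq_lt (3, 0xfffffffe));
        return 0;
}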
seqno_string_hex); sort_queue_item = ptr; assert (sort_queue_item->iov_len > 0); assert (sort_queue_item->iov_len <= MAXIOVS); messages_originated++; memset (&message_item, 0, sizeof (struct message_item)); // TODO LEAK message_item.mcast = malloc (sizeof (struct mcast)); assert (message_item.mcast); message_item.mcast->header.type = MESSAGE_TYPE_MCAST; srp_addr_copy (&message_item.mcast->system_from, &instance->my_id); message_item.mcast->header.encapsulated = MESSAGE_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->header.endian_detector = ENDIAN_LOCAL; memcpy (&message_item.mcast->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); message_item.iov_len = sort_queue_item->iov_len; memcpy (&message_item.iovec, &sort_queue_item->iovec, sizeof (struct iovec) * sort_queue_item->iov_len); queue_item_add (&instance->retrans_message_queue, &message_item); } log_printf (instance->totemsrp_log_level_notice, "Originated %d messages in RECOVERY.\n", messages_originated); strcat (not_originated, "\n"); strcat (is_originated, "\n"); log_printf (instance->totemsrp_log_level_notice, "%s", is_originated); log_printf (instance->totemsrp_log_level_notice, "%s", not_originated); goto originated; no_originate: log_printf (instance->totemsrp_log_level_notice, "Did not need to originate any messages in recovery.\n"); originated: instance->my_aru = SEQNO_START_MSG; instance->my_aru_count = 0; instance->my_seq_unchanged = 0; instance->my_high_seq_received = SEQNO_START_MSG; instance->my_install_seq = SEQNO_START_MSG; instance->last_released = SEQNO_START_MSG; reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED instance->memb_state = MEMB_STATE_RECOVERY; return; } int totemsrp_new_msg_signal (hdb_handle_t handle) { struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } token_hold_cancel_send (instance); hdb_handle_put (&totemsrp_instance_database, handle); return (0); error_exit: return (-1); } int totemsrp_mcast ( hdb_handle_t handle, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int guarantee) { int i; int j; struct message_item message_item; struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } if (queue_is_full (&instance->new_message_queue)) { log_printf (instance->totemsrp_log_level_warning, "queue full\n"); return (-1); } for (j = 0, i = 0; i < iov_len; i++) { j+= iovec[i].iov_len; } memset (&message_item, 0, sizeof (struct message_item)); /* * Allocate pending item */ // TODO LEAK message_item.mcast = malloc (sizeof (struct mcast)); if (message_item.mcast == 0) { goto error_mcast; } /* * Set mcast header */ message_item.mcast->header.type = MESSAGE_TYPE_MCAST; message_item.mcast->header.endian_detector = ENDIAN_LOCAL; message_item.mcast->header.encapsulated = MESSAGE_NOT_ENCAPSULATED; message_item.mcast->header.nodeid = instance->my_id.addr[0].nodeid; assert (message_item.mcast->header.nodeid); message_item.mcast->guarantee = guarantee; srp_addr_copy (&message_item.mcast->system_from, &instance->my_id); for (i = 0; i < iov_len; i++) { // TODO LEAK message_item.iovec[i].iov_base = malloc (iovec[i].iov_len); if (message_item.iovec[i].iov_base == 0) { goto error_iovec; } memcpy (message_item.iovec[i].iov_base, 
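The copy loop above in totemsrp_mcast (and the error_iovec unwind just below) follows a standard C pattern: allocate and copy each io vector, and on the first failure free everything allocated so far so the caller sees all-or-nothing behaviour. A self-contained sketch of that pattern; the names here are illustrative, not the totemsrp ones:

#include <stdlib.h>
#include <string.h>
#include <sys/uio.h>

/* Deep-copy src[0..n) into dst[0..n); on allocation failure, free the
 * partial copy and report the error. */
static int iovec_deep_copy (struct iovec *dst, const struct iovec *src, int n)
{
        int i, j;

        for (i = 0; i < n; i++) {
                dst[i].iov_base = malloc (src[i].iov_len);
                if (dst[i].iov_base == NULL) {
                        goto error_unwind;
                }
                memcpy (dst[i].iov_base, src[i].iov_base, src[i].iov_len);
                dst[i].iov_len = src[i].iov_len;
        }
        return 0;

error_unwind:
        /* free only the entries copied before the failure */
        for (j = 0; j < i; j++) {
                free (dst[j].iov_base);
        }
        return -1;
}

int main (void)
{
        char payload[] = "hello";
        struct iovec src = { .iov_base = payload, .iov_len = sizeof (payload) };
        struct iovec dst;

        if (iovec_deep_copy (&dst, &src, 1) == 0) {
                free (dst.iov_base);
        }
        return 0;
}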
iovec[i].iov_base, iovec[i].iov_len); message_item.iovec[i].iov_len = iovec[i].iov_len; } message_item.iov_len = iov_len; log_printf (instance->totemsrp_log_level_debug, "mcasted message added to pending queue\n"); queue_item_add (&instance->new_message_queue, &message_item); hdb_handle_put (&totemsrp_instance_database, handle); return (0); error_iovec: for (j = 0; j < i; j++) { free (message_item.iovec[j].iov_base); } free(message_item.mcast); error_mcast: hdb_handle_put (&totemsrp_instance_database, handle); error_exit: return (-1); } /* * Determine if there is room to queue a new message */ int totemsrp_avail (hdb_handle_t handle) { int avail; struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } queue_avail (&instance->new_message_queue, &avail); hdb_handle_put (&totemsrp_instance_database, handle); return (avail); error_exit: return (0); } /* * ORF Token Management */ /* * Recast message to mcast group if it is available */ static int orf_token_remcast ( struct totemsrp_instance *instance, int seq) { struct sort_queue_item *sort_queue_item; int res; void *ptr; struct sq *sort_queue; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } res = sq_in_range (sort_queue, seq); if (res == 0) { log_printf (instance->totemsrp_log_level_debug, "sq not in range\n"); return (-1); } /* * Get RTR item at seq, if not available, return */ res = sq_item_get (sort_queue, seq, &ptr); if (res != 0) { return -1; } sort_queue_item = ptr; totemrrp_mcast_noflush_send (instance->totemrrp_handle, sort_queue_item->iovec, sort_queue_item->iov_len); return (0); } /* * Free all freeable messages from ring */ static void messages_free ( struct totemsrp_instance *instance, unsigned int token_aru) { struct sort_queue_item *regular_message; unsigned int i, j; int res; int log_release = 0; unsigned int release_to; unsigned int range = 0; release_to = token_aru; if (sq_lt_compare (instance->my_last_aru, release_to)) { release_to = instance->my_last_aru; } if (sq_lt_compare (instance->my_high_delivered, release_to)) { release_to = instance->my_high_delivered; } /* * Ensure we dont try release before an already released point */ if (sq_lt_compare (release_to, instance->last_released)) { return; } range = release_to - instance->last_released; assert (range < 1024); /* * Release retransmit list items if group aru indicates they are transmitted */ for (i = 1; i <= range; i++) { void *ptr; res = sq_item_get (&instance->regular_sort_queue, instance->last_released + i, &ptr); if (res == 0) { regular_message = ptr; for (j = 0; j < regular_message->iov_len; j++) { free (regular_message->iovec[j].iov_base); } } sq_items_release (&instance->regular_sort_queue, instance->last_released + i); log_release = 1; } instance->last_released += range; if (log_release) { log_printf (instance->totemsrp_log_level_debug, "releasing messages up to and including %x\n", release_to); } } static void update_aru ( struct totemsrp_instance *instance) { unsigned int i; int res; struct sq *sort_queue; unsigned int range; unsigned int my_aru_saved = 0; if (instance->memb_state == MEMB_STATE_RECOVERY) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } range = instance->my_high_seq_received - instance->my_aru; if (range > 1024) { return; } my_aru_saved = instance->my_aru; for (i = 1; i <= range; 
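messages_free() above releases messages only up to the minimum of the token's aru, my_last_aru and my_high_delivered, so nothing is discarded before every node has both received and delivered it. A runnable sketch of that release-point computation, reusing the wraparound-safe comparison from the earlier sketch:

#include <stdint.h>
#include <stdio.h>

static int seq_lt (uint32_t a, uint32_t b)
{
        return ((int32_t)(b - a)) > 0;
}

/* Lowest of the three sequence numbers under wraparound ordering:
 * nothing newer than this may be freed from the retransmit store. */
static uint32_t release_point (uint32_t token_aru,
        uint32_t my_last_aru, uint32_t my_high_delivered)
{
        uint32_t release_to = token_aru;

        if (seq_lt (my_last_aru, release_to)) {
                release_to = my_last_aru;
        }
        if (seq_lt (my_high_delivered, release_to)) {
                release_to = my_high_delivered;
        }
        return release_to;
}

int main (void)
{
        /* group agreed up to 0x64, but this node has delivered only 0x60 */
        printf ("release to %x\n", release_point (0x64, 0x62, 0x60));
        return 0;
}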
i++) {
		void *ptr;

		res = sq_item_get (sort_queue, my_aru_saved + i, &ptr);
		/*
		 * If hole, stop updating aru
		 */
		if (res != 0) {
			break;
		}
	}
	instance->my_aru += i - 1;
}

/*
 * Multicasts pending messages onto the ring (requires orf_token possession)
 */
static int orf_token_mcast (
	struct totemsrp_instance *instance,
	struct orf_token *token,
	int fcc_mcasts_allowed)
{
	struct message_item *message_item = 0;
	struct queue *mcast_queue;
	struct sq *sort_queue;
	struct sort_queue_item sort_queue_item;
	struct sort_queue_item *sort_queue_item_ptr;
	struct mcast *mcast;
	unsigned int fcc_mcast_current;

	if (instance->memb_state == MEMB_STATE_RECOVERY) {
		mcast_queue = &instance->retrans_message_queue;
		sort_queue = &instance->recovery_sort_queue;
		reset_token_retransmit_timeout (instance); // REVIEWED
	} else {
		mcast_queue = &instance->new_message_queue;
		sort_queue = &instance->regular_sort_queue;
	}

	for (fcc_mcast_current = 0; fcc_mcast_current < fcc_mcasts_allowed; fcc_mcast_current++) {
		if (queue_is_empty (mcast_queue)) {
			break;
		}
		message_item = (struct message_item *)queue_item_get (mcast_queue);

		/* preincrement of token->seq below is required by the algorithm */
		if (instance->old_ring_state_saved &&
			(instance->memb_state == MEMB_STATE_GATHER ||
			instance->memb_state == MEMB_STATE_COMMIT)) {

			log_printf (instance->totemsrp_log_level_debug,
				"not multicasting; seqno is %d\n", token->seq);
			return (0);
		}
		message_item->mcast->seq = ++token->seq;
		message_item->mcast->this_seqno = instance->global_seqno++;

		/*
		 * Build IO vector
		 */
		memset (&sort_queue_item, 0, sizeof (struct sort_queue_item));
		sort_queue_item.iovec[0].iov_base = message_item->mcast;
		sort_queue_item.iovec[0].iov_len = sizeof (struct mcast);
		mcast = sort_queue_item.iovec[0].iov_base;

		memcpy (&sort_queue_item.iovec[1], message_item->iovec,
			message_item->iov_len * sizeof (struct iovec));
		memcpy (&mcast->ring_id, &instance->my_ring_id,
			sizeof (struct memb_ring_id));
		sort_queue_item.iov_len = message_item->iov_len + 1;
		assert (sort_queue_item.iov_len < 16);

		/*
		 * Add message to retransmit queue
		 */
		sort_queue_item_ptr = sq_item_add (sort_queue,
			&sort_queue_item, message_item->mcast->seq);

		totemrrp_mcast_noflush_send (instance->totemrrp_handle,
			sort_queue_item_ptr->iovec,
			sort_queue_item_ptr->iov_len);

		/*
		 * Delete item from pending queue
		 */
		queue_item_remove (mcast_queue);

		/*
		 * If messages mcasted, deliver any new messages to totempg
		 */
		instance->my_high_seq_received = token->seq;
	}
	update_aru (instance);

	/*
	 * Return the number of messages mcasted; single node clusters use
	 * this to determine whether more messages are available
	 */
	return (fcc_mcast_current);
}

/*
 * Remulticasts messages in orf_token's retransmit list (requires orf_token)
 * Modifies orf_token's rtr list to include retransmits required by this process
 */
static int orf_token_rtr (
	struct totemsrp_instance *instance,
	struct orf_token *orf_token,
	unsigned int *fcc_allowed)
{
	unsigned int res;
	unsigned int i, j;
	unsigned int found;
	unsigned int total_entries;
	struct sq *sort_queue;
	struct rtr_item *rtr_list;
	unsigned int range = 0;
	char retransmit_msg[1024];
	char value[64];

	if (instance->memb_state == MEMB_STATE_RECOVERY) {
		sort_queue = &instance->recovery_sort_queue;
	} else {
		sort_queue = &instance->regular_sort_queue;
	}

	rtr_list = &orf_token->rtr_list[0];
	strcpy (retransmit_msg, "Retransmit List: ");
	if (orf_token->rtr_list_entries) {
		log_printf (instance->totemsrp_log_level_debug,
			"Retransmit List %d\n", orf_token->rtr_list_entries);
		for (i = 0; i < orf_token->rtr_list_entries; i++) {
			sprintf (value, "%x ", rtr_list[i].seq);
			strcat (retransmit_msg, value);
		}
		strcat (retransmit_msg,
"\n"); log_printf (instance->totemsrp_log_level_notice, "%s", retransmit_msg); } total_entries = orf_token->rtr_list_entries; /* * Retransmit messages on orf_token's RTR list from RTR queue */ for (instance->fcc_remcast_current = 0, i = 0; instance->fcc_remcast_current < *fcc_allowed && i < orf_token->rtr_list_entries;) { /* * If this retransmit request isn't from this configuration, * try next rtr entry */ if (memcmp (&rtr_list[i].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { i += 1; continue; } res = orf_token_remcast (instance, rtr_list[i].seq); if (res == 0) { /* * Multicasted message, so no need to copy to new retransmit list */ orf_token->rtr_list_entries -= 1; assert (orf_token->rtr_list_entries >= 0); memmove (&rtr_list[i], &rtr_list[i + 1], sizeof (struct rtr_item) * (orf_token->rtr_list_entries)); instance->fcc_remcast_current++; } else { i += 1; } } *fcc_allowed = *fcc_allowed - instance->fcc_remcast_current; /* * Add messages to retransmit to RTR list * but only retry if there is room in the retransmit list */ range = instance->my_high_seq_received - instance->my_aru; assert (range < 100000); for (i = 1; (orf_token->rtr_list_entries < RETRANSMIT_ENTRIES_MAX) && (i <= range); i++) { /* * Ensure message is within the sort queue range */ res = sq_in_range (sort_queue, instance->my_aru + i); if (res == 0) { break; } /* * Find if a message is missing from this processor */ res = sq_item_inuse (sort_queue, instance->my_aru + i); if (res == 0) { /* * Determine if missing message is already in retransmit list */ found = 0; for (j = 0; j < orf_token->rtr_list_entries; j++) { if (instance->my_aru + i == rtr_list[j].seq) { found = 1; } } if (found == 0) { /* * Missing message not found in current retransmit list so add it */ memcpy (&rtr_list[orf_token->rtr_list_entries].ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); rtr_list[orf_token->rtr_list_entries].seq = instance->my_aru + i; orf_token->rtr_list_entries++; } } } return (instance->fcc_remcast_current); } static void token_retransmit (struct totemsrp_instance *instance) { struct iovec iovec; iovec.iov_base = instance->orf_token_retransmit; iovec.iov_len = instance->orf_token_retransmit_size; totemrrp_token_send (instance->totemrrp_handle, &iovec, 1); } /* * Retransmit the regular token if no mcast or token has * been received in retransmit token period retransmit * the token to the next processor */ static void timer_function_token_retransmit_timeout (void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); reset_token_retransmit_timeout (instance); // REVIEWED break; } } static void timer_function_token_hold_retransmit_timeout (void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; switch (instance->memb_state) { case MEMB_STATE_GATHER: break; case MEMB_STATE_COMMIT: break; case MEMB_STATE_OPERATIONAL: case MEMB_STATE_RECOVERY: token_retransmit (instance); break; } } static void timer_function_merge_detect_timeout(void *data) { struct totemsrp_instance *instance = (struct totemsrp_instance *)data; instance->my_merge_detect_timeout_outstanding = 0; switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { memb_merge_detect_transmit (instance); } break; case MEMB_STATE_GATHER: case 
MEMB_STATE_COMMIT: case MEMB_STATE_RECOVERY: break; } } /* * Send orf_token to next member (requires orf_token) */ static int token_send ( struct totemsrp_instance *instance, struct orf_token *orf_token, int forward_token) { struct iovec iovec; int res = 0; - int iov_len = sizeof (struct orf_token) + + unsigned int iov_len = sizeof (struct orf_token) + (orf_token->rtr_list_entries * sizeof (struct rtr_item)); memcpy (instance->orf_token_retransmit, orf_token, iov_len); instance->orf_token_retransmit_size = iov_len; orf_token->header.nodeid = instance->my_id.addr[0].nodeid; assert (orf_token->header.nodeid); if (forward_token == 0) { return (0); } iovec.iov_base = orf_token; iovec.iov_len = iov_len; totemrrp_token_send (instance->totemrrp_handle, &iovec, 1); return (res); } static int token_hold_cancel_send (struct totemsrp_instance *instance) { struct token_hold_cancel token_hold_cancel; struct iovec iovec[2]; /* * Only cancel if the token is currently held */ if (instance->my_token_held == 0) { return (0); } instance->my_token_held = 0; /* * Build message */ token_hold_cancel.header.type = MESSAGE_TYPE_TOKEN_HOLD_CANCEL; token_hold_cancel.header.endian_detector = ENDIAN_LOCAL; token_hold_cancel.header.nodeid = instance->my_id.addr[0].nodeid; assert (token_hold_cancel.header.nodeid); iovec[0].iov_base = &token_hold_cancel; iovec[0].iov_len = sizeof (struct token_hold_cancel) - sizeof (struct memb_ring_id); iovec[1].iov_base = &instance->my_ring_id; iovec[1].iov_len = sizeof (struct memb_ring_id); totemrrp_mcast_flush_send (instance->totemrrp_handle, iovec, 2); return (0); } //AAA static int orf_token_send_initial (struct totemsrp_instance *instance) { struct orf_token orf_token; int res; orf_token.header.type = MESSAGE_TYPE_ORF_TOKEN; orf_token.header.endian_detector = ENDIAN_LOCAL; orf_token.header.encapsulated = 0; orf_token.header.nodeid = instance->my_id.addr[0].nodeid; assert (orf_token.header.nodeid); orf_token.seq = SEQNO_START_MSG; orf_token.token_seq = SEQNO_START_TOKEN; orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; if (queue_is_empty (&instance->retrans_message_queue) == 1) { orf_token.retrans_flg = 0; instance->my_set_retrans_flg = 0; } else { orf_token.retrans_flg = 1; instance->my_set_retrans_flg = 1; } orf_token.aru = 0; orf_token.aru = SEQNO_START_MSG - 1; orf_token.aru_addr = instance->my_id.addr[0].nodeid; memcpy (&orf_token.ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)); orf_token.fcc = 0; orf_token.backlog = 0; orf_token.rtr_list_entries = 0; res = token_send (instance, &orf_token, 1); return (res); } static void memb_state_commit_token_update ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; unsigned int high_aru; unsigned int i; addr = (struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); memcpy (instance->my_new_memb_list, addr, sizeof (struct srp_addr) * commit_token->addr_entries); instance->my_new_memb_entries = commit_token->addr_entries; memcpy (&memb_list[commit_token->memb_index].ring_id, &instance->my_old_ring_id, sizeof (struct memb_ring_id)); assert (!totemip_zero_check(&instance->my_old_ring_id.rep)); memb_list[commit_token->memb_index].aru = instance->old_ring_state_aru; /* * TODO high delivered is really instance->my_aru, but with safe this * could change? 
	 */
	instance->my_received_flg =
		(instance->my_aru == instance->my_high_seq_received);
	memb_list[commit_token->memb_index].received_flg = instance->my_received_flg;
	memb_list[commit_token->memb_index].high_delivered = instance->my_high_delivered;

	/*
	 * Find the high aru up to the current memb_index for all matching
	 * ring ids; if any entry matching this ring id has an aru less than
	 * the high aru, clear the received flag for that entry
	 */
	high_aru = memb_list[commit_token->memb_index].aru;
	for (i = 0; i <= commit_token->memb_index; i++) {
		if (memcmp (&memb_list[commit_token->memb_index].ring_id,
			&memb_list[i].ring_id,
			sizeof (struct memb_ring_id)) == 0) {

			if (sq_lt_compare (high_aru, memb_list[i].aru)) {
				high_aru = memb_list[i].aru;
			}
		}
	}

	for (i = 0; i <= commit_token->memb_index; i++) {
		if (memcmp (&memb_list[commit_token->memb_index].ring_id,
			&memb_list[i].ring_id,
			sizeof (struct memb_ring_id)) == 0) {

			if (sq_lt_compare (memb_list[i].aru, high_aru)) {
				memb_list[i].received_flg = 0;
				if (i == commit_token->memb_index) {
					instance->my_received_flg = 0;
				}
			}
		}
	}

	commit_token->header.nodeid = instance->my_id.addr[0].nodeid;
	commit_token->memb_index += 1;
	assert (commit_token->memb_index <= commit_token->addr_entries);
	assert (commit_token->header.nodeid);
}

static void memb_state_commit_token_target_set (
	struct totemsrp_instance *instance,
	struct memb_commit_token *commit_token)
{
	struct srp_addr *addr;
	unsigned int i;

	addr = (struct srp_addr *)commit_token->end_of_commit_token;

	for (i = 0; i < instance->totem_config->interface_count; i++) {
		totemrrp_token_target_set (
			instance->totemrrp_handle,
			&addr[commit_token->memb_index %
				commit_token->addr_entries].addr[i],
			i);
	}
}

static int memb_state_commit_token_send (
	struct totemsrp_instance *instance,
	struct memb_commit_token *commit_token)
{
	struct iovec iovec;
	struct srp_addr *addr;
	struct memb_commit_token_memb_entry *memb_list;

	addr = (struct srp_addr *)commit_token->end_of_commit_token;
	memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries);

	commit_token->token_seq++;
	iovec.iov_base = commit_token;
	iovec.iov_len = sizeof (struct memb_commit_token) +
		((sizeof (struct srp_addr) +
			sizeof (struct memb_commit_token_memb_entry)) *
			commit_token->addr_entries);
	/*
	 * Make a copy for retransmission if necessary
	 */
	memcpy (instance->orf_token_retransmit, commit_token, iovec.iov_len);
	instance->orf_token_retransmit_size = iovec.iov_len;

	totemrrp_token_send (instance->totemrrp_handle,
		&iovec, 1);

	/*
	 * Request retransmission of the commit token in case it is lost
	 */
	reset_token_retransmit_timeout (instance);
	return (0);
}

static int memb_lowest_in_config (struct totemsrp_instance *instance)
{
	struct srp_addr token_memb[PROCESSOR_COUNT_MAX];
	int token_memb_entries = 0;
	int i;
	struct totem_ip_address *lowest_addr;

	memb_set_subtract (token_memb, &token_memb_entries,
		instance->my_proc_list, instance->my_proc_list_entries,
		instance->my_failed_list, instance->my_failed_list_entries);

	/*
	 * find representative by searching for smallest identifier
	 */
	lowest_addr = &token_memb[0].addr[0];
	for (i = 1; i < token_memb_entries; i++) {
		if (totemip_compare(lowest_addr, &token_memb[i].addr[0]) > 0) {
			totemip_copy (lowest_addr, &token_memb[i].addr[0]);
		}
	}
	return (totemip_compare (lowest_addr, &instance->my_id.addr[0]) == 0);
}

static int srp_addr_compare (const void *a, const void *b)
{
	const struct srp_addr *srp_a = (const struct srp_addr *)a;
	const struct srp_addr *srp_b = (const struct srp_addr *)b;

	return (totemip_compare (&srp_a->addr[0],
&srp_b->addr[0])); } static void memb_state_commit_token_create ( struct totemsrp_instance *instance, struct memb_commit_token *commit_token) { struct srp_addr token_memb[PROCESSOR_COUNT_MAX]; struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; int token_memb_entries = 0; log_printf (instance->totemsrp_log_level_notice, "Creating commit token because I am the rep.\n"); memb_set_subtract (token_memb, &token_memb_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); memset (commit_token, 0, sizeof (struct memb_commit_token)); commit_token->header.type = MESSAGE_TYPE_MEMB_COMMIT_TOKEN; commit_token->header.endian_detector = ENDIAN_LOCAL; commit_token->header.encapsulated = 0; commit_token->header.nodeid = instance->my_id.addr[0].nodeid; assert (commit_token->header.nodeid); totemip_copy(&commit_token->ring_id.rep, &instance->my_id.addr[0]); commit_token->ring_id.seq = instance->token_ring_id_seq + 4; /* * This qsort is necessary to ensure the commit token traverses * the ring in the proper order */ qsort (token_memb, token_memb_entries, sizeof (struct srp_addr), srp_addr_compare); commit_token->memb_index = 0; commit_token->addr_entries = token_memb_entries; addr = (struct srp_addr *)commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + commit_token->addr_entries); memcpy (addr, token_memb, token_memb_entries * sizeof (struct srp_addr)); memset (memb_list, 0, sizeof (struct memb_commit_token_memb_entry) * token_memb_entries); } static void memb_join_message_send (struct totemsrp_instance *instance) { struct memb_join memb_join; struct iovec iovec[3]; unsigned int iovs; memb_join.header.type = MESSAGE_TYPE_MEMB_JOIN; memb_join.header.endian_detector = ENDIAN_LOCAL; memb_join.header.encapsulated = 0; memb_join.header.nodeid = instance->my_id.addr[0].nodeid; assert (memb_join.header.nodeid); assert (srp_addr_equal (&instance->my_proc_list[0], &instance->my_proc_list[1]) == 0); memb_join.ring_seq = instance->my_ring_id.seq; memb_join.proc_list_entries = instance->my_proc_list_entries; memb_join.failed_list_entries = instance->my_failed_list_entries; srp_addr_copy (&memb_join.system_from, &instance->my_id); iovec[0].iov_base = &memb_join; iovec[0].iov_len = sizeof (struct memb_join); iovec[1].iov_base = &instance->my_proc_list; iovec[1].iov_len = instance->my_proc_list_entries * sizeof (struct srp_addr); if (instance->my_failed_list_entries == 0) { iovs = 2; } else { iovs = 3; iovec[2].iov_base = instance->my_failed_list; iovec[2].iov_len = instance->my_failed_list_entries * sizeof (struct srp_addr); } if (instance->totem_config->send_join_timeout) { usleep (random() % (instance->totem_config->send_join_timeout * 1000)); } totemrrp_mcast_flush_send ( instance->totemrrp_handle, iovec, iovs); } static void memb_merge_detect_transmit (struct totemsrp_instance *instance) { struct memb_merge_detect memb_merge_detect; struct iovec iovec[2]; memb_merge_detect.header.type = MESSAGE_TYPE_MEMB_MERGE_DETECT; memb_merge_detect.header.endian_detector = ENDIAN_LOCAL; memb_merge_detect.header.encapsulated = 0; memb_merge_detect.header.nodeid = instance->my_id.addr[0].nodeid; srp_addr_copy (&memb_merge_detect.system_from, &instance->my_id); assert (memb_merge_detect.header.nodeid); iovec[0].iov_base = &memb_merge_detect; iovec[0].iov_len = sizeof (struct memb_merge_detect) - sizeof (struct memb_ring_id); iovec[1].iov_base = &instance->my_ring_id; iovec[1].iov_len = sizeof (struct 
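memb_state_commit_token_create() above lays the commit token out as a fixed header followed by addr_entries srp_addr records and then addr_entries per-member entries, all addressed by pointer arithmetic from end_of_commit_token. A compilable sketch of that variable-length layout using the same technique; the types here are editorial stand-ins, not the totem wire format:

#include <stdlib.h>
#include <string.h>

struct toy_addr { unsigned int nodeid; };
struct toy_memb_entry { unsigned int aru; unsigned int high_delivered; };

struct toy_commit_token {
        unsigned int addr_entries;
        /* trailing data: addr_entries toy_addr records, then
         * addr_entries toy_memb_entry records, exactly like the
         * end_of_commit_token layout above */
        unsigned char end[];
};

static struct toy_commit_token *toy_token_create (unsigned int n)
{
        size_t size = sizeof (struct toy_commit_token) +
                n * (sizeof (struct toy_addr) + sizeof (struct toy_memb_entry));
        struct toy_commit_token *t = malloc (size);
        struct toy_addr *addr;
        struct toy_memb_entry *memb_list;

        if (t == NULL) {
                return NULL;
        }
        memset (t, 0, size);
        t->addr_entries = n;

        /* same pointer arithmetic as above: the member list starts
         * where the address list ends */
        addr = (struct toy_addr *)t->end;
        memb_list = (struct toy_memb_entry *)(addr + n);
        (void)memb_list;

        return t;
}

int main (void)
{
        struct toy_commit_token *t = toy_token_create (3);

        free (t);
        return 0;
}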
memb_ring_id); totemrrp_mcast_flush_send (instance->totemrrp_handle, iovec, 2); } static void memb_ring_id_create_or_load ( struct totemsrp_instance *instance, struct memb_ring_id *memb_ring_id) { int fd; int res; char filename[256]; snprintf (filename, sizeof(filename), "%s/ringid_%s", rundir, totemip_print (&instance->my_id.addr[0])); fd = open (filename, O_RDONLY, 0700); if (fd > 0) { res = read (fd, &memb_ring_id->seq, sizeof (unsigned long long)); assert (res == sizeof (unsigned long long)); close (fd); } else if (fd == -1 && errno == ENOENT) { memb_ring_id->seq = 0; umask(0); fd = open (filename, O_CREAT|O_RDWR, 0700); if (fd == -1) { log_printf (instance->totemsrp_log_level_warning, "Couldn't create %s %s\n", filename, strerror (errno)); } res = write (fd, &memb_ring_id->seq, sizeof (unsigned long long)); assert (res == sizeof (unsigned long long)); close (fd); } else { log_printf (instance->totemsrp_log_level_warning, "Couldn't open %s %s\n", filename, strerror (errno)); } totemip_copy(&memb_ring_id->rep, &instance->my_id.addr[0]); assert (!totemip_zero_check(&memb_ring_id->rep)); instance->token_ring_id_seq = memb_ring_id->seq; } static void memb_ring_id_set_and_store ( struct totemsrp_instance *instance, struct memb_ring_id *ring_id) { char filename[256]; int fd; int res; memcpy (&instance->my_ring_id, ring_id, sizeof (struct memb_ring_id)); snprintf (filename, sizeof(filename), "%s/ringid_%s", rundir, totemip_print (&instance->my_id.addr[0])); fd = open (filename, O_WRONLY, 0777); if (fd == -1) { fd = open (filename, O_CREAT|O_RDWR, 0777); } if (fd == -1) { log_printf (instance->totemsrp_log_level_warning, "Couldn't store new ring id %llx to stable storage (%s)\n", instance->my_ring_id.seq, strerror (errno)); assert (0); return; } log_printf (instance->totemsrp_log_level_notice, "Storing new sequence id for ring %llx\n", instance->my_ring_id.seq); //assert (fd > 0); res = write (fd, &instance->my_ring_id.seq, sizeof (unsigned long long)); assert (res == sizeof (unsigned long long)); close (fd); } int totemsrp_callback_token_create ( hdb_handle_t handle, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data) { struct token_callback_instance *callback_handle; struct totemsrp_instance *instance; unsigned int res; res = hdb_handle_get (&totemsrp_instance_database, handle, (void *)&instance); if (res != 0) { goto error_exit; } token_hold_cancel_send (instance); callback_handle = malloc (sizeof (struct token_callback_instance)); if (callback_handle == 0) { return (-1); } *handle_out = (void *)callback_handle; list_init (&callback_handle->list); callback_handle->callback_fn = callback_fn; callback_handle->data = (void *) data; callback_handle->callback_type = type; callback_handle->delete = delete; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: list_add (&callback_handle->list, &instance->token_callback_received_listhead); break; case TOTEM_CALLBACK_TOKEN_SENT: list_add (&callback_handle->list, &instance->token_callback_sent_listhead); break; } hdb_handle_put (&totemsrp_instance_database, handle); error_exit: return (0); } void totemsrp_callback_token_destroy (hdb_handle_t handle, void **handle_out) { struct token_callback_instance *h; if (*handle_out) { h = (struct token_callback_instance *)*handle_out; list_del (&h->list); free (h); h = NULL; *handle_out = 0; } } static void token_callbacks_execute ( struct totemsrp_instance *instance, enum totem_callback_token_type type) { 
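memb_ring_id_create_or_load() and memb_ring_id_set_and_store() above persist the ring sequence number across restarts so a rebooted node never reuses an old ring id. A runnable, simplified sketch of the load-or-create half (path and error handling reduced; the real code also records the representative address). Note that the sketch treats fd 0 as valid, unlike the `fd > 0` test above, which would misbehave if stdin were closed:

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Load the saved sequence number, or create the file with seq 0 the
 * first time this node runs, mirroring the open/ENOENT split above. */
static int ring_seq_load (const char *path, unsigned long long *seq)
{
        int fd = open (path, O_RDONLY);

        if (fd >= 0) {
                int ok = (read (fd, seq, sizeof (*seq)) == sizeof (*seq));

                close (fd);
                return ok ? 0 : -1;
        }
        if (errno != ENOENT) {
                return -1;      /* exists but unreadable */
        }
        *seq = 0;
        fd = open (path, O_CREAT | O_RDWR, 0700);
        if (fd == -1) {
                return -1;
        }
        if (write (fd, seq, sizeof (*seq)) != sizeof (*seq)) {
                close (fd);
                return -1;
        }
        close (fd);
        return 0;
}

int main (void)
{
        unsigned long long seq;

        if (ring_seq_load ("/tmp/ringid_example", &seq) == 0) {
                printf ("ring seq %llu\n", seq);
        }
        return 0;
}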
struct list_head *list; struct list_head *list_next; struct list_head *callback_listhead = 0; struct token_callback_instance *token_callback_instance; int res; int del; switch (type) { case TOTEM_CALLBACK_TOKEN_RECEIVED: callback_listhead = &instance->token_callback_received_listhead; break; case TOTEM_CALLBACK_TOKEN_SENT: callback_listhead = &instance->token_callback_sent_listhead; break; default: assert (0); } for (list = callback_listhead->next; list != callback_listhead; list = list_next) { token_callback_instance = list_entry (list, struct token_callback_instance, list); list_next = list->next; del = token_callback_instance->delete; if (del == 1) { list_del (list); } res = token_callback_instance->callback_fn ( token_callback_instance->callback_type, token_callback_instance->data); /* * This callback failed to execute, try it again on the next token */ if (res == -1 && del == 1) { list_add (list, callback_listhead); } else if (del) { free (token_callback_instance); } } } /* * Flow control functions */ static unsigned int backlog_get (struct totemsrp_instance *instance) { unsigned int backlog = 0; if (instance->memb_state == MEMB_STATE_OPERATIONAL) { backlog = queue_used (&instance->new_message_queue); } else if (instance->memb_state == MEMB_STATE_RECOVERY) { backlog = queue_used (&instance->retrans_message_queue); } return (backlog); } static int fcc_calculate ( struct totemsrp_instance *instance, struct orf_token *token) { unsigned int transmits_allowed; unsigned int backlog_calc; transmits_allowed = instance->totem_config->max_messages; if (transmits_allowed > instance->totem_config->window_size - token->fcc) { transmits_allowed = instance->totem_config->window_size - token->fcc; } instance->my_cbl = backlog_get (instance); /* * Only do backlog calculation if there is a backlog otherwise * we would result in div by zero */ if (token->backlog + instance->my_cbl - instance->my_pbl) { backlog_calc = (instance->totem_config->window_size * instance->my_pbl) / (token->backlog + instance->my_cbl - instance->my_pbl); if (backlog_calc > 0 && transmits_allowed > backlog_calc) { transmits_allowed = backlog_calc; } } return (transmits_allowed); } /* * don't overflow the RTR sort queue */ static void fcc_rtr_limit ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int *transmits_allowed) { assert ((QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed - instance->totem_config->window_size) >= 0); if (sq_lt_compare (instance->last_released + QUEUE_RTR_ITEMS_SIZE_MAX - *transmits_allowed - instance->totem_config->window_size, token->seq)) { *transmits_allowed = 0; } } static void fcc_token_update ( struct totemsrp_instance *instance, struct orf_token *token, unsigned int msgs_transmitted) { token->fcc += msgs_transmitted - instance->my_trc; token->backlog += instance->my_cbl - instance->my_pbl; assert (token->backlog >= 0); instance->my_trc = msgs_transmitted; instance->my_pbl = instance->my_cbl; } /* * Message Handlers */ struct timeval tv_old; /* * message handler called when TOKEN message type received */ static int message_handler_orf_token ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed) { char token_storage[1500]; char token_convert[1500]; struct orf_token *token = NULL; int forward_token; unsigned int transmits_allowed; unsigned int mcasted_retransmit; unsigned int mcasted_regular; unsigned int last_aru; #ifdef GIVEINFO struct timeval tv_current; struct timeval tv_diff; gettimeofday (&tv_current, NULL); timersub (&tv_current, &tv_old, 
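fcc_calculate() above caps transmissions three ways: the configured max_messages, the window space still free on the token (window_size - fcc), and a fair share proportional to this node's backlog. A runnable worked example of the same arithmetic; the constants in main are made up for illustration:

#include <stdio.h>

/* Same three-step clamp as fcc_calculate above. */
static unsigned int transmits_allowed (
        unsigned int max_messages, unsigned int window_size,
        unsigned int token_fcc, unsigned int token_backlog,
        unsigned int my_cbl /* current backlog */,
        unsigned int my_pbl /* previous backlog */)
{
        unsigned int allowed = max_messages;

        if (allowed > window_size - token_fcc) {
                allowed = window_size - token_fcc;
        }
        /* only divide when there is a backlog, avoiding div by zero */
        if (token_backlog + my_cbl - my_pbl) {
                unsigned int fair = (window_size * my_pbl) /
                        (token_backlog + my_cbl - my_pbl);

                if (fair > 0 && allowed > fair) {
                        allowed = fair;
                }
        }
        return allowed;
}

int main (void)
{
        /* max 17 per token; window 50 with 30 already used; cluster
         * backlog 40, our backlog steady at 10: fair share is
         * 50*10/40 = 12, which wins over the caps of 17 and 20 */
        printf ("%u\n", transmits_allowed (17, 50, 30, 40, 10, 10));
        return 0;
}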
&tv_diff); memcpy (&tv_old, &tv_current, sizeof (struct timeval)); log_printf (instance->totemsrp_log_level_notice, "Time since last token %0.4f ms\n", (((float)tv_diff.tv_sec) * 1000) + ((float)tv_diff.tv_usec) / 1000.0); #endif #ifdef TEST_DROP_ORF_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_ORF_TOKEN_PERCENTAGE) { return (0); } #endif if (endian_conversion_needed) { orf_token_endian_convert ((struct orf_token *)msg, (struct orf_token *)token_convert); msg = (struct orf_token *)token_convert; } /* * Make copy of token and retransmit list in case we have * to flush incoming messages from the kernel queue */ token = (struct orf_token *)token_storage; memcpy (token, msg, sizeof (struct orf_token)); memcpy (&token->rtr_list[0], (char *)msg + sizeof (struct orf_token), sizeof (struct rtr_item) * RETRANSMIT_ENTRIES_MAX); /* * Handle merge detection timeout */ if (token->seq == instance->my_last_seq) { start_merge_detect_timeout (instance); instance->my_seq_unchanged += 1; } else { cancel_merge_detect_timeout (instance); cancel_token_hold_retransmit_timeout (instance); instance->my_seq_unchanged = 0; } instance->my_last_seq = token->seq; #ifdef TEST_RECOVERY_MSG_COUNT if (instance->memb_state == MEMB_STATE_OPERATIONAL && token->seq > TEST_RECOVERY_MSG_COUNT) { return (0); } #endif totemrrp_recv_flush (instance->totemrrp_handle); /* * Determine if we should hold (in reality drop) the token */ instance->my_token_held = 0; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) && instance->my_seq_unchanged > instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } else if (!totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0]) && instance->my_seq_unchanged >= instance->totem_config->seqno_unchanged_const) { instance->my_token_held = 1; } /* * Hold onto token when there is no activity on ring and * this processor is the ring rep */ forward_token = 1; if (totemip_equal(&instance->my_ring_id.rep, &instance->my_id.addr[0])) { if (instance->my_token_held) { forward_token = 0; } } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_RECEIVED); switch (instance->memb_state) { case MEMB_STATE_COMMIT: /* Discard token */ break; case MEMB_STATE_OPERATIONAL: messages_free (instance, token->aru); case MEMB_STATE_GATHER: /* * DO NOT add break, we use different free mechanism in recovery state */ case MEMB_STATE_RECOVERY: last_aru = instance->my_last_aru; instance->my_last_aru = token->aru; /* * Discard tokens from another configuration */ if (memcmp (&token->ring_id, &instance->my_ring_id, sizeof (struct memb_ring_id)) != 0) { if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); /* discard token */ } /* * Discard retransmitted tokens */ if (sq_lte_compare (token->token_seq, instance->my_token_seq)) { /* * If this processor receives a retransmitted token, it is sure * the previous processor is still alive. As a result, it can * reset its token timeout. If some processor previous to that * has failed, it will eventually not execute a reset of the * token timeout, and will cause a reconfiguration to occur. 
*/ reset_token_timeout (instance); if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); /* discard token */ } transmits_allowed = fcc_calculate (instance, token); mcasted_retransmit = orf_token_rtr (instance, token, &transmits_allowed); fcc_rtr_limit (instance, token, &transmits_allowed); mcasted_regular = orf_token_mcast (instance, token, transmits_allowed); fcc_token_update (instance, token, mcasted_retransmit + mcasted_regular); if (sq_lt_compare (instance->my_aru, token->aru) || instance->my_id.addr[0].nodeid == token->aru_addr || token->aru_addr == 0) { token->aru = instance->my_aru; if (token->aru == token->seq) { token->aru_addr = 0; } else { token->aru_addr = instance->my_id.addr[0].nodeid; } } if (token->aru == last_aru && token->aru_addr != 0) { instance->my_aru_count += 1; } else { instance->my_aru_count = 0; } if (instance->my_aru_count > instance->totem_config->fail_to_recv_const && token->aru_addr != instance->my_id.addr[0].nodeid) { log_printf (instance->totemsrp_log_level_error, "FAILED TO RECEIVE\n"); // TODO if we fail to receive, it may be possible to end with a gather // state of proc == failed = 0 entries /* THIS IS A BIG TODO memb_set_merge (&token->aru_addr, 1, instance->my_failed_list, &instance->my_failed_list_entries); */ ring_state_restore (instance); memb_state_gather_enter (instance, 6); } else { instance->my_token_seq = token->token_seq; token->token_seq += 1; if (instance->memb_state == MEMB_STATE_RECOVERY) { /* * instance->my_aru == instance->my_high_seq_received means this processor * has recovered all messages it can recover * (ie: its retrans queue is empty) */ if (queue_is_empty (&instance->retrans_message_queue) == 0) { if (token->retrans_flg == 0) { token->retrans_flg = 1; instance->my_set_retrans_flg = 1; } } else if (token->retrans_flg == 1 && instance->my_set_retrans_flg) { token->retrans_flg = 0; } log_printf (instance->totemsrp_log_level_debug, "token retrans flag is %d my set retrans flag%d retrans queue empty %d count %d, aru %x\n", token->retrans_flg, instance->my_set_retrans_flg, queue_is_empty (&instance->retrans_message_queue), instance->my_retrans_flg_count, token->aru); if (token->retrans_flg == 0) { instance->my_retrans_flg_count += 1; } else { instance->my_retrans_flg_count = 0; } if (instance->my_retrans_flg_count == 2) { instance->my_install_seq = token->seq; } log_printf (instance->totemsrp_log_level_debug, "install seq %x aru %x high seq received %x\n", instance->my_install_seq, instance->my_aru, instance->my_high_seq_received); if (instance->my_retrans_flg_count >= 2 && instance->my_received_flg == 0 && sq_lte_compare (instance->my_install_seq, instance->my_aru)) { instance->my_received_flg = 1; instance->my_deliver_memb_entries = instance->my_trans_memb_entries; memcpy (instance->my_deliver_memb_list, instance->my_trans_memb_list, sizeof (struct totem_ip_address) * instance->my_trans_memb_entries); } if (instance->my_retrans_flg_count >= 3 && sq_lte_compare (instance->my_install_seq, token->aru)) { instance->my_rotation_counter += 1; } else { instance->my_rotation_counter = 0; } if (instance->my_rotation_counter == 2) { log_printf (instance->totemsrp_log_level_debug, "retrans flag count %x token aru %x install seq %x aru %x %x\n", instance->my_retrans_flg_count, token->aru, instance->my_install_seq, instance->my_aru, token->seq); memb_state_operational_enter (instance); instance->my_rotation_counter = 0; instance->my_retrans_flg_count = 0; } 
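The recovery branch above leaves for OPERATIONAL only after the token circulates quietly long enough: rotations with the retransmit flag clear are counted, the install point is fixed after two of them, and two further full rotations with everyone caught up trigger the transition. A small runnable sketch of that counting scheme, simplified to the counters alone (the aru/install_seq comparisons of the original are collapsed into a `caught_up` flag):

#include <stdio.h>

struct recovery_state {
        int retrans_clear_count;  /* rotations with retrans flag clear */
        int rotation_counter;     /* quiet rotations after catch-up */
};

/* Returns 1 when this simplified state machine would enter
 * OPERATIONAL. */
static int recovery_step (struct recovery_state *s,
        int retrans_flg, int caught_up)
{
        if (retrans_flg == 0) {
                s->retrans_clear_count += 1;
        } else {
                s->retrans_clear_count = 0;
        }
        if (s->retrans_clear_count >= 3 && caught_up) {
                s->rotation_counter += 1;
        } else {
                s->rotation_counter = 0;
        }
        return (s->rotation_counter == 2);
}

int main (void)
{
        struct recovery_state s = { 0, 0 };
        int done = 0;

        /* quiet token rotations: enters OPERATIONAL on the fourth */
        for (int i = 1; i <= 5 && !done; i++) {
                done = recovery_step (&s, 0, 1);
                printf ("rotation %d: %s\n", i,
                        done ? "operational" : "recovering");
        }
        return 0;
}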
} totemrrp_send_flush (instance->totemrrp_handle); token_send (instance, token, forward_token); #ifdef GIVEINFO gettimeofday (&tv_current, NULL); timersub (&tv_current, &tv_old, &tv_diff); memcpy (&tv_old, &tv_current, sizeof (struct timeval)); log_printf (instance->totemsrp_log_level_notice, "I held %0.4f ms\n", ((float)tv_diff.tv_usec) / 1000.0); #endif if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* * Deliver messages after token has been transmitted * to improve performance */ reset_token_timeout (instance); // REVIEWED reset_token_retransmit_timeout (instance); // REVIEWED if (totemip_equal(&instance->my_id.addr[0], &instance->my_ring_id.rep) && instance->my_token_held == 1) { start_token_hold_retransmit_timeout (instance); } token_callbacks_execute (instance, TOTEM_CALLBACK_TOKEN_SENT); } break; } if ((forward_token) && instance->use_heartbeat) { reset_heartbeat_timeout(instance); } else { cancel_heartbeat_timeout(instance); } return (0); } static void messages_deliver_to_app ( struct totemsrp_instance *instance, int skip, unsigned int end_point) { struct sort_queue_item *sort_queue_item_p; unsigned int i; int res; struct mcast *mcast_in; struct mcast mcast_header; unsigned int range = 0; int endian_conversion_required; unsigned int my_high_delivered_stored = 0; range = end_point - instance->my_high_delivered; if (range) { log_printf (instance->totemsrp_log_level_debug, "Delivering %x to %x\n", instance->my_high_delivered, end_point); } assert (range < 10240); my_high_delivered_stored = instance->my_high_delivered; /* * Deliver messages in order from rtr queue to pending delivery queue */ for (i = 1; i <= range; i++) { void *ptr = 0; /* * If out of range of sort queue, stop assembly */ res = sq_in_range (&instance->regular_sort_queue, my_high_delivered_stored + i); if (res == 0) { break; } res = sq_item_get (&instance->regular_sort_queue, my_high_delivered_stored + i, &ptr); /* * If hole, stop assembly */ if (res != 0 && skip == 0) { break; } instance->my_high_delivered = my_high_delivered_stored + i; if (res != 0) { continue; } sort_queue_item_p = ptr; mcast_in = sort_queue_item_p->iovec[0].iov_base; assert (mcast_in != (struct mcast *)0xdeadbeef); endian_conversion_required = 0; if (mcast_in->header.endian_detector != ENDIAN_LOCAL) { endian_conversion_required = 1; mcast_endian_convert (mcast_in, &mcast_header); } else { memcpy (&mcast_header, mcast_in, sizeof (struct mcast)); } /* * Skip messages not originated in instance->my_deliver_memb */ if (skip && memb_set_subset (&mcast_header.system_from, 1, instance->my_deliver_memb_list, instance->my_deliver_memb_entries) == 0) { instance->my_high_delivered = my_high_delivered_stored + i; continue; } /* * Message found */ log_printf (instance->totemsrp_log_level_debug, "Delivering MCAST message with seq %x to pending delivery queue\n", mcast_header.seq); /* * Message is locally originated multicast */ if (sort_queue_item_p->iov_len > 1 && sort_queue_item_p->iovec[0].iov_len == sizeof (struct mcast)) { instance->totemsrp_deliver_fn ( mcast_header.header.nodeid, &sort_queue_item_p->iovec[1], sort_queue_item_p->iov_len - 1, endian_conversion_required); } else { sort_queue_item_p->iovec[0].iov_len -= sizeof (struct mcast); sort_queue_item_p->iovec[0].iov_base = (char *)sort_queue_item_p->iovec[0].iov_base + sizeof (struct mcast); instance->totemsrp_deliver_fn ( mcast_header.header.nodeid, sort_queue_item_p->iovec, sort_queue_item_p->iov_len, 
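messages_deliver_to_app() above avoids a copy when the multicast header and payload share iovec[0]: it temporarily advances iov_base past the header, delivers, then restores the vector byte for byte. A runnable sketch of that strip-and-restore idiom; deliver_fn here is a stand-in for the totemsrp delivery callback:

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

struct toy_header { unsigned int seq; };

static void deliver_fn (struct iovec *iov, int iov_len)
{
        (void)iov_len;
        printf ("delivered: %.*s\n",
                (int)iov[0].iov_len, (char *)iov[0].iov_base);
}

/* Hide the header from the consumer without copying the payload,
 * then put the iovec back exactly as it was. */
static void deliver_stripped (struct iovec *iov, int iov_len)
{
        iov[0].iov_len -= sizeof (struct toy_header);
        iov[0].iov_base = (char *)iov[0].iov_base + sizeof (struct toy_header);

        deliver_fn (iov, iov_len);

        iov[0].iov_len += sizeof (struct toy_header);
        iov[0].iov_base = (char *)iov[0].iov_base - sizeof (struct toy_header);
}

int main (void)
{
        char frame[sizeof (struct toy_header) + 6];
        struct iovec iov = { .iov_base = frame, .iov_len = sizeof (frame) };

        memcpy (frame + sizeof (struct toy_header), "hello", 6);
        deliver_stripped (&iov, 1);
        return 0;
}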
endian_conversion_required); sort_queue_item_p->iovec[0].iov_len += sizeof (struct mcast); sort_queue_item_p->iovec[0].iov_base = (char *)sort_queue_item_p->iovec[0].iov_base - sizeof (struct mcast); } //TODO instance->stats_delv += 1; } } /* * recv message handler called when MCAST message type received */ static int message_handler_mcast ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed) { struct sort_queue_item sort_queue_item; struct sq *sort_queue; struct mcast mcast_header; if (endian_conversion_needed) { mcast_endian_convert (msg, &mcast_header); } else { memcpy (&mcast_header, msg, sizeof (struct mcast)); } if (mcast_header.header.encapsulated == MESSAGE_ENCAPSULATED) { sort_queue = &instance->recovery_sort_queue; } else { sort_queue = &instance->regular_sort_queue; } assert (msg_len < FRAME_SIZE_MAX); #ifdef TEST_DROP_MCAST_PERCENTAGE if (random()%100 < TEST_DROP_MCAST_PERCENTAGE) { printf ("dropping message %d\n", mcast_header.seq); return (0); } else { printf ("accepting message %d\n", mcast_header.seq); } #endif if (srp_addr_equal (&mcast_header.system_from, &instance->my_id) == 0) { cancel_token_retransmit_timeout (instance); } /* * If the message is foreign execute the switch below */ if (memcmp (&instance->my_ring_id, &mcast_header.ring_id, sizeof (struct memb_ring_id)) != 0) { switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge ( &mcast_header.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 7); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &mcast_header.system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&mcast_header.system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 8); return (0); } break; case MEMB_STATE_COMMIT: /* discard message */ break; case MEMB_STATE_RECOVERY: /* discard message */ break; } return (0); } log_printf (instance->totemsrp_log_level_debug, "Received ringid(%s:%lld) seq %x\n", totemip_print (&mcast_header.ring_id.rep), mcast_header.ring_id.seq, mcast_header.seq); /* * Add mcast message to rtr queue if not already in rtr queue * otherwise free io vectors */ if (msg_len > 0 && msg_len < FRAME_SIZE_MAX && sq_in_range (sort_queue, mcast_header.seq) && sq_item_inuse (sort_queue, mcast_header.seq) == 0) { /* * Allocate new multicast memory block */ // TODO LEAK sort_queue_item.iovec[0].iov_base = malloc (msg_len); if (sort_queue_item.iovec[0].iov_base == 0) { return (-1); /* error here is corrected by the algorithm */ } memcpy (sort_queue_item.iovec[0].iov_base, msg, msg_len); sort_queue_item.iovec[0].iov_len = msg_len; assert (sort_queue_item.iovec[0].iov_len > 0); assert (sort_queue_item.iovec[0].iov_len < FRAME_SIZE_MAX); sort_queue_item.iov_len = 1; if (sq_lt_compare (instance->my_high_seq_received, mcast_header.seq)) { instance->my_high_seq_received = mcast_header.seq; } sq_item_add (sort_queue, &sort_queue_item, mcast_header.seq); } update_aru (instance); if (instance->memb_state == MEMB_STATE_OPERATIONAL) { messages_deliver_to_app (instance, 0, instance->my_high_seq_received); } /* TODO remove from retrans message queue for old ring in recovery state */ return (0); } static int message_handler_memb_merge_detect ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed) { struct memb_merge_detect *memb_merge_detect = (struct memb_merge_detect *)msg; if (endian_conversion_needed) { 
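message_handler_mcast() above only allocates and stores a message when its seqno is inside the sort queue range and not already in use, which makes duplicates arriving via retransmission harmless. A runnable sketch of that guard over a toy window buffer; the flags array stands in for the assumed sq_in_range/sq_item_inuse semantics:

#include <stdio.h>

#define WINDOW 8

struct window {
        unsigned int base;      /* lowest seq the window accepts */
        int inuse[WINDOW];
        char payload[WINDOW][16];
};

/* Accept a message only if it falls in the window and is not already
 * stored; duplicates and out-of-window packets are dropped quietly. */
static int window_accept (struct window *w, unsigned int seq, const char *msg)
{
        if (seq < w->base || seq >= w->base + WINDOW) {
                return 0;                       /* out of range */
        }
        if (w->inuse[seq - w->base]) {
                return 0;                       /* duplicate retransmit */
        }
        w->inuse[seq - w->base] = 1;
        snprintf (w->payload[seq - w->base], 16, "%s", msg);
        return 1;
}

int main (void)
{
        struct window w = { .base = 10 };

        printf ("%d\n", window_accept (&w, 12, "a"));   /* 1: stored */
        printf ("%d\n", window_accept (&w, 12, "a"));   /* 0: duplicate */
        printf ("%d\n", window_accept (&w, 99, "b"));   /* 0: out of range */
        return 0;
}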
memb_merge_detect_endian_convert (msg, msg); } /* * do nothing if this is a merge detect from this configuration */ if (memcmp (&instance->my_ring_id, &memb_merge_detect->ring_id, sizeof (struct memb_ring_id)) == 0) { return (0); } /* * Execute merge operation */ switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: memb_set_merge (&memb_merge_detect->system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 9); break; case MEMB_STATE_GATHER: if (!memb_set_subset ( &memb_merge_detect->system_from, 1, instance->my_proc_list, instance->my_proc_list_entries)) { memb_set_merge (&memb_merge_detect->system_from, 1, instance->my_proc_list, &instance->my_proc_list_entries); memb_state_gather_enter (instance, 10); return (0); } break; case MEMB_STATE_COMMIT: /* do nothing in commit */ break; case MEMB_STATE_RECOVERY: /* do nothing in recovery */ break; } return (0); } static int memb_join_process ( struct totemsrp_instance *instance, struct memb_join *memb_join) { unsigned char *commit_token_storage[TOKEN_SIZE_MAX]; struct memb_commit_token *my_commit_token = (struct memb_commit_token *)commit_token_storage; struct srp_addr *proc_list; struct srp_addr *failed_list; proc_list = (struct srp_addr *)memb_join->end_of_memb_join; failed_list = proc_list + memb_join->proc_list_entries; if (memb_set_equal (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_equal (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { memb_consensus_set (instance, &memb_join->system_from); if (memb_consensus_agreed (instance) && memb_lowest_in_config (instance)) { memb_state_commit_token_create (instance, my_commit_token); memb_state_commit_enter (instance, my_commit_token); } else { return (0); } } else if (memb_set_subset (proc_list, memb_join->proc_list_entries, instance->my_proc_list, instance->my_proc_list_entries) && memb_set_subset (failed_list, memb_join->failed_list_entries, instance->my_failed_list, instance->my_failed_list_entries)) { return (0); } else if (memb_set_subset (&memb_join->system_from, 1, instance->my_failed_list, instance->my_failed_list_entries)) { return (0); } else { memb_set_merge (proc_list, memb_join->proc_list_entries, instance->my_proc_list, &instance->my_proc_list_entries); if (memb_set_subset ( &instance->my_id, 1, failed_list, memb_join->failed_list_entries)) { memb_set_merge ( &memb_join->system_from, 1, instance->my_failed_list, &instance->my_failed_list_entries); } else { memb_set_merge (failed_list, memb_join->failed_list_entries, instance->my_failed_list, &instance->my_failed_list_entries); } memb_state_gather_enter (instance, 11); return (1); /* gather entered */ } return (0); /* gather not entered */ } static void memb_join_endian_convert (struct memb_join *in, struct memb_join *out) { int i; struct srp_addr *in_proc_list; struct srp_addr *in_failed_list; struct srp_addr *out_proc_list; struct srp_addr *out_failed_list; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); out->proc_list_entries = swab32 (in->proc_list_entries); out->failed_list_entries = swab32 (in->failed_list_entries); out->ring_seq = swab64 (in->ring_seq); in_proc_list = (struct srp_addr *)in->end_of_memb_join; in_failed_list = in_proc_list + out->proc_list_entries; out_proc_list = (struct srp_addr 
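The *_endian_convert routines starting above all follow one pattern: the sender stamps each header with its local ENDIAN_LOCAL constant, and a receiver whose own constant reads back differently byte-swaps every multi-byte field. A runnable sketch of that detection scheme; 0xff22 is just an asymmetric constant chosen for the illustration, and the swab helpers are written out here rather than taken from swab.h:

#include <stdint.h>
#include <stdio.h>

#define ENDIAN_LOCAL 0xff22

struct toy_header {
        uint16_t endian_detector;
        uint32_t seq;
};

static uint16_t swab16 (uint16_t x)
{
        return (uint16_t)((x << 8) | (x >> 8));
}

static uint32_t swab32 (uint32_t x)
{
        return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
               ((x & 0x00ff0000u) >> 8)  | ((x & 0xff000000u) >> 24);
}

/* If the detector does not read back as ENDIAN_LOCAL, the message was
 * produced on an opposite-endian host: swap every multi-byte field. */
static void toy_header_convert (struct toy_header *h)
{
        if (h->endian_detector != ENDIAN_LOCAL) {
                h->endian_detector = swab16 (h->endian_detector);
                h->seq = swab32 (h->seq);
        }
}

int main (void)
{
        /* simulate a header written by an opposite-endian sender */
        struct toy_header h = { swab16 (ENDIAN_LOCAL), swab32 (7) };

        toy_header_convert (&h);
        printf ("seq %u\n", h.seq); /* prints: seq 7 */
        return 0;
}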
*)out->end_of_memb_join; out_failed_list = out_proc_list + out->proc_list_entries; for (i = 0; i < out->proc_list_entries; i++) { srp_addr_copy_endian_convert (&out_proc_list[i], &in_proc_list[i]); } for (i = 0; i < out->failed_list_entries; i++) { srp_addr_copy_endian_convert (&out_failed_list[i], &in_failed_list[i]); } } static void memb_commit_token_endian_convert (struct memb_commit_token *in, struct memb_commit_token *out) { int i; struct srp_addr *in_addr = (struct srp_addr *)in->end_of_commit_token; struct srp_addr *out_addr = (struct srp_addr *)out->end_of_commit_token; struct memb_commit_token_memb_entry *in_memb_list; struct memb_commit_token_memb_entry *out_memb_list; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->token_seq = swab32 (in->token_seq); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->retrans_flg = swab32 (in->retrans_flg); out->memb_index = swab32 (in->memb_index); out->addr_entries = swab32 (in->addr_entries); in_memb_list = (struct memb_commit_token_memb_entry *)(in_addr + out->addr_entries); out_memb_list = (struct memb_commit_token_memb_entry *)(out_addr + out->addr_entries); for (i = 0; i < out->addr_entries; i++) { srp_addr_copy_endian_convert (&out_addr[i], &in_addr[i]); /* * Only convert the memb entry if it has been set */ if (in_memb_list[i].ring_id.rep.family != 0) { totemip_copy_endian_convert (&out_memb_list[i].ring_id.rep, &in_memb_list[i].ring_id.rep); out_memb_list[i].ring_id.seq = swab64 (in_memb_list[i].ring_id.seq); out_memb_list[i].aru = swab32 (in_memb_list[i].aru); out_memb_list[i].high_delivered = swab32 (in_memb_list[i].high_delivered); out_memb_list[i].received_flg = swab32 (in_memb_list[i].received_flg); } } } static void orf_token_endian_convert (struct orf_token *in, struct orf_token *out) { int i; out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->seq = swab32 (in->seq); out->token_seq = swab32 (in->token_seq); out->aru = swab32 (in->aru); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->aru_addr = swab32(in->aru_addr); out->ring_id.seq = swab64 (in->ring_id.seq); out->fcc = swab32 (in->fcc); out->backlog = swab32 (in->backlog); out->retrans_flg = swab32 (in->retrans_flg); out->rtr_list_entries = swab32 (in->rtr_list_entries); for (i = 0; i < out->rtr_list_entries; i++) { totemip_copy_endian_convert(&out->rtr_list[i].ring_id.rep, &in->rtr_list[i].ring_id.rep); out->rtr_list[i].ring_id.seq = swab64 (in->rtr_list[i].ring_id.seq); out->rtr_list[i].seq = swab32 (in->rtr_list[i].seq); } } static void mcast_endian_convert (struct mcast *in, struct mcast *out) { out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 (in->header.nodeid); out->header.encapsulated = in->header.encapsulated; out->seq = swab32 (in->seq); out->this_seqno = swab32 (in->this_seqno); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); out->node_id = swab32 (in->node_id); out->guarantee = swab32 (in->guarantee); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); } static void memb_merge_detect_endian_convert ( struct memb_merge_detect *in, struct memb_merge_detect *out) { out->header.type = in->header.type; out->header.endian_detector = ENDIAN_LOCAL; out->header.nodeid = swab32 
(in->header.nodeid); totemip_copy_endian_convert(&out->ring_id.rep, &in->ring_id.rep); out->ring_id.seq = swab64 (in->ring_id.seq); srp_addr_copy_endian_convert (&out->system_from, &in->system_from); } static int message_handler_memb_join ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed) { struct memb_join *memb_join; struct memb_join *memb_join_convert = alloca (msg_len); int gather_entered; if (endian_conversion_needed) { memb_join = memb_join_convert; memb_join_endian_convert (msg, memb_join_convert); } else { memb_join = (struct memb_join *)msg; } if (instance->token_ring_id_seq < memb_join->ring_seq) { instance->token_ring_id_seq = memb_join->ring_seq; } switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: gather_entered = memb_join_process (instance, memb_join); if (gather_entered == 0) { memb_state_gather_enter (instance, 12); } break; case MEMB_STATE_GATHER: memb_join_process (instance, memb_join); break; case MEMB_STATE_COMMIT: if (memb_set_subset (&memb_join->system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { memb_join_process (instance, memb_join); memb_state_gather_enter (instance, 13); } break; case MEMB_STATE_RECOVERY: if (memb_set_subset (&memb_join->system_from, 1, instance->my_new_memb_list, instance->my_new_memb_entries) && memb_join->ring_seq >= instance->my_ring_id.seq) { ring_state_restore (instance); memb_join_process (instance, memb_join); memb_state_gather_enter (instance, 14); } break; } return (0); } static int message_handler_memb_commit_token ( struct totemsrp_instance *instance, void *msg, int msg_len, int endian_conversion_needed) { struct memb_commit_token *memb_commit_token_convert = alloca (msg_len); struct memb_commit_token *memb_commit_token; struct srp_addr sub[PROCESSOR_COUNT_MAX]; int sub_entries; struct srp_addr *addr; struct memb_commit_token_memb_entry *memb_list; log_printf (instance->totemsrp_log_level_debug, "got commit token\n"); if (endian_conversion_needed) { memb_commit_token = memb_commit_token_convert; memb_commit_token_endian_convert (msg, memb_commit_token); } else { memb_commit_token = (struct memb_commit_token *)msg; } addr = (struct srp_addr *)memb_commit_token->end_of_commit_token; memb_list = (struct memb_commit_token_memb_entry *)(addr + memb_commit_token->addr_entries); #ifdef TEST_DROP_COMMIT_TOKEN_PERCENTAGE if (random()%100 < TEST_DROP_COMMIT_TOKEN_PERCENTAGE) { return (0); } #endif switch (instance->memb_state) { case MEMB_STATE_OPERATIONAL: /* discard token */ break; case MEMB_STATE_GATHER: memb_set_subtract (sub, &sub_entries, instance->my_proc_list, instance->my_proc_list_entries, instance->my_failed_list, instance->my_failed_list_entries); if (memb_set_equal (addr, memb_commit_token->addr_entries, sub, sub_entries) && memb_commit_token->ring_id.seq > instance->my_ring_id.seq) { memb_state_commit_enter (instance, memb_commit_token); } break; case MEMB_STATE_COMMIT: /* * If retransmitted commit tokens are sent on this ring * filter them out and only enter recovery once the * commit token has traversed the array. 
This is
 * determined by:
 * memb_commit_token->memb_index == memb_commit_token->addr_entries
 */
        if (memb_commit_token->ring_id.seq == instance->my_ring_id.seq &&
            memb_commit_token->memb_index == memb_commit_token->addr_entries) {

            memb_state_recovery_enter (instance, memb_commit_token);
        }
        break;

    case MEMB_STATE_RECOVERY:
        if (totemip_equal (&instance->my_id.addr[0], &instance->my_ring_id.rep)) {
            log_printf (instance->totemsrp_log_level_notice,
                "Sending initial ORF token\n");

            // TODO convert instead of initiate
            orf_token_send_initial (instance);
            reset_token_timeout (instance); // REVIEWED
            reset_token_retransmit_timeout (instance); // REVIEWED
        }
        break;
    }
    return (0);
}

static int message_handler_token_hold_cancel (
    struct totemsrp_instance *instance,
    void *msg,
    int msg_len,
    int endian_conversion_needed)
{
    struct token_hold_cancel *token_hold_cancel = (struct token_hold_cancel *)msg;

    if (memcmp (&token_hold_cancel->ring_id, &instance->my_ring_id,
        sizeof (struct memb_ring_id)) == 0) {

        instance->my_seq_unchanged = 0;
        if (totemip_equal (&instance->my_ring_id.rep, &instance->my_id.addr[0])) {
            timer_function_token_retransmit_timeout (instance);
        }
    }
    return (0);
}

void main_deliver_fn (
    void *context,
    void *msg,
    int msg_len)
{
    struct totemsrp_instance *instance = (struct totemsrp_instance *)context;
    struct message_header *message_header = (struct message_header *)msg;

    if (msg_len < sizeof (struct message_header)) {
        log_printf (instance->totemsrp_log_level_security,
            "Received message is too short... ignoring %d.\n", msg_len);
        return;
    }

    if ((int)message_header->type >= totemsrp_message_handlers.count) {
        log_printf (instance->totemsrp_log_level_security,
            "Type of received message is wrong... ignoring %d.\n",
            (int)message_header->type);
        return;
    }

    /*
     * Handle incoming message
     */
    totemsrp_message_handlers.handler_functions[(int)message_header->type] (
        instance,
        msg,
        msg_len,
        message_header->endian_detector != ENDIAN_LOCAL);
}

void main_iface_change_fn (
    void *context,
    struct totem_ip_address *iface_addr,
    unsigned int iface_no)
{
    struct totemsrp_instance *instance = (struct totemsrp_instance *)context;

    totemip_copy (&instance->my_id.addr[iface_no], iface_addr);
    assert (instance->my_id.addr[iface_no].nodeid);

    totemip_copy (&instance->my_memb_list[0].addr[iface_no], iface_addr);

    if (instance->iface_changes++ == 0) {
        memb_ring_id_create_or_load (instance, &instance->my_ring_id);
        log_printf (
            instance->totemsrp_log_level_notice,
            "Created or loaded sequence id %lld.%s for this ring.\n",
            instance->my_ring_id.seq,
            totemip_print (&instance->my_ring_id.rep));
    }

    if (instance->iface_changes >= instance->totem_config->interface_count) {
        memb_state_gather_enter (instance, 15);
    }
}

void totemsrp_net_mtu_adjust (struct totem_config *totem_config)
{
    totem_config->net_mtu -= sizeof (struct mcast);
}

diff --git a/exec/totemsrp.h b/exec/totemsrp.h
index 1ec9f3e6..09ebcb8c 100644
--- a/exec/totemsrp.h
+++ b/exec/totemsrp.h
@@ -1,114 +1,114 @@
/*
 * Copyright (c) 2003-2005 MontaVista Software, Inc.
 * Copyright (c) 2006-2007, 2009 Red Hat, Inc.
 *
 * All rights reserved.
 *
 * Author: Steven Dake (sdake@redhat.com)
 *
 * This software licensed under BSD license, the text of which follows:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * - Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMSRP_H_DEFINED #define TOTEMSRP_H_DEFINED #include #include /* * Totem Single Ring Protocol * depends on poll abstraction, POSIX, IPV4 */ /* * Create a protocol instance */ int totemsrp_initialize ( hdb_handle_t poll_handle, hdb_handle_t *handle, struct totem_config *totem_config, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)); void totemsrp_finalize (hdb_handle_t handle); /* * Multicast a message */ int totemsrp_mcast ( hdb_handle_t handle, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int priority); /* * Return number of available messages that can be queued */ int totemsrp_avail (hdb_handle_t handle); int totemsrp_callback_token_create ( hdb_handle_t handle, void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data); void totemsrp_callback_token_destroy ( hdb_handle_t handle, void **handle_out); int totemsrp_new_msg_signal (hdb_handle_t handle); extern void totemsrp_net_mtu_adjust (struct totem_config *totem_config); extern int totemsrp_ifaces_get ( hdb_handle_t handle, unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count); extern unsigned int totemsrp_my_nodeid_get ( hdb_handle_t handle); extern int totemsrp_my_family_get ( hdb_handle_t handle); extern int totemsrp_ring_reenable ( hdb_handle_t handle); #endif /* TOTEMSRP_H_DEFINED */ diff --git a/exec/vsf_ykd.c b/exec/vsf_ykd.c index 23a17d6e..83353c60 100644 --- a/exec/vsf_ykd.c +++ b/exec/vsf_ykd.c @@ -1,555 +1,555 @@ /* * Copyright (c) 2005 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. 
* * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include LOGSYS_DECLARE_SUBSYS ("YKD", LOG_INFO); #define YKD_PROCESSOR_COUNT_MAX 32 enum ykd_header_values { YKD_HEADER_SENDSTATE = 0, YKD_HEADER_ATTEMPT = 1 }; enum ykd_mode { YKD_MODE_SENDSTATE = 0, YKD_MODE_ATTEMPT = 1 }; struct ykd_header { int id; }; struct ykd_session { unsigned int member_list[YKD_PROCESSOR_COUNT_MAX]; int member_list_entries; int session_id; }; struct ykd_state { struct ykd_session last_primary; struct ykd_session last_formed[YKD_PROCESSOR_COUNT_MAX]; int last_formed_entries; struct ykd_session ambiguous_sessions[YKD_PROCESSOR_COUNT_MAX]; int ambiguous_sessions_entries; int session_id; }; struct state_received { unsigned int nodeid; int received; struct ykd_state ykd_state; }; struct ykd_state ykd_state; static hdb_handle_t ykd_group_handle; static struct state_received state_received_confchg[YKD_PROCESSOR_COUNT_MAX]; static int state_received_confchg_entries; static struct state_received state_received_process[YKD_PROCESSOR_COUNT_MAX]; static int state_received_process_entries; static enum ykd_mode ykd_mode; static unsigned int view_list[YKD_PROCESSOR_COUNT_MAX]; static int view_list_entries; static int session_id_max; static struct ykd_session *last_primary_max; static struct ykd_session ambiguous_sessions_max[YKD_PROCESSOR_COUNT_MAX]; static int ambiguous_sessions_max_entries; static int primary_designated = 0; static struct memb_ring_id ykd_ring_id; static void *ykd_attempt_send_callback_token_handle = 0; static void *ykd_state_send_callback_token_handle = 0; static struct corosync_api_v1 *api; static void (*ykd_primary_callback_fn) ( const unsigned int *view_list, size_t view_list_entries, int primary_designated, struct memb_ring_id *ring_id) = NULL; 
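/*
 * How the state above drives the YKD dynamic linear voting algorithm:
 * each processor remembers the last primary component it belonged to
 * (last_primary) and any sessions whose outcome it could not determine
 * (ambiguous_sessions).  A new view may only become primary if it holds
 * at least half of the members of the last primary and of every
 * ambiguous session.  A worked instance of the intersection test that
 * subquorum() performs below:
 *
 *      // last primary was {1,2,3,4,5}, the new view is {1,2,3}:
 *      // intersections = 3, remainder = 5 - 3 = 2, 3 > 2 -> accepted
 *      // a 2-of-5 intersection gives 2 < 3 -> rejected
 *      // a 2-of-4 intersection is the "even split" case -> accepted
 *
 * All of this state is bounded by YKD_PROCESSOR_COUNT_MAX (32), so a
 * full ykd_state is on the order of 10 kB and is multicast whole during
 * the SENDSTATE phase.
 */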
static void ykd_state_init (void)
{
    ykd_state.session_id = 0;
    ykd_state.last_formed_entries = 0;
    ykd_state.ambiguous_sessions_entries = 0;
    ykd_state.last_primary.session_id = 0;
    ykd_state.last_primary.member_list_entries = 0;
}

static int ykd_state_send_msg (enum totem_callback_token_type type,
    const void *context)
{
    struct iovec iovec[2];
    struct ykd_header header;
    int res;

    header.id = YKD_HEADER_SENDSTATE;

    iovec[0].iov_base = (char *)&header;
    iovec[0].iov_len = sizeof (struct ykd_header);
    iovec[1].iov_base = (char *)&ykd_state;
    iovec[1].iov_len = sizeof (struct ykd_state);

    res = api->tpg_joined_mcast (ykd_group_handle, iovec, 2, TOTEM_AGREED);

    return (res);
}

static void ykd_state_send (void)
{
    api->totem_callback_token_create (
        &ykd_state_send_callback_token_handle,
        TOTEM_CALLBACK_TOKEN_SENT,
        1, /* delete after callback */
        ykd_state_send_msg,
        NULL);
}

static int ykd_attempt_send_msg (enum totem_callback_token_type type,
    const void *context)
{
    struct iovec iovec;
    struct ykd_header header;
    int res;

    header.id = YKD_HEADER_ATTEMPT; /* an attempt message, not a state message */

    iovec.iov_base = (char *)&header;
    iovec.iov_len = sizeof (struct ykd_header);

    res = api->tpg_joined_mcast (ykd_group_handle, &iovec, 1, TOTEM_AGREED);

    return (res);
}

static void ykd_attempt_send (void)
{
    api->totem_callback_token_create (
        &ykd_attempt_send_callback_token_handle,
        TOTEM_CALLBACK_TOKEN_SENT,
        1, /* delete after callback */
        ykd_attempt_send_msg,
        NULL);
}

static void compute (void)
{
    int i;
    int j;

    session_id_max = 0;
    last_primary_max = &state_received_process[0].ykd_state.last_primary;
    ambiguous_sessions_max_entries = 0;

    for (i = 0; i < state_received_process_entries; i++) {
        /*
         * Calculate maximum session id
         */
        if (state_received_process[i].ykd_state.session_id > session_id_max) {
            session_id_max = state_received_process[i].ykd_state.session_id;
        }

        /*
         * Calculate maximum primary id
         */
        if (state_received_process[i].ykd_state.last_primary.session_id > last_primary_max->session_id) {
            last_primary_max = &state_received_process[i].ykd_state.last_primary;
        }

        /*
         * generate the maximum ambiguous sessions list
         */
        for (j = 0; j < state_received_process[i].ykd_state.ambiguous_sessions_entries; j++) {
            if (state_received_process[i].ykd_state.ambiguous_sessions[j].session_id > last_primary_max->session_id) {
                memcpy (&ambiguous_sessions_max[ambiguous_sessions_max_entries],
                    &state_received_process[i].ykd_state.ambiguous_sessions[j],
                    sizeof (struct ykd_session));
                ambiguous_sessions_max_entries += 1;
            }
        }
    }
}

static int subquorum (
    unsigned int *member_list,
    int member_list_entries,
    struct ykd_session *session)
{
    int intersections = 0;
    int i;
    int j;

    for (i = 0; i < member_list_entries; i++) {
        for (j = 0; j < session->member_list_entries; j++) {
            if (member_list[i] == session->member_list[j]) {
                intersections += 1;
            }
        }
    }

    /*
     * even split
     */
    if (intersections == (session->member_list_entries - intersections)) {
        return (1);
    } else

    /*
     * majority split
     */
    if (intersections > (session->member_list_entries - intersections)) {
        return (1);
    }
    return (0);
}

static int decide (void)
{
    int i;

    /*
     * Determine if there is a subquorum
     */
    if (subquorum (view_list, view_list_entries, last_primary_max) == 0) {
        return (0);
    }

    for (i = 0; i < ambiguous_sessions_max_entries; i++) {
        if (subquorum (view_list, view_list_entries, &ambiguous_sessions_max[i]) == 0) {
            return (0);
        }
    }
    return (1);
}

static void ykd_session_endian_convert (struct ykd_session *ykd_session)
{
    int i;

    ykd_session->member_list_entries =
        swab32 (ykd_session->member_list_entries);
    ykd_session->session_id = swab32
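/*
 * ykd_state_send() and ykd_attempt_send() above never multicast directly:
 * they register a one-shot TOTEM_CALLBACK_TOKEN_SENT callback, so the
 * actual tpg_joined_mcast() happens only once this processor has forwarded
 * the token, i.e. once it is allowed to originate messages on the ring.
 * The same pattern for a hypothetical service (my_msg, my_group_handle and
 * my_token_handle are illustrative names):
 *
 *      static int my_send_msg (enum totem_callback_token_type type,
 *          const void *context)
 *      {
 *          struct iovec iov = {
 *              .iov_base = &my_msg, .iov_len = sizeof (my_msg)
 *          };
 *
 *          return (api->tpg_joined_mcast (my_group_handle, &iov, 1,
 *              TOTEM_AGREED));
 *      }
 *
 *      api->totem_callback_token_create (&my_token_handle,
 *          TOTEM_CALLBACK_TOKEN_SENT,
 *          1,  // delete after the first callback fires
 *          my_send_msg, NULL);
 */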
(ykd_session->session_id); for (i = 0; i < ykd_session->member_list_entries; i++) { ykd_session->member_list[i] = swab32 (ykd_session->member_list[i]); } } static void ykd_state_endian_convert (struct ykd_state *ykd_state) { int i; ykd_session_endian_convert (&ykd_state->last_primary); ykd_state->last_formed_entries = swab32 (ykd_state->last_formed_entries); ykd_state->ambiguous_sessions_entries = swab32 (ykd_state->ambiguous_sessions_entries); ykd_state->session_id = swab32 (ykd_state->session_id); for (i = 0; i < ykd_state->last_formed_entries; i++) { ykd_session_endian_convert (&ykd_state->last_formed[i]); } for (i = 0; i < ykd_state->ambiguous_sessions_entries; i++) { ykd_session_endian_convert (&ykd_state->ambiguous_sessions[i]); } } static void ykd_deliver_fn ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required) { int all_received = 1; int state_position = 0; int i; char *msg_state = (char *)(iovec->iov_base) + sizeof (struct ykd_header); /* * If this is a localhost address, this node is always primary */ #ifdef TODO if (totemip_localhost_check (source_addr)) { log_printf (LOG_LEVEL_NOTICE, "This processor is within the primary component.\n"); primary_designated = 1; ykd_primary_callback_fn ( view_list, view_list_entries, primary_designated, &ykd_ring_id); return; } #endif if (endian_conversion_required && (iovec->iov_len > sizeof (struct ykd_header))) { ykd_state_endian_convert ((struct ykd_state *)msg_state); } /* * Set completion for source_addr's address */ for (state_position = 0; state_position < state_received_confchg_entries; state_position++) { if (nodeid == state_received_process[state_position].nodeid) { /* * State position contains the address of the state to modify * This may be used later by the other algorithms */ state_received_process[state_position].received = 1; break; } } /* * Test if all nodes have submitted their state data */ for (i = 0; i < state_received_confchg_entries; i++) { if (state_received_process[i].received == 0) { all_received = 0; } } switch (ykd_mode) { case YKD_MODE_SENDSTATE: assert (iovec->iov_len > sizeof (struct ykd_header)); /* * Copy state information for the sending processor */ memcpy (&state_received_process[state_position].ykd_state, msg_state, sizeof (struct ykd_state)); /* * Try to form a component */ if (all_received) { for (i = 0; i < state_received_confchg_entries; i++) { state_received_process[i].received = 0; } ykd_mode = YKD_MODE_ATTEMPT; // TODO resolve optimizes for failure conditions during ykd calculation // resolve(); compute(); if (decide ()) { ykd_state.session_id = session_id_max + 1; memcpy (ykd_state.ambiguous_sessions[ykd_state.ambiguous_sessions_entries].member_list, view_list, sizeof (unsigned int) * view_list_entries); ykd_state.ambiguous_sessions[ykd_state.ambiguous_sessions_entries].member_list_entries = view_list_entries; ykd_state.ambiguous_sessions_entries += 1; ykd_attempt_send(); } } break; case YKD_MODE_ATTEMPT: if (all_received) { log_printf (LOG_LEVEL_NOTICE, "This processor is within the primary component.\n"); primary_designated = 1; ykd_primary_callback_fn ( view_list, view_list_entries, primary_designated, &ykd_ring_id); memcpy (ykd_state.last_primary.member_list, view_list, sizeof (view_list)); ykd_state.last_primary.member_list_entries = view_list_entries; ykd_state.last_primary.session_id = ykd_state.session_id; ykd_state.ambiguous_sessions_entries = 0; } break; } } int first_run = 1; static void ykd_confchg_fn ( enum 
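/*
 * The two-phase agreement that ykd_deliver_fn above implements, in
 * outline:
 *
 *      YKD_MODE_SENDSTATE: every member multicasts its ykd_state; once a
 *          state has arrived from every member of the view, each node
 *          runs compute() and decide() on identical inputs and, when
 *          decide() passes, records the view as an ambiguous session and
 *          multicasts an attempt message.
 *      YKD_MODE_ATTEMPT: once an attempt has arrived from every member,
 *          the view is declared primary through ykd_primary_callback_fn()
 *          and last_primary is updated.
 *
 * Virtual synchrony delivers the same messages in the same order to every
 * member, so all members reach the same verdict with no extra round trips.
 */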
totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { int i; if (configuration_type != TOTEM_CONFIGURATION_REGULAR) { return; } memcpy (&ykd_ring_id, ring_id, sizeof (struct memb_ring_id)); if (first_run) { ykd_state.last_primary.member_list[0] = api->totem_nodeid_get(); ykd_state.last_primary.member_list_entries = 1; ykd_state.last_primary.session_id = 0; first_run = 0; } memcpy (view_list, member_list, member_list_entries * sizeof (unsigned int)); view_list_entries = member_list_entries; ykd_mode = YKD_MODE_SENDSTATE; primary_designated = 0; ykd_primary_callback_fn ( view_list, view_list_entries, primary_designated, &ykd_ring_id); memset (&state_received_confchg, 0, sizeof (state_received_confchg)); for (i = 0; i < member_list_entries; i++) { state_received_confchg[i].nodeid = member_list[i]; state_received_confchg[i].received = 0; } memcpy (state_received_process, state_received_confchg, sizeof (state_received_confchg)); state_received_confchg_entries = member_list_entries; state_received_process_entries = member_list_entries; ykd_state_send (); } struct corosync_tpg_group ykd_group = { .group = "ykd", .group_len = 3 }; static void ykd_init ( struct corosync_api_v1 *corosync_api, quorum_set_quorate_fn_t set_primary) { ykd_primary_callback_fn = set_primary; api = corosync_api; api->tpg_init ( &ykd_group_handle, ykd_deliver_fn, ykd_confchg_fn); api->tpg_join ( ykd_group_handle, &ykd_group, 1); ykd_state_init (); } /* * lcrso object definition */ static struct quorum_services_api_ver1 vsf_ykd_iface_ver0 = { .init = ykd_init, }; static struct lcr_iface corosync_vsf_ykd_ver0[1] = { { .name = "corosync_quorum_ykd", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = NULL, .destructor = NULL, .interfaces = (void **)(void *)&vsf_ykd_iface_ver0, } }; static struct lcr_comp vsf_ykd_comp_ver0 = { .iface_count = 1, .ifaces = corosync_vsf_ykd_ver0 }; __attribute__ ((constructor)) static void vsf_ykd_comp_register (void) { lcr_component_register (&vsf_ykd_comp_ver0); } diff --git a/include/corosync/coroipcc.h b/include/corosync/coroipcc.h index 81802ba4..fe91ed51 100644 --- a/include/corosync/coroipcc.h +++ b/include/corosync/coroipcc.h @@ -1,134 +1,134 @@ /* * Copyright (c) 2002-2003 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef COROIPC_H_DEFINED #define COROIPC_H_DEFINED #include #include #include #include #include /* Debug macro */ #ifdef DEBUG #define DPRINT(s) printf s #else #define DPRINT(s) #endif #ifdef SO_NOSIGPIPE #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif void socket_nosigpipe(int s); #else #define socket_nosigpipe(s) #endif struct saHandleDatabase { unsigned int handleCount; struct saHandle *handles; pthread_mutex_t mutex; void (*handleInstanceDestructor) (void *); }; cs_error_t coroipcc_service_connect ( const char *socket_name, enum service_types service, void **ipc_context); cs_error_t coroipcc_service_disconnect ( void *ipc_context); int coroipcc_fd_get ( void *ipc_context); int coroipcc_dispatch_recv ( void *ipc_context, void *buf, size_t buflen, int timeout); int coroipcc_dispatch_flow_control_get ( void *ipc_context); cs_error_t coroipcc_msg_send_reply_receive ( void *ipc_context, const struct iovec *iov, - int iov_len, + unsigned int iov_len, void *res_msg, int res_len); cs_error_t coroipcc_msg_send_reply_receive_in_buf ( void *ipc_context, const struct iovec *iov, - int iov_len, + unsigned int iov_len, void **res_msg); cs_error_t saHandleCreate ( struct saHandleDatabase *handleDatabase, int instanceSize, uint64_t *handleOut); cs_error_t saHandleDestroy ( struct saHandleDatabase *handleDatabase, uint64_t handle); cs_error_t saHandleInstanceGet ( struct saHandleDatabase *handleDatabase, uint64_t handle, void **instance); cs_error_t saHandleInstancePut ( struct saHandleDatabase *handleDatabase, uint64_t handle); #define offset_of(type,member) (int)(&(((type *)0)->member)) #endif /* COROIPC_H_DEFINED */ diff --git a/include/corosync/cpg.h b/include/corosync/cpg.h index 348ddb1d..5403eaa7 100644 --- a/include/corosync/cpg.h +++ b/include/corosync/cpg.h @@ -1,201 +1,201 @@ /* * Copyright (c) 2006-2008 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfi@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef COROSYNC_CPG_H_DEFINED #define COROSYNC_CPG_H_DEFINED #include #include /** * @addtogroup cpg_corosync * * @{ */ typedef uint64_t cpg_handle_t; typedef enum { CPG_TYPE_UNORDERED, /* not implemented */ CPG_TYPE_FIFO, /* same as agreed */ CPG_TYPE_AGREED, CPG_TYPE_SAFE /* not implemented */ } cpg_guarantee_t; typedef enum { CPG_FLOW_CONTROL_DISABLED, /* flow control is disabled - new messages may be sent */ CPG_FLOW_CONTROL_ENABLED /* flow control is enabled - new messages should not be sent */ } cpg_flow_control_state_t; typedef enum { CPG_REASON_JOIN = 1, CPG_REASON_LEAVE = 2, CPG_REASON_NODEDOWN = 3, CPG_REASON_NODEUP = 4, CPG_REASON_PROCDOWN = 5 } cpg_reason_t; struct cpg_address { uint32_t nodeid; uint32_t pid; uint32_t reason; }; #define CPG_MAX_NAME_LENGTH 128 struct cpg_name { uint32_t length; char value[CPG_MAX_NAME_LENGTH]; }; #define CPG_MEMBERS_MAX 128 typedef void (*cpg_deliver_fn_t) ( cpg_handle_t handle, struct cpg_name *group_name, uint32_t nodeid, uint32_t pid, void *msg, int msg_len); typedef void (*cpg_confchg_fn_t) ( cpg_handle_t handle, struct cpg_name *group_name, struct cpg_address *member_list, int member_list_entries, struct cpg_address *left_list, int left_list_entries, struct cpg_address *joined_list, int joined_list_entries); typedef void (*cpg_groups_get_fn_t) ( cpg_handle_t handle, uint32_t group_num, uint32_t group_total, struct cpg_name *group_name, struct cpg_address *member_list, int member_list_entries); typedef struct { cpg_deliver_fn_t cpg_deliver_fn; cpg_confchg_fn_t cpg_confchg_fn; cpg_groups_get_fn_t cpg_groups_get_fn; } cpg_callbacks_t; /** @} */ /* * Create a new cpg connection */ cs_error_t cpg_initialize ( cpg_handle_t *handle, cpg_callbacks_t *callbacks); /* * Close the cpg handle */ cs_error_t cpg_finalize ( cpg_handle_t handle); /* * Get a file descriptor on which to poll. cpg_handle_t is NOT a * file descriptor and may not be used directly. */ cs_error_t cpg_fd_get ( cpg_handle_t handle, int *fd); /* * Get and set contexts for a CPG handle */ cs_error_t cpg_context_get ( cpg_handle_t handle, void **context); cs_error_t cpg_context_set ( cpg_handle_t handle, void *context); /* * Dispatch messages and configuration changes */ cs_error_t cpg_dispatch ( cpg_handle_t handle, cs_dispatch_flags_t dispatch_types); /* * Join one or more groups. * messages multicasted with cpg_mcast_joined will be sent to every * group that has been joined on handle handle. Any message multicasted * to a group that has been previously joined will be delivered in cpg_dispatch */ cs_error_t cpg_join ( cpg_handle_t handle, struct cpg_name *group); /* * Leave one or more groups */ cs_error_t cpg_leave ( cpg_handle_t handle, struct cpg_name *group); /* * Multicast to groups joined with cpg_join. 
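 *
 * A minimal caller sketch (illustrative; error handling elided, "test"
 * is an arbitrary example group name):
 *
 *      struct cpg_name group = { .length = 4, .value = "test" };
 *      struct iovec iov = { .iov_base = buf, .iov_len = buf_len };
 *
 *      cpg_initialize (&handle, &callbacks);
 *      cpg_join (handle, &group);
 *      cpg_mcast_joined (handle, CPG_TYPE_AGREED, &iov, 1);
 *      cpg_dispatch (handle, CS_DISPATCH_ALL);
 *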
* The iovec described by iovec will be multicasted to all groups joined with * the cpg_join interface for handle. */ cs_error_t cpg_mcast_joined ( cpg_handle_t handle, cpg_guarantee_t guarantee, struct iovec *iovec, - int iov_len); + unsigned int iov_len); /* * Get membership information from cpg */ cs_error_t cpg_membership_get ( cpg_handle_t handle, struct cpg_name *groupName, struct cpg_address *member_list, int *member_list_entries); cs_error_t cpg_local_get ( cpg_handle_t handle, unsigned int *local_nodeid); cs_error_t cpg_groups_get ( cpg_handle_t handle, unsigned int *num_groups); cs_error_t cpg_flow_control_state_get ( cpg_handle_t handle, cpg_flow_control_state_t *flow_control_enabled); #endif /* COROSYNC_CPG_H_DEFINED */ diff --git a/include/corosync/engine/coroapi.h b/include/corosync/engine/coroapi.h index c77db7cc..f08e84b2 100644 --- a/include/corosync/engine/coroapi.h +++ b/include/corosync/engine/coroapi.h @@ -1,602 +1,602 @@ /* * Copyright (c) 2008, 2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #ifndef COROAPI_H_DEFINED #define COROAPI_H_DEFINED #include #ifdef COROSYNC_BSD #include #endif #include typedef void * corosync_timer_handle_t; struct corosync_tpg_group { const void *group; int group_len; }; #define TOTEMIP_ADDRLEN (sizeof(struct in6_addr)) #define PROCESSOR_COUNT_MAX 384 #define INTERFACE_MAX 2 #ifndef MESSAGE_SIZE_MAX #define MESSAGE_SIZE_MAX 1024*1024 /* (1MB) */ #endif /* MESSAGE_SIZE_MAX */ #ifndef MESSAGE_QUEUE_MAX #define MESSAGE_QUEUE_MAX MESSAGE_SIZE_MAX / totem_config->net_mtu #endif /* MESSAGE_QUEUE_MAX */ #define TOTEM_AGREED 0 #define TOTEM_SAFE 1 #define MILLI_2_NANO_SECONDS 1000000ULL #if !defined(TOTEM_IP_ADDRESS) struct totem_ip_address { unsigned int nodeid; unsigned short family; unsigned char addr[TOTEMIP_ADDRLEN]; } __attribute__((packed)); #endif #if !defined(MEMB_RING_ID) struct memb_ring_id { struct totem_ip_address rep; unsigned long long seq; } __attribute__((packed)); #endif #if !defined(TOTEM_CONFIGURATION_TYPE) enum totem_configuration_type { TOTEM_CONFIGURATION_REGULAR, TOTEM_CONFIGURATION_TRANSITIONAL }; #endif #if !defined(TOTEM_CALLBACK_TOKEN_TYPE) enum totem_callback_token_type { TOTEM_CALLBACK_TOKEN_RECEIVED = 1, TOTEM_CALLBACK_TOKEN_SENT = 2 }; #endif enum cs_lib_flow_control { CS_LIB_FLOW_CONTROL_REQUIRED = 1, CS_LIB_FLOW_CONTROL_NOT_REQUIRED = 2 }; #define corosync_lib_flow_control cs_lib_flow_control #define COROSYNC_LIB_FLOW_CONTROL_REQUIRED CS_LIB_FLOW_CONTROL_REQUIRED #define COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED CS_LIB_FLOW_CONTROL_NOT_REQUIRED enum cs_lib_allow_inquorate { CS_LIB_DISALLOW_INQUORATE = 0, /* default */ CS_LIB_ALLOW_INQUORATE = 1 }; #if !defined (COROSYNC_FLOW_CONTROL_STATE) enum cs_flow_control_state { CS_FLOW_CONTROL_STATE_DISABLED, CS_FLOW_CONTROL_STATE_ENABLED }; #define corosync_flow_control_state cs_flow_control_state #define CS_FLOW_CONTROL_STATE_DISABLED CS_FLOW_CONTROL_STATE_DISABLED #define CS_FLOW_CONTROL_STATE_ENABLED CS_FLOW_CONTROL_STATE_ENABLED #endif /* COROSYNC_FLOW_CONTROL_STATE */ typedef enum { COROSYNC_FATAL_ERROR_EXIT = -1, COROSYNC_LIBAIS_SOCKET = -6, COROSYNC_LIBAIS_BIND = -7, COROSYNC_READKEY = -8, COROSYNC_INVALID_CONFIG = -9, COROSYNC_DYNAMICLOAD = -12, COROSYNC_OUT_OF_MEMORY = -15, COROSYNC_FATAL_ERR = -16 } cs_fatal_error_t; #define corosync_fatal_error_t cs_fatal_error_t; #ifndef OBJECT_PARENT_HANDLE #define OBJECT_PARENT_HANDLE 0xffffffff00000000ULL struct object_valid { char *object_name; int object_len; }; struct object_key_valid { char *key_name; int key_len; int (*validate_callback) (const void *key, int key_len, const void *value, int value_len); }; /* deprecated */ typedef enum { OBJECT_TRACK_DEPTH_ONE, OBJECT_TRACK_DEPTH_RECURSIVE } object_track_depth_t; typedef enum { OBJECT_KEY_CREATED, OBJECT_KEY_REPLACED, OBJECT_KEY_DELETED } object_change_type_t; typedef enum { OBJDB_RELOAD_NOTIFY_START, OBJDB_RELOAD_NOTIFY_END, OBJDB_RELOAD_NOTIFY_FAILED } objdb_reload_notify_type_t; typedef void (*object_key_change_notify_fn_t)( object_change_type_t change_type, hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const void *object_name_pt, int object_name_len, const void *key_name_pt, int key_len, const void *key_value_pt, int key_value_len, void *priv_data_pt); typedef void (*object_create_notify_fn_t) ( hdb_handle_t parent_object_handle, hdb_handle_t object_handle, const uint8_t *name_pt, int name_len, void *priv_data_pt); typedef void (*object_destroy_notify_fn_t) ( hdb_handle_t parent_object_handle, const uint8_t *name_pt, int name_len, void *priv_data_pt); typedef 
void (*object_notify_callback_fn_t)( hdb_handle_t object_handle, const void *key_name, int key_len, const void *value, int value_len, object_change_type_t type, void * priv_data_pt); typedef void (*object_reload_notify_fn_t) ( objdb_reload_notify_type_t, int flush, void *priv_data_pt); #endif /* OBJECT_PARENT_HANDLE_DEFINED */ #ifndef QUORUM_H_DEFINED typedef void (*quorum_callback_fn_t) (int quorate, void *context); struct quorum_callin_functions { int (*quorate) (void); int (*register_callback) (quorum_callback_fn_t callback_fn, void *context); int (*unregister_callback) (quorum_callback_fn_t callback_fn, void *context); }; typedef void (*sync_callback_fn_t) ( const unsigned int *view_list, size_t view_list_entries, int primary_designated, struct memb_ring_id *ring_id); #endif /* QUORUM_H_DEFINED */ struct corosync_api_v1 { /* * Object and configuration APIs */ int (*object_create) ( hdb_handle_t parent_object_handle, hdb_handle_t *object_handle, const void *object_name, unsigned int object_name_len); int (*object_priv_set) ( hdb_handle_t object_handle, void *priv); int (*object_key_create) ( hdb_handle_t object_handle, const void *key_name, int key_len, const void *value, int value_len); int (*object_destroy) ( hdb_handle_t object_handle); int (*object_valid_set) ( hdb_handle_t object_handle, struct object_valid *object_valid_list, unsigned int object_valid_list_entries); int (*object_key_valid_set) ( hdb_handle_t object_handle, struct object_key_valid *object_key_valid_list, unsigned int object_key_valid_list_entries); int (*object_find_create) ( hdb_handle_t parent_object_handle, const void *object_name, int object_name_len, hdb_handle_t *object_find_handle); int (*object_find_next) ( hdb_handle_t object_find_handle, hdb_handle_t *object_handle); int (*object_find_destroy) ( hdb_handle_t object_find_handle); int (*object_key_get) ( hdb_handle_t object_handle, const void *key_name, int key_len, void **value, int *value_len); int (*object_priv_get) ( hdb_handle_t jobject_handle, void **priv); int (*object_key_replace) ( hdb_handle_t object_handle, const void *key_name, int key_len, const void *new_value, int new_value_len); int (*object_key_delete) ( hdb_handle_t object_handle, const void *key_name, int key_len); int (*object_iter_reset) ( hdb_handle_t parent_object_handle); int (*object_iter) ( hdb_handle_t parent_object_handle, void **object_name, int *name_len, hdb_handle_t *object_handle); int (*object_key_iter_reset) ( hdb_handle_t object_handle); int (*object_key_iter) ( hdb_handle_t parent_object_handle, void **key_name, int *key_len, void **value, int *value_len); int (*object_parent_get) ( hdb_handle_t object_handle, hdb_handle_t *parent_handle); int (*object_name_get) ( hdb_handle_t object_handle, char *object_name, int *object_name_len); int (*object_dump) ( hdb_handle_t object_handle, FILE *file); int (*object_key_iter_from) ( hdb_handle_t parent_object_handle, hdb_handle_t start_pos, void **key_name, int *key_len, void **value, int *value_len); int (*object_track_start) ( hdb_handle_t object_handle, object_track_depth_t depth, object_key_change_notify_fn_t key_change_notify_fn, object_create_notify_fn_t object_create_notify_fn, object_destroy_notify_fn_t object_destroy_notify_fn, object_reload_notify_fn_t object_reload_notify_fn, void * priv_data_pt); void (*object_track_stop) ( object_key_change_notify_fn_t key_change_notify_fn, object_create_notify_fn_t object_create_notify_fn, object_destroy_notify_fn_t object_destroy_notify_fn, object_reload_notify_fn_t 
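/*
 * Typical objdb usage from a service engine, sketched with hypothetical
 * object and key names ("my_service", "mykey"):
 *
 *      hdb_handle_t obj;
 *      unsigned int value = 42;
 *      void *stored;
 *      int stored_len;
 *
 *      api->object_create (OBJECT_PARENT_HANDLE, &obj,
 *          "my_service", strlen ("my_service"));
 *      api->object_key_create (obj, "mykey", strlen ("mykey"),
 *          &value, sizeof (value));
 *      api->object_key_get (obj, "mykey", strlen ("mykey"),
 *          &stored, &stored_len);
 */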
object_reload_notify_fn, void * priv_data_pt); int (*object_write_config) (const char **error_string); int (*object_reload_config) (int flush, const char **error_string); int (*object_key_increment) ( hdb_handle_t object_handle, const void *key_name, int key_len, unsigned int *value); int (*object_key_decrement) ( hdb_handle_t object_handle, const void *key_name, int key_len, unsigned int *value); /* * Time and timer APIs */ int (*timer_add_duration) ( unsigned long long nanoseconds_in_future, void *data, void (*timer_nf) (void *data), corosync_timer_handle_t *handle); int (*timer_add_absolute) ( unsigned long long nanoseconds_from_epoch, void *data, void (*timer_fn) (void *data), corosync_timer_handle_t *handle); void (*timer_delete) ( corosync_timer_handle_t timer_handle); unsigned long long (*timer_time_get) (void); unsigned long long (*timer_expire_time_get) ( corosync_timer_handle_t timer_handle); /* * IPC APIs */ void (*ipc_source_set) (mar_message_source_t *source, void *conn); int (*ipc_source_is_local) (const mar_message_source_t *source); void *(*ipc_private_data_get) (void *conn); int (*ipc_response_send) (void *conn, const void *msg, int mlen); int (*ipc_response_iov_send) (void *conn, - const struct iovec *iov, int iov_len); + const struct iovec *iov, unsigned int iov_len); int (*ipc_dispatch_send) (void *conn, const void *msg, int mlen); int (*ipc_dispatch_iov_send) (void *conn, - const struct iovec *iov, int iov_len); + const struct iovec *iov, unsigned int iov_len); void (*ipc_refcnt_inc) (void *conn); void (*ipc_refcnt_dec) (void *conn); /* * Totem APIs */ unsigned int (*totem_nodeid_get) (void); int (*totem_family_get) (void); int (*totem_ring_reenable) (void); - int (*totem_mcast) (struct iovec *iovec, int iov_len, unsigned int guarantee); + int (*totem_mcast) (struct iovec *iovec, unsigned int iov_len, unsigned int guarantee); int (*totem_ifaces_get) ( unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count); const char *(*totem_ifaces_print) (unsigned int nodeid); const char *(*totem_ip_print) (const struct totem_ip_address *addr); int (*totem_callback_token_create) ( void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data); /* * Totem open process groups API for those service engines * wanting their own groups */ int (*tpg_init) ( hdb_handle_t *handle, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)); int (*tpg_exit) ( hdb_handle_t handle); int (*tpg_join) ( hdb_handle_t handle, struct corosync_tpg_group *groups, int group_cnt); int (*tpg_leave) ( hdb_handle_t handle, struct corosync_tpg_group *groups, int group_cnt); int (*tpg_joined_mcast) ( hdb_handle_t handle, const struct iovec *iovec, - int iov_len, + unsigned int iov_len, int guarantee); int (*tpg_joined_reserve) ( hdb_handle_t handle, const struct iovec *iovec, - int iov_len); + unsigned int iov_len); int (*tpg_joined_release) ( int reserved_msgs); int (*tpg_groups_mcast) ( hdb_handle_t handle, int guarantee, const struct corosync_tpg_group *groups, int groups_cnt, const struct iovec *iovec, - int 
iov_len); + unsigned int iov_len); int (*tpg_groups_reserve) ( hdb_handle_t handle, const struct corosync_tpg_group *groups, int groups_cnt, const struct iovec *iovec, - int iov_len); + unsigned int iov_len); int (*tpg_groups_release) ( int reserved_msgs); int (*sync_request) ( const char *service_name); /* * User plugin-callable functions for quorum */ int (*quorum_is_quorate) (void); int (*quorum_register_callback) (quorum_callback_fn_t callback_fn, void *context); int (*quorum_unregister_callback) (quorum_callback_fn_t callback_fn, void *context); /* * This one is for the quorum management plugin's use */ int (*quorum_initialize)(struct quorum_callin_functions *fns); /* * Plugin loading and unloading */ int (*plugin_interface_reference) ( hdb_handle_t *handle, const char *iface_name, int version, void **interface, void *context); int (*plugin_interface_release) (hdb_handle_t handle); /* * Service loading and unloading APIs */ unsigned int (*service_link_and_init) ( struct corosync_api_v1 *corosync_api_v1, const char *service_name, unsigned int service_ver); unsigned int (*service_unlink_and_exit) ( struct corosync_api_v1 *corosync_api_v1, const char *service_name, unsigned int service_ver); /* * Error handling APIs */ void (*error_memory_failure) (void); #define corosync_fatal_error(err) api->fatal_error ((err), __FILE__, __LINE__) void (*fatal_error) (cs_fatal_error_t err, const char *file, unsigned int line); }; #define SERVICE_ID_MAKE(a,b) ( ((a)<<16) | (b) ) #define SERVICE_HANDLER_MAXIMUM_COUNT 64 struct corosync_lib_handler { void (*lib_handler_fn) (void *conn, void *msg); int response_size; int response_id; enum cs_lib_flow_control flow_control; }; struct corosync_exec_handler { void (*exec_handler_fn) (const void *msg, unsigned int nodeid); void (*exec_endian_convert_fn) (void *msg); }; struct corosync_service_engine_iface_ver0 { struct corosync_service_engine *(*corosync_get_service_engine_ver0) (void); }; struct corosync_service_engine { const char *name; unsigned short id; unsigned int private_data_size; enum cs_lib_flow_control flow_control; enum cs_lib_allow_inquorate allow_inquorate; int (*exec_init_fn) (struct corosync_api_v1 *); int (*exec_exit_fn) (void); void (*exec_dump_fn) (void); int (*lib_init_fn) (void *conn); int (*lib_exit_fn) (void *conn); struct corosync_lib_handler *lib_engine; int lib_engine_count; struct corosync_exec_handler *exec_engine; int exec_engine_count; int (*config_init_fn) (struct corosync_api_v1 *); void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); void (*sync_init) (void); int (*sync_process) (void); void (*sync_activate) (void); void (*sync_abort) (void); }; #endif /* COROAPI_H_DEFINED */ diff --git a/include/corosync/totem/totempg.h b/include/corosync/totem/totempg.h index db1fe999..b0f489c0 100644 --- a/include/corosync/totem/totempg.h +++ b/include/corosync/totem/totempg.h @@ -1,150 +1,150 @@ /* * Copyright (c) 2003-2005 MontaVista Software, Inc. * Copyright (c) 2006-2007, 2009 Red Hat, Inc. * * All rights reserved. 
* * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef TOTEMPG_H_DEFINED #define TOTEMPG_H_DEFINED #include #include "totem.h" #include "coropoll.h" #include struct totempg_group { const void *group; int group_len; }; #define TOTEMPG_AGREED 0 #define TOTEMPG_SAFE 1 /* * Totem Single Ring Protocol * depends on poll abstraction, POSIX, IPV4 */ /* * Initialize the totem process groups abstraction */ extern int totempg_initialize ( hdb_handle_t poll_handle, struct totem_config *totem_config ); extern void totempg_finalize (void); extern int totempg_callback_token_create (void **handle_out, enum totem_callback_token_type type, int delete, int (*callback_fn) (enum totem_callback_token_type type, const void *), const void *data); extern void totempg_callback_token_destroy (void *handle); /* * Initialize a groups instance */ extern int totempg_groups_initialize ( hdb_handle_t *handle, void (*deliver_fn) ( unsigned int nodeid, struct iovec *iovec, - int iov_len, + unsigned int iov_len, int endian_conversion_required), void (*confchg_fn) ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id)); extern int totempg_groups_finalize ( hdb_handle_t handle); extern int totempg_groups_join ( hdb_handle_t handle, const struct totempg_group *groups, size_t group_cnt); extern int totempg_groups_leave ( hdb_handle_t handle, const struct totempg_group *groups, size_t group_cnt); extern int totempg_groups_mcast_joined ( hdb_handle_t handle, const struct iovec *iovec, - int iov_len, + unsigned int iov_len, int guarantee); extern int totempg_groups_joined_reserve ( hdb_handle_t handle, const struct iovec *iovec, - int iov_len); + unsigned int iov_len); extern int totempg_groups_joined_release ( int msg_count); extern int totempg_groups_mcast_groups ( hdb_handle_t handle, int guarantee, 
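/*
 * How a service engine drives this interface, sketched (the group name
 * and handle names are illustrative):
 *
 *      static struct totempg_group my_group = {
 *          .group = "svc", .group_len = 3
 *      };
 *      hdb_handle_t h;
 *
 *      totempg_groups_initialize (&h, my_deliver_fn, my_confchg_fn);
 *      totempg_groups_join (h, &my_group, 1);
 *
 *      struct iovec iov = { .iov_base = msg, .iov_len = msg_len };
 *      totempg_groups_mcast_joined (h, &iov, 1, TOTEMPG_AGREED);
 *
 * Messages are delivered back through my_deliver_fn on every member that
 * joined "svc", in agreed order.
 */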
const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, size_t iov_len); extern int totempg_groups_send_ok_groups ( hdb_handle_t handle, const struct totempg_group *groups, size_t groups_cnt, const struct iovec *iovec, size_t iov_len); extern int totempg_ifaces_get ( unsigned int nodeid, struct totem_ip_address *interfaces, char ***status, unsigned int *iface_count); extern const char *totempg_ifaces_print (unsigned int nodeid); extern unsigned int totempg_my_nodeid_get (void); extern int totempg_my_family_get (void); extern int totempg_ring_reenable (void); #endif /* TOTEMPG_H_DEFINED */ diff --git a/lib/coroipcc.c b/lib/coroipcc.c index c7ac176c..5cf39f80 100644 --- a/lib/coroipcc.c +++ b/lib/coroipcc.c @@ -1,899 +1,899 @@ /* * vi: set autoindent tabstop=4 shiftwidth=4 : * * Copyright (c) 2002-2006 MontaVista Software, Inc. * Copyright (c) 2006-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum SA_HANDLE_STATE { SA_HANDLE_STATE_EMPTY, SA_HANDLE_STATE_PENDINGREMOVAL, SA_HANDLE_STATE_ACTIVE }; struct saHandle { int state; void *instance; int refCount; uint32_t check; }; struct ipc_segment { int fd; int shmid; int semid; int flow_control_state; struct shared_memory *shared_memory; uid_t euid; }; #if defined(COROSYNC_LINUX) /* SUN_LEN is broken for abstract namespace */ #define AIS_SUN_LEN(a) sizeof(*(a)) #else #define AIS_SUN_LEN(a) SUN_LEN(a) #endif #ifdef SO_NOSIGPIPE void socket_nosigpipe(int s) { int on = 1; setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, (void *)&on, sizeof(on)); } #endif #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif static int coroipcc_send ( int s, void *msg, size_t len) { int result; struct msghdr msg_send; struct iovec iov_send; char *rbuf = msg; int processed = 0; msg_send.msg_iov = &iov_send; msg_send.msg_iovlen = 1; msg_send.msg_name = 0; msg_send.msg_namelen = 0; msg_send.msg_control = 0; msg_send.msg_controllen = 0; msg_send.msg_flags = 0; retry_send: iov_send.iov_base = &rbuf[processed]; iov_send.iov_len = len - processed; result = sendmsg (s, &msg_send, MSG_NOSIGNAL); /* * return immediately on any kind of syscall error that maps to * CS_ERR if no part of message has been sent */ if (result == -1 && processed == 0) { if (errno == EINTR) { goto error_exit; } if (errno == EAGAIN) { goto error_exit; } if (errno == EFAULT) { goto error_exit; } } /* * retry read operations that are already started except * for fault in that case, return ERR_LIBRARY */ if (result == -1 && processed > 0) { if (errno == EINTR) { goto retry_send; } if (errno == EAGAIN) { goto retry_send; } if (errno == EFAULT) { goto error_exit; } } /* * return ERR_LIBRARY on any other syscall error */ if (result == -1) { goto error_exit; } processed += result; if (processed != len) { goto retry_send; } return (0); error_exit: return (-1); } static int coroipcc_recv ( int s, void *msg, size_t len) { int error = 0; int result; struct msghdr msg_recv; struct iovec iov_recv; char *rbuf = msg; int processed = 0; msg_recv.msg_iov = &iov_recv; msg_recv.msg_iovlen = 1; msg_recv.msg_name = 0; msg_recv.msg_namelen = 0; msg_recv.msg_control = 0; msg_recv.msg_controllen = 0; msg_recv.msg_flags = 0; retry_recv: iov_recv.iov_base = (void *)&rbuf[processed]; iov_recv.iov_len = len - processed; result = recvmsg (s, &msg_recv, MSG_NOSIGNAL|MSG_WAITALL); if (result == -1 && errno == EINTR) { goto retry_recv; } if (result == -1 && errno == EAGAIN) { goto retry_recv; } #if defined(COROSYNC_SOLARIS) || defined(COROSYNC_BSD) || defined(COROSYNC_DARWIN) /* On many OS poll never return POLLHUP or POLLERR. * EOF is detected when recvmsg return 0. 
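 *
 * The retry skeleton used above, reduced to its core (a generic sketch,
 * not a corosync API):
 *
 *      char *p = buf;
 *      size_t done = 0;
 *      ssize_t r;
 *
 *      do {
 *          r = recv (fd, p + done, len - done, MSG_WAITALL);
 *          if (r > 0) {
 *              done += r;
 *          }
 *      } while ((r > 0 && done < len) ||
 *          (r == -1 && (errno == EINTR || errno == EAGAIN)));
 *      // r == 0 here means the peer closed the connection (EOF)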
 */
    if (result == 0) {
        error = -1;
        goto error_exit;
    }
#endif
    if (result == -1 || result == 0) {
        error = -1;
        goto error_exit;
    }
    processed += result;
    if (processed != len) {
        goto retry_recv;
    }
    assert (processed == len);

error_exit:
    return (error);
}

static int priv_change_send (struct ipc_segment *ipc_segment)
{
    char buf_req;
    mar_req_priv_change req_priv_change;
    unsigned int res;

    req_priv_change.euid = geteuid();
    /*
     * Don't resend request unless euid has changed
     */
    if (ipc_segment->euid == req_priv_change.euid) {
        return (0);
    }
    req_priv_change.egid = getegid();

    buf_req = MESSAGE_REQ_CHANGE_EUID;
    res = coroipcc_send (ipc_segment->fd, &buf_req, 1);
    if (res == -1) {
        return (-1);
    }

    res = coroipcc_send (ipc_segment->fd, &req_priv_change,
        sizeof (req_priv_change));
    if (res == -1) {
        return (-1);
    }

    ipc_segment->euid = req_priv_change.euid;
    return (0);
}

#if defined(_SEM_SEMUN_UNDEFINED)
union semun {
    int val;
    struct semid_ds *buf;
    unsigned short int *array;
    struct seminfo *__buf;
};
#endif

cs_error_t
coroipcc_service_connect (
    const char *socket_name,
    enum service_types service,
    void **shmseg)
{
    int request_fd;
    struct sockaddr_un address;
    cs_error_t error;
    struct ipc_segment *ipc_segment;
    key_t shmkey = 0;
    key_t semkey = 0;
    int res;
    mar_req_setup_t req_setup;
    mar_res_setup_t res_setup;
    union semun semun;

    res_setup.error = CS_ERR_LIBRARY;

    request_fd = socket (PF_UNIX, SOCK_STREAM, 0);
    if (request_fd == -1) {
        return (-1);
    }

    memset (&address, 0, sizeof (struct sockaddr_un));
#if defined(COROSYNC_BSD) || defined(COROSYNC_DARWIN)
    address.sun_len = sizeof(struct sockaddr_un);
#endif
    address.sun_family = PF_UNIX;
#if defined(COROSYNC_LINUX)
    sprintf (address.sun_path + 1, "%s", socket_name);
#else
    sprintf (address.sun_path, "%s/%s", SOCKETDIR, socket_name);
#endif
    res = connect (request_fd, (struct sockaddr *)&address,
        AIS_SUN_LEN(&address));
    if (res == -1) {
        close (request_fd);
        return (CS_ERR_TRY_AGAIN);
    }

    ipc_segment = malloc (sizeof (struct ipc_segment));
    if (ipc_segment == NULL) {
        close (request_fd);
        return (-1);
    }
    bzero (ipc_segment, sizeof (struct ipc_segment));

    /*
     * Allocate a shared memory segment
     */
    while (1) {
        shmkey = random();
        if ((ipc_segment->shmid =
            shmget (shmkey, sizeof (struct shared_memory),
                IPC_CREAT|IPC_EXCL|0600)) != -1) {
            break;
        }
        if (errno != EEXIST) {
            goto error_exit;
        }
    }

    /*
     * Allocate a semaphore segment
     */
    while (1) {
        semkey = random();
        ipc_segment->euid = geteuid ();
        if ((ipc_segment->semid =
            semget (semkey, 3, IPC_CREAT|IPC_EXCL|0600)) != -1) {
            break;
        }
        if (errno != EEXIST) {
            goto error_exit;
        }
    }

    /*
     * Attach to shared memory segment
     */
    ipc_segment->shared_memory = shmat (ipc_segment->shmid, NULL, 0);
    if (ipc_segment->shared_memory == (void *)-1) {
        goto error_exit;
    }

    semun.val = 0;
    res = semctl (ipc_segment->semid, 0, SETVAL, semun);
    if (res != 0) {
        goto error_exit;
    }
    res = semctl (ipc_segment->semid, 1, SETVAL, semun);
    if (res != 0) {
        goto error_exit;
    }

    req_setup.shmkey = shmkey;
    req_setup.semkey = semkey;
    req_setup.service = service;

    error = coroipcc_send (request_fd, &req_setup, sizeof (mar_req_setup_t));
    if (error != 0) {
        goto error_exit;
    }
    error = coroipcc_recv (request_fd, &res_setup, sizeof (mar_res_setup_t));
    if (error != 0) {
        goto error_exit;
    }

    ipc_segment->fd = request_fd;
    ipc_segment->flow_control_state = 0;
    *shmseg = ipc_segment;

    /*
     * Something went wrong on the server side.
     * Clean everything up.
     */
    if (res_setup.error == CS_ERR_TRY_AGAIN) {
        goto error_exit;
    }
    return (res_setup.error);

error_exit:
    close (request_fd);
    if (ipc_segment->shmid > 0)
        shmctl (ipc_segment->shmid, IPC_RMID, NULL);
    if
(ipc_segment->semid > 0) semctl (ipc_segment->semid, 0, IPC_RMID); return (res_setup.error); } cs_error_t coroipcc_service_disconnect ( void *ipc_context) { struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_context; shutdown (ipc_segment->fd, SHUT_RDWR); close (ipc_segment->fd); shmdt (ipc_segment->shared_memory); free (ipc_segment); return (CS_OK); } int coroipcc_dispatch_flow_control_get ( void *ipc_context) { struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_context; return (ipc_segment->flow_control_state); } int coroipcc_fd_get (void *ipc_ctx) { struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_ctx; return (ipc_segment->fd); } static void memcpy_swrap (void *dest, size_t dest_len, void *src, int len, unsigned int *read) { char *dest_chr = (char *)dest; char *src_chr = (char *)src; unsigned int first_read; unsigned int second_read; first_read = len; second_read = 0; if (len + *read >= DISPATCH_SIZE) { first_read = DISPATCH_SIZE - *read; second_read = (len + *read) % DISPATCH_SIZE; } memcpy (dest_chr, &src_chr[*read], first_read); if (second_read) { memcpy (&dest_chr[first_read], src_chr, second_read); } *read = (*read + len) % (DISPATCH_SIZE); } int original_flow = -1; int coroipcc_dispatch_recv (void *ipc_ctx, void *data, size_t buflen, int timeout) { struct pollfd ufds; struct sembuf sop; int poll_events; mar_res_header_t *header; char buf; struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_ctx; int res; unsigned int my_read; char buf_two = 1; ufds.fd = ipc_segment->fd; ufds.events = POLLIN; ufds.revents = 0; retry_poll: poll_events = poll (&ufds, 1, timeout); if (poll_events == -1 && errno == EINTR) { goto retry_poll; } else if (poll_events == -1) { return (-1); } else if (poll_events == 0) { return (0); } if (poll_events == 1 && (ufds.revents & (POLLERR|POLLHUP))) { return (-1); } retry_recv: res = recv (ipc_segment->fd, &buf, 1, 0); if (res == -1 && errno == EINTR) { goto retry_recv; } else if (res == -1) { return (-1); } if (res == 0) { return (-1); } ipc_segment->flow_control_state = 0; if (buf == 1 || buf == 2) { ipc_segment->flow_control_state = 1; } /* * Notify executive to flush any pending dispatch messages */ if (ipc_segment->flow_control_state) { buf_two = MESSAGE_REQ_OUTQ_FLUSH; res = coroipcc_send (ipc_segment->fd, &buf_two, 1); assert (res == 0); //TODO } /* * This is just a notification of flow control starting at the addition * of a new pending message, not a message to dispatch */ if (buf == 2) { return (0); } if (buf == 3) { return (0); } sop.sem_num = 2; sop.sem_op = -1; sop.sem_flg = 0; retry_semop: res = semop (ipc_segment->semid, &sop, 1); if (res == -1 && errno == EINTR) { goto retry_semop; } else if (res == -1 && errno == EACCES) { priv_change_send (ipc_segment); goto retry_semop; } else if (res == -1) { return (-1); } if (buflen < DISPATCH_SIZE) { return -1; } if (ipc_segment->shared_memory->read + sizeof (mar_res_header_t) >= DISPATCH_SIZE) { my_read = ipc_segment->shared_memory->read; memcpy_swrap (data, DISPATCH_SIZE, ipc_segment->shared_memory->dispatch_buffer, sizeof (mar_res_header_t), &ipc_segment->shared_memory->read); header = (mar_res_header_t *)data; memcpy_swrap ( (void *)((char *)data + sizeof (mar_res_header_t)), DISPATCH_SIZE, ipc_segment->shared_memory->dispatch_buffer, header->size - sizeof (mar_res_header_t), &ipc_segment->shared_memory->read); } else { header = (mar_res_header_t *)&ipc_segment->shared_memory->dispatch_buffer[ipc_segment->shared_memory->read]; memcpy_swrap ( data, DISPATCH_SIZE, 
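/*
 * memcpy_swrap() above treats dispatch_buffer as a ring of DISPATCH_SIZE
 * bytes.  A worked example with small numbers (if DISPATCH_SIZE were 16):
 *
 *      read = 12, len = 10
 *      first_read  = 16 - 12 = 4           // bytes [12..15]
 *      second_read = (10 + 12) % 16 = 6    // bytes [0..5]
 *      new read    = (12 + 10) % 16 = 6
 *
 * so a message that straddles the end of the buffer is reassembled
 * contiguously in the caller's buffer.  That is also why the header is
 * copied out first when it might wrap: header->size cannot be read in
 * place if the header itself straddles the boundary.
 */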
			ipc_segment->shared_memory->dispatch_buffer,
			header->size,
			&ipc_segment->shared_memory->read);
	}
	return (1);
}

static cs_error_t coroipcc_msg_send (
	void *ipc_context,
	const struct iovec *iov,
-	int iov_len)
+	unsigned int iov_len)
{
	struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_context;
	struct sembuf sop;
	unsigned int i;
	int res;
	int req_buffer_idx = 0;

	for (i = 0; i < iov_len; i++) {
		memcpy (&ipc_segment->shared_memory->req_buffer[req_buffer_idx],
			iov[i].iov_base,
			iov[i].iov_len);
		req_buffer_idx += iov[i].iov_len;
	}

	/*
	 * Signal semaphore #0 indicating a new message from client
	 * to server request queue
	 */
	sop.sem_num = 0;
	sop.sem_op = 1;
	sop.sem_flg = 0;

retry_semop:
	res = semop (ipc_segment->semid, &sop, 1);
	if (res == -1 && errno == EINTR) {
		goto retry_semop;
	} else if (res == -1 && errno == EACCES) {
		priv_change_send (ipc_segment);
		goto retry_semop;
	} else if (res == -1) {
		return (CS_ERR_LIBRARY);
	}
	return (CS_OK);
}

static cs_error_t coroipcc_reply_receive (
	void *ipc_context,
	void *res_msg,
	int res_len)
{
	struct sembuf sop;
	struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_context;
	int res;

	/*
	 * Wait for semaphore #1 indicating a new message from server
	 * to client in the response queue
	 */
	sop.sem_num = 1;
	sop.sem_op = -1;
	sop.sem_flg = 0;

retry_semop:
	res = semop (ipc_segment->semid, &sop, 1);
	if (res == -1 && errno == EINTR) {
		goto retry_semop;
	} else if (res == -1 && errno == EACCES) {
		priv_change_send (ipc_segment);
		goto retry_semop;
	} else if (res == -1) {
		return (CS_ERR_LIBRARY);
	}

	memcpy (res_msg, ipc_segment->shared_memory->res_buffer, res_len);
	return (CS_OK);
}

static cs_error_t coroipcc_reply_receive_in_buf (
	void *ipc_context,
	void **res_msg)
{
	struct sembuf sop;
	struct ipc_segment *ipc_segment = (struct ipc_segment *)ipc_context;
	int res;

	/*
	 * Wait for semaphore #1 indicating a new message from server
	 * to client in the response queue
	 */
	sop.sem_num = 1;
	sop.sem_op = -1;
	sop.sem_flg = 0;

retry_semop:
	res = semop (ipc_segment->semid, &sop, 1);
	if (res == -1 && errno == EINTR) {
		goto retry_semop;
	} else if (res == -1 && errno == EACCES) {
		priv_change_send (ipc_segment);
		goto retry_semop;
	} else if (res == -1) {
		return (CS_ERR_LIBRARY);
	}

	*res_msg = (char *)ipc_segment->shared_memory->res_buffer;
	return (CS_OK);
}

cs_error_t coroipcc_msg_send_reply_receive (
	void *ipc_context,
	const struct iovec *iov,
-	int iov_len,
+	unsigned int iov_len,
	void *res_msg,
	int res_len)
{
	cs_error_t res;

	res = coroipcc_msg_send (ipc_context, iov, iov_len);
	if (res != CS_OK) {
		return (res);
	}
	res = coroipcc_reply_receive (ipc_context, res_msg, res_len);
	if (res != CS_OK) {
		return (res);
	}
	return (CS_OK);
}

cs_error_t coroipcc_msg_send_reply_receive_in_buf (
	void *ipc_context,
	const struct iovec *iov,
-	int iov_len,
+	unsigned int iov_len,
	void **res_msg)
{
	cs_error_t res;

	res = coroipcc_msg_send (ipc_context, iov, iov_len);
	if (res != CS_OK) {
		return (res);
	}
	res = coroipcc_reply_receive_in_buf (ipc_context, res_msg);
	if (res != CS_OK) {
		return (res);
	}
	return (CS_OK);
}

cs_error_t
saHandleCreate (
	struct saHandleDatabase *handleDatabase,
	int instanceSize,
	uint64_t *handleOut)
{
	uint32_t handle;
	uint32_t check;
	void *newHandles = NULL;
	int found = 0;
	void *instance;
	int i;

	pthread_mutex_lock (&handleDatabase->mutex);

	for (handle = 0; handle < handleDatabase->handleCount; handle++) {
		if (handleDatabase->handles[handle].state == SA_HANDLE_STATE_EMPTY) {
			found = 1;
			break;
		}
	}

	if (found == 0) {
		handleDatabase->handleCount += 1;
		newHandles = (struct saHandle *)realloc
			(handleDatabase->handles,
			sizeof (struct saHandle) * handleDatabase->handleCount);
		if (newHandles == NULL) {
			pthread_mutex_unlock (&handleDatabase->mutex);
			return (CS_ERR_NO_MEMORY);
		}
		handleDatabase->handles = newHandles;
	}

	instance = malloc (instanceSize);
	if (instance == NULL) {
		pthread_mutex_unlock (&handleDatabase->mutex);
		return (CS_ERR_NO_MEMORY);
	}

	/*
	 * This code makes sure the random number isn't zero.
	 * We use 0 to specify an invalid handle out of the 2^64 handle space;
	 * if we get 0 two hundred times in a row, the RNG may be broken.
	 */
	for (i = 0; i < 200; i++) {
		check = random();
		if (check != 0) {
			break;
		}
	}

	memset (instance, 0, instanceSize);
	handleDatabase->handles[handle].state = SA_HANDLE_STATE_ACTIVE;
	handleDatabase->handles[handle].instance = instance;
	handleDatabase->handles[handle].refCount = 1;
	handleDatabase->handles[handle].check = check;
	*handleOut = (uint64_t)((uint64_t)check << 32 | handle);
	pthread_mutex_unlock (&handleDatabase->mutex);
	return (CS_OK);
}

cs_error_t
saHandleDestroy (
	struct saHandleDatabase *handleDatabase,
	uint64_t inHandle)
{
	cs_error_t error = CS_OK;
	uint32_t check = inHandle >> 32;
	uint32_t handle = inHandle & 0xffffffff;

	pthread_mutex_lock (&handleDatabase->mutex);
	if (check != handleDatabase->handles[handle].check) {
		pthread_mutex_unlock (&handleDatabase->mutex);
		error = CS_ERR_BAD_HANDLE;
		return (error);
	}
	handleDatabase->handles[handle].state = SA_HANDLE_STATE_PENDINGREMOVAL;
	pthread_mutex_unlock (&handleDatabase->mutex);
	saHandleInstancePut (handleDatabase, inHandle);
	return (error);
}

cs_error_t
saHandleInstanceGet (
	struct saHandleDatabase *handleDatabase,
	uint64_t inHandle,
	void **instance)
{
	uint32_t check = inHandle >> 32;
	uint32_t handle = inHandle & 0xffffffff;
	cs_error_t error = CS_OK;

	pthread_mutex_lock (&handleDatabase->mutex);
	if (handle >= (uint64_t)handleDatabase->handleCount) {
		error = CS_ERR_BAD_HANDLE;
		goto error_exit;
	}
	if (handleDatabase->handles[handle].state != SA_HANDLE_STATE_ACTIVE) {
		error = CS_ERR_BAD_HANDLE;
		goto error_exit;
	}
	if (check != handleDatabase->handles[handle].check) {
		error = CS_ERR_BAD_HANDLE;
		goto error_exit;
	}
	*instance = handleDatabase->handles[handle].instance;
	handleDatabase->handles[handle].refCount += 1;

error_exit:
	pthread_mutex_unlock (&handleDatabase->mutex);
	return (error);
}

cs_error_t
saHandleInstancePut (
	struct saHandleDatabase *handleDatabase,
	uint64_t inHandle)
{
	void *instance;
	cs_error_t error = CS_OK;
	uint32_t check = inHandle >> 32;
	uint32_t handle = inHandle & 0xffffffff;

	pthread_mutex_lock (&handleDatabase->mutex);
	if (check != handleDatabase->handles[handle].check) {
		error = CS_ERR_BAD_HANDLE;
		goto error_exit;
	}
	handleDatabase->handles[handle].refCount -= 1;
	assert (handleDatabase->handles[handle].refCount >= 0);
	if (handleDatabase->handles[handle].refCount == 0) {
		instance = (handleDatabase->handles[handle].instance);
		handleDatabase->handleInstanceDestructor (instance);
		free (instance);
		memset (&handleDatabase->handles[handle], 0, sizeof (struct saHandle));
	}

error_exit:
	pthread_mutex_unlock (&handleDatabase->mutex);
	return (error);
}
diff --git a/lib/cpg.c b/lib/cpg.c
index 74de3ae1..74b0e2f5 100644
--- a/lib/cpg.c
+++ b/lib/cpg.c
@@ -1,659 +1,659 @@
/*
 * vi: set autoindent tabstop=4 shiftwidth=4 :
 *
 * Copyright (c) 2006-2008 Red Hat, Inc.
 *
 * All rights reserved.
* * Author: Patrick Caulfield (pcaulfie@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ /* * Provides a closed process group API using the coroipcc executive */ #include #include #include #include #include #include #include #include #include #include #include #include #include struct cpg_inst { void *ipc_ctx; int finalize; cpg_callbacks_t callbacks; void *context; pthread_mutex_t response_mutex; pthread_mutex_t dispatch_mutex; }; static void cpg_instance_destructor (void *instance); static struct saHandleDatabase cpg_handle_t_db = { .handleCount = 0, .handles = 0, .mutex = PTHREAD_MUTEX_INITIALIZER, .handleInstanceDestructor = cpg_instance_destructor }; /* * Clean up function for a cpg instance (cpg_nitialize) handle */ static void cpg_instance_destructor (void *instance) { struct cpg_inst *cpg_inst = instance; pthread_mutex_destroy (&cpg_inst->response_mutex); pthread_mutex_destroy (&cpg_inst->dispatch_mutex); } /** * @defgroup cpg_coroipcc The closed process group API * @ingroup coroipcc * * @{ */ cs_error_t cpg_initialize ( cpg_handle_t *handle, cpg_callbacks_t *callbacks) { cs_error_t error; struct cpg_inst *cpg_inst; error = saHandleCreate (&cpg_handle_t_db, sizeof (struct cpg_inst), handle); if (error != CS_OK) { goto error_no_destroy; } error = saHandleInstanceGet (&cpg_handle_t_db, *handle, (void *)&cpg_inst); if (error != CS_OK) { goto error_destroy; } error = coroipcc_service_connect (IPC_SOCKET_NAME, CPG_SERVICE, &cpg_inst->ipc_ctx); if (error != CS_OK) { goto error_put_destroy; } memcpy (&cpg_inst->callbacks, callbacks, sizeof (cpg_callbacks_t)); pthread_mutex_init (&cpg_inst->response_mutex, NULL); pthread_mutex_init (&cpg_inst->dispatch_mutex, NULL); saHandleInstancePut (&cpg_handle_t_db, *handle); return (CS_OK); error_put_destroy: saHandleInstancePut (&cpg_handle_t_db, *handle); error_destroy: saHandleDestroy (&cpg_handle_t_db, *handle); error_no_destroy: return (error); } cs_error_t cpg_finalize ( cpg_handle_t handle) { struct cpg_inst *cpg_inst; cs_error_t error; error = 
saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst);
	if (error != CS_OK) {
		return (error);
	}

	pthread_mutex_lock (&cpg_inst->response_mutex);

	/*
	 * Another thread has already started finalizing
	 */
	if (cpg_inst->finalize) {
		pthread_mutex_unlock (&cpg_inst->response_mutex);
		saHandleInstancePut (&cpg_handle_t_db, handle);
		return (CPG_ERR_BAD_HANDLE);
	}
	cpg_inst->finalize = 1;

	coroipcc_service_disconnect (cpg_inst->ipc_ctx);

	pthread_mutex_unlock (&cpg_inst->response_mutex);

	saHandleDestroy (&cpg_handle_t_db, handle);
	saHandleInstancePut (&cpg_handle_t_db, handle);

	return (CPG_OK);
}

cs_error_t cpg_fd_get (
	cpg_handle_t handle,
	int *fd)
{
	cs_error_t error;
	struct cpg_inst *cpg_inst;

	error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst);
	if (error != CS_OK) {
		return (error);
	}
	*fd = coroipcc_fd_get (cpg_inst->ipc_ctx);
	saHandleInstancePut (&cpg_handle_t_db, handle);
	return (CS_OK);
}

cs_error_t cpg_context_get (
	cpg_handle_t handle,
	void **context)
{
	cs_error_t error;
	struct cpg_inst *cpg_inst;

	error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst);
	if (error != CS_OK) {
		return (error);
	}
	*context = cpg_inst->context;
	saHandleInstancePut (&cpg_handle_t_db, handle);
	return (CS_OK);
}

cs_error_t cpg_context_set (
	cpg_handle_t handle,
	void *context)
{
	cs_error_t error;
	struct cpg_inst *cpg_inst;

	error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst);
	if (error != CS_OK) {
		return (error);
	}
	cpg_inst->context = context;
	saHandleInstancePut (&cpg_handle_t_db, handle);
	return (CS_OK);
}

struct res_overlay {
	mar_res_header_t header __attribute__((aligned(8)));
	char data[512000];
};

cs_error_t cpg_dispatch (
	cpg_handle_t handle,
	cs_dispatch_flags_t dispatch_types)
{
	int timeout = -1;
	cs_error_t error;
	int cont = 1; /* always continue do loop except when set to 0 */
	int dispatch_avail;
	struct cpg_inst *cpg_inst;
	struct res_lib_cpg_confchg_callback *res_cpg_confchg_callback;
	struct res_lib_cpg_deliver_callback *res_cpg_deliver_callback;
	cpg_callbacks_t callbacks;
	struct res_overlay dispatch_data;
	int ignore_dispatch = 0;
	struct cpg_address member_list[CPG_MEMBERS_MAX];
	struct cpg_address left_list[CPG_MEMBERS_MAX];
	struct cpg_address joined_list[CPG_MEMBERS_MAX];
	struct cpg_name group_name;
	mar_cpg_address_t *left_list_start;
	mar_cpg_address_t *joined_list_start;
	unsigned int i;

	error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst);
	if (error != CS_OK) {
		return (error);
	}

	/*
	 * Timeout instantly for CPG_DISPATCH_ALL; wait indefinitely for
	 * CPG_DISPATCH_ONE or CPG_DISPATCH_BLOCKING
	 */
	if (dispatch_types == CPG_DISPATCH_ALL) {
		timeout = 0;
	}

	do {
		pthread_mutex_lock (&cpg_inst->dispatch_mutex);
		dispatch_avail = coroipcc_dispatch_recv (cpg_inst->ipc_ctx,
			(void *)&dispatch_data, sizeof (dispatch_data), timeout);
		pthread_mutex_unlock (&cpg_inst->dispatch_mutex);

		if (dispatch_avail == 0 && dispatch_types == CPG_DISPATCH_ALL) {
			break; /* exit do while cont is 1 loop */
		} else if (dispatch_avail == 0) {
			continue; /* next poll */
		}
		if (dispatch_avail == -1) {
			if (cpg_inst->finalize == 1) {
				error = CS_OK;
			} else {
				error = CS_ERR_LIBRARY;
			}
			goto error_put;
		}

		/*
		 * Make copy of callbacks, message data, unlock instance, and call callback.
		 * A risk of this dispatch method is that the callback routines may
		 * operate at the same time that cpg_finalize has been called.
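		 *
		 * A typical consumer (a minimal sketch, not from this tree)
		 * integrates the dispatch fd into its own poll loop and then
		 * drains everything that is pending:
		 *
		 *	struct pollfd pfd;
		 *	cpg_fd_get (handle, &pfd.fd);
		 *	pfd.events = POLLIN;
		 *	while (poll (&pfd, 1, -1) > 0) {
		 *		cpg_dispatch (handle, CPG_DISPATCH_ALL);
		 *	}
		 *
		 * Error handling is omitted; CPG_DISPATCH_ALL returns once
		 * nothing further is queued, so the loop blocks in poll()
		 * rather than inside the library.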
*/ memcpy (&callbacks, &cpg_inst->callbacks, sizeof (cpg_callbacks_t)); /* * Dispatch incoming message */ switch (dispatch_data.header.id) { case MESSAGE_RES_CPG_DELIVER_CALLBACK: res_cpg_deliver_callback = (struct res_lib_cpg_deliver_callback *)&dispatch_data; marshall_from_mar_cpg_name_t ( &group_name, &res_cpg_deliver_callback->group_name); callbacks.cpg_deliver_fn (handle, &group_name, res_cpg_deliver_callback->nodeid, res_cpg_deliver_callback->pid, &res_cpg_deliver_callback->message, res_cpg_deliver_callback->msglen); break; case MESSAGE_RES_CPG_CONFCHG_CALLBACK: res_cpg_confchg_callback = (struct res_lib_cpg_confchg_callback *)&dispatch_data; for (i = 0; i < res_cpg_confchg_callback->member_list_entries; i++) { marshall_from_mar_cpg_address_t (&member_list[i], &res_cpg_confchg_callback->member_list[i]); } left_list_start = res_cpg_confchg_callback->member_list + res_cpg_confchg_callback->member_list_entries; for (i = 0; i < res_cpg_confchg_callback->left_list_entries; i++) { marshall_from_mar_cpg_address_t (&left_list[i], &left_list_start[i]); } joined_list_start = res_cpg_confchg_callback->member_list + res_cpg_confchg_callback->member_list_entries + res_cpg_confchg_callback->left_list_entries; for (i = 0; i < res_cpg_confchg_callback->joined_list_entries; i++) { marshall_from_mar_cpg_address_t (&joined_list[i], &joined_list_start[i]); } marshall_from_mar_cpg_name_t ( &group_name, &res_cpg_confchg_callback->group_name); callbacks.cpg_confchg_fn (handle, &group_name, member_list, res_cpg_confchg_callback->member_list_entries, left_list, res_cpg_confchg_callback->left_list_entries, joined_list, res_cpg_confchg_callback->joined_list_entries); break; default: error = CS_ERR_LIBRARY; goto error_put; break; } /* * Determine if more messages should be processed * */ switch (dispatch_types) { case CPG_DISPATCH_ONE: if (ignore_dispatch) { ignore_dispatch = 0; } else { cont = 0; } break; case CPG_DISPATCH_ALL: if (ignore_dispatch) { ignore_dispatch = 0; } break; case CPG_DISPATCH_BLOCKING: break; } } while (cont); error_put: saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_join ( cpg_handle_t handle, struct cpg_name *group) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov[2]; struct req_lib_cpg_join req_lib_cpg_join; struct res_lib_cpg_join res_lib_cpg_join; struct req_lib_cpg_trackstart req_lib_cpg_trackstart; struct res_lib_cpg_trackstart res_lib_cpg_trackstart; error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst); if (error != CS_OK) { return (error); } pthread_mutex_lock (&cpg_inst->response_mutex); /* Automatically add a tracker */ req_lib_cpg_trackstart.header.size = sizeof (struct req_lib_cpg_trackstart); req_lib_cpg_trackstart.header.id = MESSAGE_REQ_CPG_TRACKSTART; marshall_to_mar_cpg_name_t (&req_lib_cpg_trackstart.group_name, group); iov[0].iov_base = &req_lib_cpg_trackstart; iov[0].iov_len = sizeof (struct req_lib_cpg_trackstart); error = coroipcc_msg_send_reply_receive (cpg_inst->ipc_ctx, iov, 1, &res_lib_cpg_trackstart, sizeof (struct res_lib_cpg_trackstart)); if (error != CS_OK) { pthread_mutex_unlock (&cpg_inst->response_mutex); goto error_exit; } /* Now join */ req_lib_cpg_join.header.size = sizeof (struct req_lib_cpg_join); req_lib_cpg_join.header.id = MESSAGE_REQ_CPG_JOIN; req_lib_cpg_join.pid = getpid(); marshall_to_mar_cpg_name_t (&req_lib_cpg_join.group_name, group); iov[0].iov_base = &req_lib_cpg_join; iov[0].iov_len = sizeof (struct req_lib_cpg_join); error = coroipcc_msg_send_reply_receive 
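	/*
	 * cpg_join makes two round trips over the same connection: the
	 * MESSAGE_REQ_CPG_TRACKSTART request above registers the automatic
	 * configuration-change tracker, and this second request,
	 * MESSAGE_REQ_CPG_JOIN, actually enters the group; both block on
	 * the server's reply while response_mutex is held.
	 */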
(cpg_inst->ipc_ctx, iov, 1, &res_lib_cpg_join, sizeof (struct res_lib_cpg_join)); pthread_mutex_unlock (&cpg_inst->response_mutex); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_join.header.error; error_exit: saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_leave ( cpg_handle_t handle, struct cpg_name *group) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov[2]; struct req_lib_cpg_leave req_lib_cpg_leave; struct res_lib_cpg_leave res_lib_cpg_leave; error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst); if (error != CS_OK) { return (error); } req_lib_cpg_leave.header.size = sizeof (struct req_lib_cpg_leave); req_lib_cpg_leave.header.id = MESSAGE_REQ_CPG_LEAVE; req_lib_cpg_leave.pid = getpid(); marshall_to_mar_cpg_name_t (&req_lib_cpg_leave.group_name, group); iov[0].iov_base = &req_lib_cpg_leave; iov[0].iov_len = sizeof (struct req_lib_cpg_leave); pthread_mutex_lock (&cpg_inst->response_mutex); error = coroipcc_msg_send_reply_receive (cpg_inst->ipc_ctx, iov, 1, &res_lib_cpg_leave, sizeof (struct res_lib_cpg_leave)); pthread_mutex_unlock (&cpg_inst->response_mutex); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_leave.header.error; error_exit: saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_mcast_joined ( cpg_handle_t handle, cpg_guarantee_t guarantee, struct iovec *iovec, - int iov_len) + unsigned int iov_len) { int i; cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov[64]; struct req_lib_cpg_mcast req_lib_cpg_mcast; struct res_lib_cpg_mcast res_lib_cpg_mcast; int msg_len = 0; error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst); if (error != CS_OK) { return (error); } for (i = 0; i < iov_len; i++ ) { msg_len += iovec[i].iov_len; } req_lib_cpg_mcast.header.size = sizeof (struct req_lib_cpg_mcast) + msg_len; req_lib_cpg_mcast.header.id = MESSAGE_REQ_CPG_MCAST; req_lib_cpg_mcast.guarantee = guarantee; req_lib_cpg_mcast.msglen = msg_len; iov[0].iov_base = &req_lib_cpg_mcast; iov[0].iov_len = sizeof (struct req_lib_cpg_mcast); memcpy (&iov[1], iovec, iov_len * sizeof (struct iovec)); pthread_mutex_lock (&cpg_inst->response_mutex); error = coroipcc_msg_send_reply_receive (cpg_inst->ipc_ctx, iov, iov_len + 1, &res_lib_cpg_mcast, sizeof (res_lib_cpg_mcast)); pthread_mutex_unlock (&cpg_inst->response_mutex); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_mcast.header.error; error_exit: saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_membership_get ( cpg_handle_t handle, struct cpg_name *group_name, struct cpg_address *member_list, int *member_list_entries) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov; struct req_lib_cpg_membership req_lib_cpg_membership_get; struct res_lib_cpg_confchg_callback res_lib_cpg_membership_get; unsigned int i; error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst); if (error != CS_OK) { return (error); } req_lib_cpg_membership_get.header.size = sizeof (mar_req_header_t); req_lib_cpg_membership_get.header.id = MESSAGE_REQ_CPG_MEMBERSHIP; iov.iov_base = &req_lib_cpg_membership_get; iov.iov_len = sizeof (mar_req_header_t); pthread_mutex_lock (&cpg_inst->response_mutex); error = coroipcc_msg_send_reply_receive (cpg_inst->ipc_ctx, &iov, 1, &res_lib_cpg_membership_get, sizeof (mar_res_header_t)); pthread_mutex_unlock (&cpg_inst->response_mutex); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_membership_get.header.error; /* 
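	 * The reply to MESSAGE_REQ_CPG_MEMBERSHIP reuses the confchg
	 * callback layout, and its member_list entries arrive in wire
	 * (mar_cpg_address_t) form, so each entry is converted with
	 * marshall_from_mar_cpg_address_t on its way back to the caller.
	 *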
* Copy results to caller */ *member_list_entries = res_lib_cpg_membership_get.member_list_entries; if (member_list) { for (i = 0; i < res_lib_cpg_membership_get.member_list_entries; i++) { marshall_from_mar_cpg_address_t (&member_list[i], &res_lib_cpg_membership_get.member_list[i]); } } error_exit: saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_local_get ( cpg_handle_t handle, unsigned int *local_nodeid) { cs_error_t error; struct cpg_inst *cpg_inst; struct iovec iov; struct req_lib_cpg_local_get req_lib_cpg_local_get; struct res_lib_cpg_local_get res_lib_cpg_local_get; error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst); if (error != CS_OK) { return (error); } req_lib_cpg_local_get.header.size = sizeof (mar_req_header_t); req_lib_cpg_local_get.header.id = MESSAGE_REQ_CPG_LOCAL_GET; iov.iov_base = &req_lib_cpg_local_get; iov.iov_len = sizeof (struct req_lib_cpg_local_get); pthread_mutex_lock (&cpg_inst->response_mutex); error = coroipcc_msg_send_reply_receive (cpg_inst->ipc_ctx, &iov, 1, &res_lib_cpg_local_get, sizeof (res_lib_cpg_local_get)); pthread_mutex_unlock (&cpg_inst->response_mutex); if (error != CS_OK) { goto error_exit; } error = res_lib_cpg_local_get.header.error; *local_nodeid = res_lib_cpg_local_get.local_nodeid; error_exit: saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } cs_error_t cpg_flow_control_state_get ( cpg_handle_t handle, cpg_flow_control_state_t *flow_control_state) { cs_error_t error; struct cpg_inst *cpg_inst; error = saHandleInstanceGet (&cpg_handle_t_db, handle, (void *)&cpg_inst); if (error != CS_OK) { return (error); } *flow_control_state = coroipcc_dispatch_flow_control_get (cpg_inst->ipc_ctx); saHandleInstancePut (&cpg_handle_t_db, handle); return (error); } /** @} */ diff --git a/lib/util.h b/lib/util.h index a00da834..63a1bf30 100644 --- a/lib/util.h +++ b/lib/util.h @@ -1,151 +1,151 @@ /* * Copyright (c) 2002-2003 MontaVista Software, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #ifndef AIS_UTIL_H_DEFINED #define AIS_UTIL_H_DEFINED #include #include #include #include "../include/ipc_gen.h" /* Debug macro */ #ifdef DEBUG #define DPRINT(s) printf s #else #define DPRINT(s) #endif #ifdef SO_NOSIGPIPE #ifndef MSG_NOSIGNAL #define MSG_NOSIGNAL 0 #endif void socket_nosigpipe(int s); #else #define socket_nosigpipe(s) #endif struct saHandleDatabase { unsigned int handleCount; struct saHandle *handles; pthread_mutex_t mutex; void (*handleInstanceDestructor) (void *); }; struct saVersionDatabase { int versionCount; SaVersionT *versionsSupported; }; cs_error_t saServiceConnect ( int *responseOut, int *callbackOut, enum service_types service); cs_error_t saRecvRetry ( int s, void *msg, size_t len); cs_error_t saSendRetry ( int s, const void *msg, size_t len); cs_error_t saSendMsgRetry ( int s, struct iovec *iov, - int iov_len); + unsigned int iov_len); cs_error_t saSendMsgReceiveReply ( int s, struct iovec *iov, - int iov_len, + unsigned int iov_len, void *responseMessage, int responseLen); cs_error_t saSendReceiveReply ( int s, void *requestMessage, int requestLen, void *responseMessage, int responseLen); cs_error_t saPollRetry ( struct pollfd *ufds, unsigned int nfds, int timeout); cs_error_t saHandleCreate ( struct saHandleDatabase *handleDatabase, int instanceSize, SaUint64T *handleOut); cs_error_t saHandleDestroy ( struct saHandleDatabase *handleDatabase, SaUint64T handle); cs_error_t saHandleInstanceGet ( struct saHandleDatabase *handleDatabase, SaUint64T handle, void **instance); cs_error_t saHandleInstancePut ( struct saHandleDatabase *handleDatabase, SaUint64T handle); cs_error_t saVersionVerify ( struct saVersionDatabase *versionDatabase, SaVersionT *version); #define offset_of(type,member) (int)(&(((type *)0)->member)) SaTimeT clustTimeNow(void); #endif /* AIS_UTIL_H_DEFINED */ diff --git a/services/pload.c b/services/pload.c index 7ded4006..2dbe974b 100644 --- a/services/pload.c +++ b/services/pload.c @@ -1,359 +1,359 @@ /* * Copyright (c) 2008-2009 Red Hat, Inc. * * All rights reserved. * * Author: Steven Dake (sdake@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. 
* * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include LOGSYS_DECLARE_SUBSYS ("PLOAD", LOG_INFO); enum pload_exec_message_req_types { MESSAGE_REQ_EXEC_PLOAD_START = 0, MESSAGE_REQ_EXEC_PLOAD_MCAST = 1 }; /* * Service Interfaces required by service_message_handler struct */ static int pload_exec_init_fn ( struct corosync_api_v1 *corosync_api); static void pload_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); static void message_handler_req_exec_pload_start (const void *msg, unsigned int nodeid); static void message_handler_req_exec_pload_mcast (const void *msg, unsigned int nodeid); static void req_exec_pload_start_endian_convert (void *msg); static void req_exec_pload_mcast_endian_convert (void *msg); static void message_handler_req_pload_start (void *conn, void *msg); static int pload_lib_init_fn (void *conn); static int pload_lib_exit_fn (void *conn); static char buffer[1000000]; static unsigned int msgs_delivered = 0; static unsigned int msgs_wanted = 0; static unsigned int msg_size = 0; static unsigned int msg_code = 1; static unsigned int msgs_sent = 0; static struct corosync_api_v1 *api; struct req_exec_pload_start { mar_req_header_t header; unsigned int msg_code; unsigned int msg_count; unsigned int msg_size; unsigned int time_interval; }; struct req_exec_pload_mcast { mar_req_header_t header; unsigned int msg_code; }; static struct corosync_lib_handler pload_lib_engine[] = { { /* 0 */ .lib_handler_fn = message_handler_req_pload_start, .response_size = sizeof (struct res_lib_pload_start), .response_id = MESSAGE_RES_PLOAD_START, .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED } }; static struct corosync_exec_handler pload_exec_engine[] = { { .exec_handler_fn = message_handler_req_exec_pload_start, .exec_endian_convert_fn = req_exec_pload_start_endian_convert }, { .exec_handler_fn = message_handler_req_exec_pload_mcast, .exec_endian_convert_fn = req_exec_pload_mcast_endian_convert } }; struct corosync_service_engine pload_service_engine = { .name = "corosync profile loading service", .id = PLOAD_SERVICE, .private_data_size = 0, .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED, .lib_init_fn = pload_lib_init_fn, .lib_exit_fn = pload_lib_exit_fn, .lib_engine = pload_lib_engine, .lib_engine_count = sizeof (pload_lib_engine) / sizeof (struct corosync_lib_handler), .exec_engine = pload_exec_engine, .exec_engine_count = 
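	/*
	 * This descriptor is the whole contract between a service and the
	 * executive: lib_engine lists the handlers reachable from client
	 * libraries, exec_engine lists the handlers (plus endian
	 * converters) for messages multicast through totem, and the
	 * *_count fields are computed from the array sizes so the tables
	 * can grow without touching anything else.
	 */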
sizeof (pload_exec_engine) / sizeof (struct corosync_exec_handler), .confchg_fn = pload_confchg_fn, .exec_init_fn = pload_exec_init_fn, .exec_dump_fn = NULL }; static DECLARE_LIST_INIT (confchg_notify); /* * Dynamic loading descriptor */ static struct corosync_service_engine *pload_get_service_engine_ver0 (void); static struct corosync_service_engine_iface_ver0 pload_service_engine_iface = { .corosync_get_service_engine_ver0 = pload_get_service_engine_ver0 }; static struct lcr_iface corosync_pload_ver0[1] = { { .name = "corosync_pload", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = NULL, .destructor = NULL, .interfaces = NULL, } }; static struct lcr_comp pload_comp_ver0 = { .iface_count = 1, .ifaces = corosync_pload_ver0 }; static struct corosync_service_engine *pload_get_service_engine_ver0 (void) { return (&pload_service_engine); } __attribute__ ((constructor)) static void pload_comp_register (void) { lcr_interfaces_set (&corosync_pload_ver0[0], &pload_service_engine_iface); lcr_component_register (&pload_comp_ver0); } static int pload_exec_init_fn ( struct corosync_api_v1 *corosync_api) { api = corosync_api; return 0; } static void pload_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id) { } static int pload_lib_init_fn (void *conn) { return (0); } static int pload_lib_exit_fn (void *conn) { return (0); } static void message_handler_req_pload_start (void *conn, void *msg) { struct req_lib_pload_start *req_lib_pload_start = (struct req_lib_pload_start *)msg; struct req_exec_pload_start req_exec_pload_start; struct iovec iov; req_exec_pload_start.header.id = SERVICE_ID_MAKE (PLOAD_SERVICE, MESSAGE_REQ_EXEC_PLOAD_START); req_exec_pload_start.msg_code = req_lib_pload_start->msg_code; req_exec_pload_start.msg_size = req_lib_pload_start->msg_size; req_exec_pload_start.msg_count = req_lib_pload_start->msg_count; req_exec_pload_start.time_interval = req_lib_pload_start->time_interval; iov.iov_base = &req_exec_pload_start; iov.iov_len = sizeof (struct req_exec_pload_start); api->totem_mcast (&iov, 1, TOTEM_AGREED); } static void req_exec_pload_start_endian_convert (void *msg) { } static void req_exec_pload_mcast_endian_convert (void *msg) { } static int send_message (enum totem_callback_token_type type, const void *arg) { struct req_exec_pload_mcast req_exec_pload_mcast; struct iovec iov[2]; unsigned int res; - int iov_len = 1; + unsigned int iov_len = 1; req_exec_pload_mcast.header.id = SERVICE_ID_MAKE (PLOAD_SERVICE, MESSAGE_REQ_EXEC_PLOAD_MCAST); req_exec_pload_mcast.header.size = sizeof (struct req_exec_pload_mcast) + msg_size; iov[0].iov_base = &req_exec_pload_mcast; iov[0].iov_len = sizeof (struct req_exec_pload_mcast); if (msg_size > sizeof (req_exec_pload_mcast)) { iov[1].iov_base = buffer; iov[1].iov_len = msg_size - sizeof (req_exec_pload_mcast); iov_len = 2; } do { res = api->totem_mcast (iov, iov_len, TOTEM_AGREED); if (res == -1) { break; } else { msgs_sent++; msg_code++; } } while (msgs_sent <= msgs_wanted); if (msgs_sent == msgs_wanted) { return (0); } else { return (-1); } } static void *token_callback; static void start_mcasting (void) { api->totem_callback_token_create ( &token_callback, TOTEM_CALLBACK_TOKEN_RECEIVED, 1, send_message, &token_callback); } static void 
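/*
 * When the multicast MESSAGE_REQ_EXEC_PLOAD_START below arrives on each
 * node, the handler records msg_count, msg_size and msg_code, then arms
 * a totem token callback so that send_message() runs as the token
 * rotates, multicasting MESSAGE_REQ_EXEC_PLOAD_MCAST messages until the
 * requested number have been sent.
 */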
message_handler_req_exec_pload_start ( const void *msg, unsigned int nodeid) { const struct req_exec_pload_start *req_exec_pload_start = msg; msgs_wanted = req_exec_pload_start->msg_count; msg_size = req_exec_pload_start->msg_size; msg_code = req_exec_pload_start->msg_code; start_mcasting (); } # define timersub(a, b, result) \ do { \ (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ (result)->tv_usec = (a)->tv_usec - (b)->tv_usec; \ if ((result)->tv_usec < 0) { \ --(result)->tv_sec; \ (result)->tv_usec += 1000000; \ } \ } while (0) struct timeval tv1; struct timeval tv2; struct timeval tv_elapsed; int last_msg_no = 0; static void message_handler_req_exec_pload_mcast ( const void *msg, unsigned int nodeid) { const struct req_exec_pload_mcast *pload_mcast = msg; assert (pload_mcast->msg_code - 1 == last_msg_no); last_msg_no = pload_mcast->msg_code; if (msgs_delivered == 0) { gettimeofday (&tv1, NULL); } msgs_delivered += 1; if (msgs_delivered == msgs_wanted) { gettimeofday (&tv2, NULL); timersub (&tv2, &tv1, &tv_elapsed); printf ("%5d Writes ", msgs_delivered); printf ("%5d bytes per write ", msg_size); printf ("%7.3f Seconds runtime ", (tv_elapsed.tv_sec + (tv_elapsed.tv_usec / 1000000.0))); printf ("%9.3f TP/s ", ((float)msgs_delivered) / (tv_elapsed.tv_sec + (tv_elapsed.tv_usec / 1000000.0))); printf ("%7.3f MB/s.\n", ((float)msgs_delivered) * ((float)msg_size) / ((tv_elapsed.tv_sec + (tv_elapsed.tv_usec / 1000000.0)) * 1000000.0)); } } diff --git a/services/votequorum.c b/services/votequorum.c index 1ece0a4b..784d2cca 100644 --- a/services/votequorum.c +++ b/services/votequorum.c @@ -1,1699 +1,1699 @@ /* * Copyright (c) 2009 Red Hat, Inc. * * All rights reserved. * * Author: Christine Caulfield (ccaulfie@redhat.com) * * This software licensed under BSD license, the text of which follows: * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * - Redistributions of source code must retain the above copyright notice, * this list of conditions and the following disclaimer. * - Redistributions in binary form must reproduce the above copyright notice, * this list of conditions and the following disclaimer in the documentation * and/or other materials provided with the distribution. * - Neither the name of the MontaVista Software, Inc. nor the names of its * contributors may be used to endorse or promote products derived from this * software without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF * THE POSSIBILITY OF SUCH DAMAGE. 
*/ #include #include #ifndef COROSYNC_BSD #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define VOTEQUORUM_MAJOR_VERSION 6 #define VOTEQUORUM_MINOR_VERSION 3 #define VOTEQUORUM_PATCH_VERSION 0 /* Silly default to prevent accidents! */ #define DEFAULT_EXPECTED 1024 #define DEFAULT_QDEV_POLL 10000 #define DEFAULT_LEAVE_TMO 10000 LOGSYS_DECLARE_SUBSYS ("VOTEQ", LOG_INFO); enum quorum_message_req_types { MESSAGE_REQ_EXEC_VOTEQUORUM_NODEINFO = 0, MESSAGE_REQ_EXEC_VOTEQUORUM_RECONFIGURE = 1, MESSAGE_REQ_EXEC_VOTEQUORUM_KILLNODE = 2, }; #define NODE_FLAGS_BEENDOWN 1 #define NODE_FLAGS_SEESDISALLOWED 8 #define NODE_FLAGS_HASSTATE 16 #define NODE_FLAGS_QDISK 32 #define NODE_FLAGS_REMOVED 64 #define NODE_FLAGS_US 128 typedef enum { NODESTATE_JOINING=1, NODESTATE_MEMBER, NODESTATE_DEAD, NODESTATE_LEAVING, NODESTATE_DISALLOWED } nodestate_t; /* This structure is tacked onto the start of a cluster message packet for our * own nefarious purposes. */ struct q_protheader { unsigned char tgtport; /* Target port number */ unsigned char srcport; /* Source (originating) port number */ unsigned short pad; unsigned int flags; int srcid; /* Node ID of the sender */ int tgtid; /* Node ID of the target */ } __attribute__((packed)); struct cluster_node { int flags; int node_id; unsigned int expected_votes; unsigned int votes; time_t join_time; nodestate_t state; struct timeval last_hello; /* Only used for quorum devices */ struct list_head list; }; static int quorum_flags; #define VOTEQUORUM_FLAG_FEATURE_DISALLOWED 1 #define VOTEQUORUM_FLAG_FEATURE_TWONODE 1 static int quorum; static int cluster_is_quorate; static int first_trans = 1; static unsigned int quorumdev_poll = DEFAULT_QDEV_POLL; static unsigned int leaving_timeout = DEFAULT_LEAVE_TMO; static struct cluster_node *us; static struct cluster_node *quorum_device = NULL; static char quorum_device_name[VOTEQUORUM_MAX_QDISK_NAME_LEN]; static corosync_timer_handle_t quorum_device_timer; static corosync_timer_handle_t leaving_timer; static struct list_head cluster_members_list; static struct corosync_api_v1 *corosync_api; static struct list_head trackers_list; static unsigned int quorum_members[PROCESSOR_COUNT_MAX+1]; static int quorum_members_entries = 0; static struct memb_ring_id quorum_ringid; static hdb_handle_t group_handle; #define max(a,b) (((a) > (b)) ? 
(a) : (b)) static struct cluster_node *find_node_by_nodeid(int nodeid); static struct cluster_node *allocate_node(int nodeid); static const char *kill_reason(int reason); static struct corosync_tpg_group quorum_group[1] = { { .group = "VOTEQ", .group_len = 5}, }; #define list_iterate(v, head) \ for (v = (head)->next; v != head; v = v->next) struct quorum_pd { unsigned char track_flags; int tracking_enabled; uint64_t tracking_context; struct list_head list; void *conn; }; /* * Service Interfaces required by service_message_handler struct */ static void votequorum_init(struct corosync_api_v1 *api, quorum_set_quorate_fn_t report); static void quorum_confchg_fn ( enum totem_configuration_type configuration_type, const unsigned int *member_list, size_t member_list_entries, const unsigned int *left_list, size_t left_list_entries, const unsigned int *joined_list, size_t joined_list_entries, const struct memb_ring_id *ring_id); -static void quorum_deliver_fn(unsigned int nodeid, struct iovec *iovec, int iov_len, +static void quorum_deliver_fn(unsigned int nodeid, struct iovec *iovec, unsigned int iov_len, int endian_conversion_required); static int votequorum_exec_init_fn (struct corosync_api_v1 *corosync_api); static int quorum_lib_init_fn (void *conn); static int quorum_lib_exit_fn (void *conn); static void message_handler_req_exec_quorum_nodeinfo ( void *message, unsigned int nodeid); static void message_handler_req_exec_quorum_reconfigure ( void *message, unsigned int nodeid); static void message_handler_req_exec_quorum_killnode ( void *message, unsigned int nodeid); static void message_handler_req_lib_votequorum_getinfo (void *conn, void *message); static void message_handler_req_lib_votequorum_setexpected (void *conn, void *message); static void message_handler_req_lib_votequorum_setvotes (void *conn, void *message); static void message_handler_req_lib_votequorum_qdisk_register (void *conn, void *message); static void message_handler_req_lib_votequorum_qdisk_unregister (void *conn, void *message); static void message_handler_req_lib_votequorum_qdisk_poll (void *conn, void *message); static void message_handler_req_lib_votequorum_qdisk_getinfo (void *conn, void *message); static void message_handler_req_lib_votequorum_setstate (void *conn, void *message); static void message_handler_req_lib_votequorum_leaving (void *conn, void *message); static void message_handler_req_lib_votequorum_trackstart (void *conn, void *msg); static void message_handler_req_lib_votequorum_trackstop (void *conn, void *msg); static int quorum_exec_send_nodeinfo(void); static int quorum_exec_send_reconfigure(int param, int nodeid, int value); static int quorum_exec_send_killnode(int nodeid, unsigned int reason); static void add_votequorum_config_notification(hdb_handle_t quorum_object_handle); static void recalculate_quorum(int allow_decrease, int by_current_nodes); /* * Library Handler Definition */ static struct corosync_lib_handler quorum_lib_service[] = { { /* 0 */ .lib_handler_fn = message_handler_req_lib_votequorum_getinfo, .response_size = sizeof (struct res_lib_votequorum_getinfo), .response_id = MESSAGE_RES_VOTEQUORUM_GETINFO, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 1 */ .lib_handler_fn = message_handler_req_lib_votequorum_setexpected, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 2 */ .lib_handler_fn = message_handler_req_lib_votequorum_setvotes, .response_size = 
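	/*
	 * The executive dispatches a library request by using the id in
	 * its message header as an index into this array, so the slot
	 * positions ("0" through "10") must stay aligned with the request
	 * id constants on the library side.
	 */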
sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 3 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdisk_register, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 4 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdisk_unregister, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 5 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdisk_poll, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 6 */ .lib_handler_fn = message_handler_req_lib_votequorum_qdisk_getinfo, .response_size = sizeof (struct res_lib_votequorum_qdisk_getinfo), .response_id = MESSAGE_RES_VOTEQUORUM_QDISK_GETINFO, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 7 */ .lib_handler_fn = message_handler_req_lib_votequorum_setstate, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 8 */ .lib_handler_fn = message_handler_req_lib_votequorum_leaving, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 9 */ .lib_handler_fn = message_handler_req_lib_votequorum_trackstart, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED }, { /* 10 */ .lib_handler_fn = message_handler_req_lib_votequorum_trackstop, .response_size = sizeof (struct res_lib_votequorum_status), .response_id = MESSAGE_RES_VOTEQUORUM_STATUS, .flow_control = COROSYNC_LIB_FLOW_CONTROL_NOT_REQUIRED } }; static quorum_set_quorate_fn_t set_quorum; /* * lcrso object definition */ static struct quorum_services_api_ver1 votequorum_iface_ver0 = { .init = votequorum_init }; static struct corosync_service_engine quorum_service_handler = { .name = "corosync votes quorum service v0.90", .id = VOTEQUORUM_SERVICE, .private_data_size = sizeof (struct quorum_pd), .allow_inquorate = CS_LIB_ALLOW_INQUORATE, .flow_control = COROSYNC_LIB_FLOW_CONTROL_REQUIRED, .lib_init_fn = quorum_lib_init_fn, .lib_exit_fn = quorum_lib_exit_fn, .lib_engine = quorum_lib_service, .lib_engine_count = sizeof (quorum_lib_service) / sizeof (struct corosync_lib_handler), .exec_init_fn = votequorum_exec_init_fn, .exec_engine = NULL, .exec_engine_count = 0, .confchg_fn = NULL, }; /* * Dynamic loader definition */ static struct corosync_service_engine *quorum_get_service_handler_ver0 (void); static struct corosync_service_engine_iface_ver0 quorum_service_handler_iface = { .corosync_get_service_engine_ver0 = quorum_get_service_handler_ver0 }; static struct lcr_iface corosync_quorum_ver0[2] = { { .name = "corosync_votequorum", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = NULL, .destructor = NULL, .interfaces = (void **)(void *)&votequorum_iface_ver0 }, { .name = "corosync_votequorum_iface", .version = 0, .versions_replace = 0, .versions_replace_count = 0, .dependencies = 0, .dependency_count = 0, .constructor = 
NULL, .destructor = NULL, .interfaces = NULL } }; static struct lcr_comp quorum_comp_ver0 = { .iface_count = 2, .ifaces = corosync_quorum_ver0 }; static struct corosync_service_engine *quorum_get_service_handler_ver0 (void) { return (&quorum_service_handler); } __attribute__ ((constructor)) static void quorum_comp_register (void) { lcr_interfaces_set (&corosync_quorum_ver0[0], &votequorum_iface_ver0); lcr_interfaces_set (&corosync_quorum_ver0[1], &quorum_service_handler_iface); lcr_component_register (&quorum_comp_ver0); } static void votequorum_init(struct corosync_api_v1 *api, quorum_set_quorate_fn_t report) { ENTER(); set_quorum = report; /* Load the library-servicing part of this module */ api->service_link_and_init(api, "corosync_votequorum_iface", 0); LEAVE(); } /* Message types */ #define VOTEQUORUM_MSG_NODEINFO 5 #define VOTEQUORUM_MSG_KILLNODE 6 #define VOTEQUORUM_MSG_RECONFIGURE 8 struct req_exec_quorum_nodeinfo { unsigned char cmd; unsigned char first_trans; unsigned int votes; unsigned int expected_votes; unsigned int major_version; /* Not backwards compatible */ unsigned int minor_version; /* Backwards compatible */ unsigned int patch_version; /* Backwards/forwards compatible */ unsigned int config_version; unsigned int flags; } __attribute__((packed)); /* Parameters for RECONFIG command */ #define RECONFIG_PARAM_EXPECTED_VOTES 1 #define RECONFIG_PARAM_NODE_VOTES 2 #define RECONFIG_PARAM_LEAVING 3 struct req_exec_quorum_reconfigure { unsigned char cmd; unsigned char param; unsigned short pad; int nodeid; unsigned int value; }; struct req_exec_quorum_killnode { unsigned char cmd; unsigned char pad1; uint16_t reason; int nodeid; }; /* These just make the access a little neater */ static inline int objdb_get_string(struct corosync_api_v1 *corosync, unsigned int object_service_handle, char *key, char **value) { int res; *value = NULL; if ( !(res = corosync_api->object_key_get(object_service_handle, key, strlen(key), (void *)value, NULL))) { if (*value) return 0; } return -1; } static inline void objdb_get_int(struct corosync_api_v1 *corosync, unsigned int object_service_handle, const char *key, unsigned int *intvalue, unsigned int default_value) { char *value = NULL; *intvalue = default_value; if (!corosync_api->object_key_get(object_service_handle, key, strlen(key), (void *)&value, NULL)) { if (value) { *intvalue = atoi(value); } } } static int votequorum_send_message(void *message, int len) { struct iovec iov[2]; struct q_protheader header; header.tgtport = 0; header.srcport = 0; header.flags = 0; header.srcid = us->node_id; header.tgtid = 0; iov[0].iov_base = &header; iov[0].iov_len = sizeof(header); iov[1].iov_base = message; iov[1].iov_len = len; return corosync_api->tpg_joined_mcast(group_handle, iov, 2, TOTEM_AGREED); } static void read_quorum_config(unsigned int quorum_handle) { unsigned int value = 0; int cluster_members = 0; struct list_head *tmp; struct cluster_node *node; log_printf(LOG_INFO, "Reading configuration\n"); objdb_get_int(corosync_api, quorum_handle, "expected_votes", &us->expected_votes, DEFAULT_EXPECTED); objdb_get_int(corosync_api, quorum_handle, "votes", &us->votes, 1); objdb_get_int(corosync_api, quorum_handle, "quorumdev_poll", &quorumdev_poll, DEFAULT_QDEV_POLL); objdb_get_int(corosync_api, quorum_handle, "leaving_timeout", &leaving_timeout, DEFAULT_LEAVE_TMO); objdb_get_int(corosync_api, quorum_handle, "disallowed", &value, 0); if (value) quorum_flags |= VOTEQUORUM_FLAG_FEATURE_DISALLOWED; else quorum_flags &= ~VOTEQUORUM_FLAG_FEATURE_DISALLOWED; 
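	/*
	 * All of these keys are read from the quorum{} object of the
	 * configuration; a hypothetical stanza using only the keys
	 * consumed here (values purely illustrative):
	 *
	 *	quorum {
	 *		expected_votes: 3
	 *		votes: 1
	 *		quorumdev_poll: 10000
	 *		two_node: 0
	 *	}
	 */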
objdb_get_int(corosync_api, quorum_handle, "two_node", &value, 0); if (value) quorum_flags |= VOTEQUORUM_FLAG_FEATURE_TWONODE; else quorum_flags &= ~VOTEQUORUM_FLAG_FEATURE_TWONODE; /* * two_node mode is invalid if there are more than 2 nodes in the cluster! */ list_iterate(tmp, &cluster_members_list) { node = list_entry(tmp, struct cluster_node, list); cluster_members++; } if (quorum_flags & VOTEQUORUM_FLAG_FEATURE_TWONODE && cluster_members > 2) { log_printf(LOG_WARNING, "quorum.two_node was set but there are more than 2 nodes in the cluster. It will be ignored."); quorum_flags &= ~VOTEQUORUM_FLAG_FEATURE_TWONODE; } } static int votequorum_exec_init_fn (struct corosync_api_v1 *api) { hdb_handle_t object_handle; hdb_handle_t find_handle; ENTER(); corosync_api = api; list_init(&cluster_members_list); list_init(&trackers_list); /* Allocate a cluster_node for us */ us = allocate_node(corosync_api->totem_nodeid_get()); if (!us) return (1); us->flags |= NODE_FLAGS_US; us->state = NODESTATE_MEMBER; us->expected_votes = DEFAULT_EXPECTED; us->votes = 1; time(&us->join_time); /* Get configuration variables */ corosync_api->object_find_create(OBJECT_PARENT_HANDLE, "quorum", strlen("quorum"), &find_handle); if (corosync_api->object_find_next(find_handle, &object_handle) == 0) { read_quorum_config(object_handle); } recalculate_quorum(0, 0); /* Listen for changes */ add_votequorum_config_notification(object_handle); corosync_api->object_find_destroy(find_handle); api->tpg_init(&group_handle, quorum_deliver_fn, quorum_confchg_fn); api->tpg_join(group_handle, quorum_group, 1); LEAVE(); return (0); } static int quorum_lib_exit_fn (void *conn) { struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn); ENTER(); if (quorum_pd->tracking_enabled) { list_del (&quorum_pd->list); list_init (&quorum_pd->list); } LEAVE(); return (0); } static int send_quorum_notification(void *conn, uint64_t context) { struct res_lib_votequorum_notification *res_lib_votequorum_notification; struct list_head *tmp; struct cluster_node *node; int cluster_members = 0; int i = 0; int size; char *buf; ENTER(); list_iterate(tmp, &cluster_members_list) { node = list_entry(tmp, struct cluster_node, list); cluster_members++; } if (quorum_device) cluster_members++; size = sizeof(struct res_lib_votequorum_notification) + sizeof(struct votequorum_node) * cluster_members; buf = alloca(size); if (!buf) { LEAVE(); return -1; } res_lib_votequorum_notification = (struct res_lib_votequorum_notification *)buf; res_lib_votequorum_notification->quorate = cluster_is_quorate; res_lib_votequorum_notification->node_list_entries = cluster_members; res_lib_votequorum_notification->context = context; list_iterate(tmp, &cluster_members_list) { node = list_entry(tmp, struct cluster_node, list); res_lib_votequorum_notification->node_list[i].nodeid = node->node_id; res_lib_votequorum_notification->node_list[i++].state = node->state; } if (quorum_device) { res_lib_votequorum_notification->node_list[i].nodeid = 0; res_lib_votequorum_notification->node_list[i++].state = quorum_device->state | 0x80; } res_lib_votequorum_notification->header.id = MESSAGE_RES_VOTEQUORUM_NOTIFICATION; res_lib_votequorum_notification->header.size = size; res_lib_votequorum_notification->header.error = CS_OK; /* Send it to all interested parties */ if (conn) { int ret = corosync_api->ipc_dispatch_send(conn, buf, size); LEAVE(); return ret; } else { struct quorum_pd *qpd; list_iterate(tmp, &trackers_list) { qpd = list_entry(tmp, struct quorum_pd, list); 
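		/*
		 * Every registered tracker receives the same marshalled
		 * buffer; only the context field is rewritten, to the
		 * 64-bit value the client supplied when it started
		 * tracking, so the library can route the notification to
		 * the right caller.
		 */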
res_lib_votequorum_notification->context = qpd->tracking_context; corosync_api->ipc_dispatch_send(qpd->conn, buf, size); } } LEAVE(); return 0; } static void send_expectedvotes_notification(void) { struct res_lib_votequorum_expectedvotes_notification res_lib_votequorum_expectedvotes_notification; struct quorum_pd *qpd; struct list_head *tmp; log_printf(LOG_DEBUG, "Sending expected votes callback\n"); res_lib_votequorum_expectedvotes_notification.header.id = MESSAGE_RES_VOTEQUORUM_EXPECTEDVOTES_NOTIFICATION; res_lib_votequorum_expectedvotes_notification.header.size = sizeof(res_lib_votequorum_expectedvotes_notification); res_lib_votequorum_expectedvotes_notification.header.error = CS_OK; res_lib_votequorum_expectedvotes_notification.expected_votes = us->expected_votes; list_iterate(tmp, &trackers_list) { qpd = list_entry(tmp, struct quorum_pd, list); res_lib_votequorum_expectedvotes_notification.context = qpd->tracking_context; corosync_api->ipc_dispatch_send(qpd->conn, &res_lib_votequorum_expectedvotes_notification, sizeof(struct res_lib_votequorum_expectedvotes_notification)); } } static void set_quorate(int total_votes) { int quorate; ENTER(); if (quorum > total_votes) { quorate = 0; } else { quorate = 1; } if (cluster_is_quorate && !quorate) log_printf(LOG_INFO, "quorum lost, blocking activity\n"); if (!cluster_is_quorate && quorate) log_printf(LOG_INFO, "quorum regained, resuming activity\n"); /* If we are newly quorate, then kill any DISALLOWED nodes */ if (!cluster_is_quorate && quorate) { struct cluster_node *node = NULL; struct list_head *tmp; list_iterate(tmp, &cluster_members_list) { node = list_entry(tmp, struct cluster_node, list); if (node->state == NODESTATE_DISALLOWED) quorum_exec_send_killnode(node->node_id, VOTEQUORUM_REASON_KILL_REJOIN); } } cluster_is_quorate = quorate; set_quorum(quorum_members, quorum_members_entries, quorate, &quorum_ringid); ENTER(); } static int calculate_quorum(int allow_decrease, int max_expected, unsigned int *ret_total_votes) { struct list_head *nodelist; struct cluster_node *node; unsigned int total_votes = 0; unsigned int highest_expected = 0; unsigned int newquorum, q1, q2; unsigned int total_nodes = 0; ENTER(); list_iterate(nodelist, &cluster_members_list) { node = list_entry(nodelist, struct cluster_node, list); log_printf(LOG_DEBUG, "node %x state=%d, votes=%d, expected=%d\n", node->node_id, node->state, node->votes, node->expected_votes); if (node->state == NODESTATE_MEMBER) { if (max_expected) node->expected_votes = max_expected; else highest_expected = max(highest_expected, node->expected_votes); total_votes += node->votes; total_nodes++; } } if (quorum_device && quorum_device->state == NODESTATE_MEMBER) total_votes += quorum_device->votes; if (max_expected > 0) highest_expected = max_expected; /* This quorum calculation is taken from the OpenVMS Cluster Systems * manual, but, then, you guessed that didn't you */ q1 = (highest_expected + 2) / 2; q2 = (total_votes + 2) / 2; newquorum = max(q1, q2); /* Normally quorum never decreases but the system administrator can * force it down by setting expected votes to a maximum value */ if (!allow_decrease) newquorum = max(quorum, newquorum); /* The special two_node mode allows each of the two nodes to retain * quorum if the other fails. Only one of the two should live past * fencing (as both nodes try to fence each other in split-brain.) * Also: if there are more than two nodes, force us inquorate to avoid * any damage or confusion. 
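 *
 * A worked example of the calculation above: five nodes with one vote
 * each and expected_votes = 5 give q1 = (5 + 2) / 2 = 3 and
 * q2 = (5 + 2) / 2 = 3, so quorum is 3.  If two nodes die, q2 falls to
 * (3 + 2) / 2 = 2 but q1 (and the no-decrease rule) hold quorum at 3:
 * the three survivors remain quorate while a split-off pair cannot be.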
    if ((quorum_flags & VOTEQUORUM_FLAG_FEATURE_TWONODE) && total_nodes <= 2)
        newquorum = 1;

    if (ret_total_votes)
        *ret_total_votes = total_votes;

    LEAVE();
    return newquorum;
}

/* Recalculate cluster quorum, set quorate and notify changes */
static void recalculate_quorum(int allow_decrease, int by_current_nodes)
{
    unsigned int total_votes = 0;
    int cluster_members = 0;
    struct list_head *nodelist;
    struct cluster_node *node;

    ENTER();

    list_iterate(nodelist, &cluster_members_list) {
        node = list_entry(nodelist, struct cluster_node, list);
        if (node->state == NODESTATE_MEMBER) {
            if (by_current_nodes)
                cluster_members++;
            total_votes += node->votes;
        }
    }

    /* Keep expected_votes at the highest number of votes in the cluster */
    log_printf(LOG_DEBUG, "total_votes=%d, expected_votes=%d\n", total_votes, us->expected_votes);
    if (total_votes > us->expected_votes) {
        us->expected_votes = total_votes;
        send_expectedvotes_notification();
    }

    quorum = calculate_quorum(allow_decrease, cluster_members, &total_votes);
    set_quorate(total_votes);

    send_quorum_notification(NULL, 0L);

    LEAVE();
}

static int have_disallowed(void)
{
    struct cluster_node *node;
    struct list_head *tmp;

    list_iterate(tmp, &cluster_members_list) {
        node = list_entry(tmp, struct cluster_node, list);
        if (node->state == NODESTATE_DISALLOWED)
            return 1;
    }

    return 0;
}

static void node_add_ordered(struct cluster_node *newnode)
{
    struct cluster_node *node = NULL;
    struct list_head *tmp;
    struct list_head *newlist = &newnode->list;

    list_iterate(tmp, &cluster_members_list) {
        node = list_entry(tmp, struct cluster_node, list);
        if (newnode->node_id < node->node_id)
            break;
    }

    if (!node)
        list_add(&newnode->list, &cluster_members_list);
    else {
        newlist->prev = tmp->prev;
        newlist->next = tmp;
        tmp->prev->next = newlist;
        tmp->prev = newlist;
    }
}

static struct cluster_node *allocate_node(int nodeid)
{
    struct cluster_node *cl;

    cl = malloc(sizeof(struct cluster_node));
    if (cl) {
        memset(cl, 0, sizeof(struct cluster_node));
        cl->node_id = nodeid;
        if (nodeid)
            node_add_ordered(cl);
    }

    return cl;
}

static struct cluster_node *find_node_by_nodeid(int nodeid)
{
    struct cluster_node *node;
    struct list_head *tmp;

    list_iterate(tmp, &cluster_members_list) {
        node = list_entry(tmp, struct cluster_node, list);
        if (node->node_id == nodeid)
            return node;
    }

    return NULL;
}

static int quorum_exec_send_nodeinfo(void)
{
    struct req_exec_quorum_nodeinfo req_exec_quorum_nodeinfo;
    int ret;

    ENTER();

    req_exec_quorum_nodeinfo.cmd = VOTEQUORUM_MSG_NODEINFO;
    req_exec_quorum_nodeinfo.expected_votes = us->expected_votes;
    req_exec_quorum_nodeinfo.votes = us->votes;
    req_exec_quorum_nodeinfo.major_version = VOTEQUORUM_MAJOR_VERSION;
    req_exec_quorum_nodeinfo.minor_version = VOTEQUORUM_MINOR_VERSION;
    req_exec_quorum_nodeinfo.patch_version = VOTEQUORUM_PATCH_VERSION;
    req_exec_quorum_nodeinfo.flags = us->flags;
    req_exec_quorum_nodeinfo.first_trans = first_trans;
    if (have_disallowed())
        req_exec_quorum_nodeinfo.flags |= NODE_FLAGS_SEESDISALLOWED;

    ret = votequorum_send_message(&req_exec_quorum_nodeinfo, sizeof(req_exec_quorum_nodeinfo));

    LEAVE();
    return ret;
}

static int quorum_exec_send_reconfigure(int param, int nodeid, int value)
{
    struct req_exec_quorum_reconfigure req_exec_quorum_reconfigure;
    int ret;

    ENTER();

    req_exec_quorum_reconfigure.cmd = VOTEQUORUM_MSG_RECONFIGURE;
    req_exec_quorum_reconfigure.param = param;
    req_exec_quorum_reconfigure.nodeid = nodeid;
    req_exec_quorum_reconfigure.value = value;

    ret = votequorum_send_message(&req_exec_quorum_reconfigure, sizeof(req_exec_quorum_reconfigure));

    LEAVE();
    return ret;
}
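/*
 * Both senders above broadcast through votequorum_send_message, so a
 * reconfiguration initiated on any one node (for example a setvotes
 * library request turning into RECONFIG_PARAM_NODE_VOTES) is applied by
 * message_handler_req_exec_quorum_reconfigure on every node, keeping the
 * vote table identical cluster-wide.
 */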
static int quorum_exec_send_killnode(int nodeid, unsigned int reason)
{
    struct req_exec_quorum_killnode req_exec_quorum_killnode;
    int ret;

    ENTER();

    req_exec_quorum_killnode.cmd = VOTEQUORUM_MSG_KILLNODE;
    req_exec_quorum_killnode.nodeid = nodeid;
    req_exec_quorum_killnode.reason = reason;

    ret = votequorum_send_message(&req_exec_quorum_killnode, sizeof(req_exec_quorum_killnode));

    LEAVE();
    return ret;
}

static void quorum_confchg_fn (
    enum totem_configuration_type configuration_type,
    const unsigned int *member_list, size_t member_list_entries,
    const unsigned int *left_list, size_t left_list_entries,
    const unsigned int *joined_list, size_t joined_list_entries,
    const struct memb_ring_id *ring_id)
{
    int i;
    int leaving = 0;
    struct cluster_node *node;

    ENTER();

    if (member_list_entries > 1)
        first_trans = 0;

    if (left_list_entries) {
        for (i = 0; i < left_list_entries; i++) {
            node = find_node_by_nodeid(left_list[i]);
            if (node) {
                if (node->state == NODESTATE_LEAVING)
                    leaving = 1;
                node->state = NODESTATE_DEAD;
                node->flags |= NODE_FLAGS_BEENDOWN;
            }
        }
        recalculate_quorum(leaving, leaving);
    }

    if (member_list_entries) {
        memcpy(quorum_members, member_list, sizeof(unsigned int) * member_list_entries);
        quorum_members_entries = member_list_entries;
        if (quorum_device) {
            quorum_members[quorum_members_entries++] = 0;
        }
        quorum_exec_send_nodeinfo();
    }

    memcpy(&quorum_ringid, ring_id, sizeof(*ring_id));

    LEAVE();
}

static void exec_quorum_nodeinfo_endian_convert (void *msg)
{
    struct req_exec_quorum_nodeinfo *nodeinfo = (struct req_exec_quorum_nodeinfo *)msg;

    nodeinfo->votes = swab32(nodeinfo->votes);
    nodeinfo->expected_votes = swab32(nodeinfo->expected_votes);
    nodeinfo->major_version = swab32(nodeinfo->major_version);
    nodeinfo->minor_version = swab32(nodeinfo->minor_version);
    nodeinfo->patch_version = swab32(nodeinfo->patch_version);
    nodeinfo->config_version = swab32(nodeinfo->config_version);
    nodeinfo->flags = swab32(nodeinfo->flags);
}

static void exec_quorum_reconfigure_endian_convert (void *msg)
{
    struct req_exec_quorum_reconfigure *reconfigure = (struct req_exec_quorum_reconfigure *)msg;

    reconfigure->nodeid = swab32(reconfigure->nodeid);
    reconfigure->value = swab32(reconfigure->value);
}

static void exec_quorum_killnode_endian_convert (void *msg)
{
    struct req_exec_quorum_killnode *killnode = (struct req_exec_quorum_killnode *)msg;

    killnode->reason = swab16(killnode->reason);
    killnode->nodeid = swab32(killnode->nodeid);
}
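/*
 * Messages travel in the sender's native byte order; totem flags those
 * that arrived from a peer with a different order (via
 * endian_conversion_required below), and the three converters above then
 * byte-swap every multi-byte field in place before the handlers run.
 */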
-static void quorum_deliver_fn(unsigned int nodeid, struct iovec *iovec, int iov_len,
+static void quorum_deliver_fn(unsigned int nodeid, struct iovec *iovec, unsigned int iov_len,
    int endian_conversion_required)
{
    struct q_protheader *header = iovec->iov_base;
    char *buf;

    ENTER();

    if (endian_conversion_required) {
        header->srcid = swab32(header->srcid);
        header->tgtid = swab32(header->tgtid);
        header->flags = swab32(header->flags);
    }

    /* Only pass on messages for us or everyone */
    if (header->tgtport == 0 && (header->tgtid == us->node_id || header->tgtid == 0)) {
        buf = (char *)(iovec->iov_base) + sizeof(struct q_protheader);
        switch (*buf) {
        case VOTEQUORUM_MSG_NODEINFO:
            if (endian_conversion_required)
                exec_quorum_nodeinfo_endian_convert(buf);
            message_handler_req_exec_quorum_nodeinfo (buf, header->srcid);
            break;
        case VOTEQUORUM_MSG_RECONFIGURE:
            if (endian_conversion_required)
                exec_quorum_reconfigure_endian_convert(buf);
            message_handler_req_exec_quorum_reconfigure (buf, header->srcid);
            break;
        case VOTEQUORUM_MSG_KILLNODE:
            if (endian_conversion_required)
                exec_quorum_killnode_endian_convert(buf);
            message_handler_req_exec_quorum_killnode (buf, header->srcid);
            break;
            /* Just ignore other messages */
        }
    }

    LEAVE();
}

static void message_handler_req_exec_quorum_nodeinfo (void *message, unsigned int nodeid)
{
    struct req_exec_quorum_nodeinfo *req_exec_quorum_nodeinfo = (struct req_exec_quorum_nodeinfo *)message;
    struct cluster_node *node;
    int old_votes;
    int old_expected;
    nodestate_t old_state;
    int new_node = 0;

    ENTER();

    log_printf(LOG_LEVEL_DEBUG, "got nodeinfo message from cluster node %d\n", nodeid);

    node = find_node_by_nodeid(nodeid);
    if (!node) {
        node = allocate_node(nodeid);
        new_node = 1;
    }
    if (!node) {
        corosync_api->error_memory_failure();
        return;
    }

    /*
     * If the node sending the message sees disallowed nodes and we don't, then
     * we have to leave
     */
    if (req_exec_quorum_nodeinfo->flags & NODE_FLAGS_SEESDISALLOWED && !have_disallowed()) {
        /* Must use syslog directly here or the message will never arrive */
        syslog(LOG_CRIT, "[VOTEQ]: Joined a cluster with disallowed nodes. must die");
        corosync_api->fatal_error(2, __FILE__, __LINE__);
        exit(2);
    }

    old_votes = node->votes;
    old_expected = node->expected_votes;
    old_state = node->state;

    /* Update node state */
    if (req_exec_quorum_nodeinfo->minor_version >= 2)
        node->votes = req_exec_quorum_nodeinfo->votes;
    node->expected_votes = req_exec_quorum_nodeinfo->expected_votes;
    node->state = NODESTATE_MEMBER;

    /* Check flags for disallowed (if enabled) */
    if (quorum_flags & VOTEQUORUM_FLAG_FEATURE_DISALLOWED) {
        if ((req_exec_quorum_nodeinfo->flags & NODE_FLAGS_HASSTATE && node->flags & NODE_FLAGS_BEENDOWN) ||
            (req_exec_quorum_nodeinfo->flags & NODE_FLAGS_HASSTATE && req_exec_quorum_nodeinfo->first_trans &&
             !(node->flags & NODE_FLAGS_US) && (us->flags & NODE_FLAGS_HASSTATE))) {
            if (node->state != NODESTATE_DISALLOWED) {
                if (cluster_is_quorate) {
                    log_printf(LOG_CRIT, "Killing node %d because it has rejoined the cluster with existing state", node->node_id);
                    node->state = NODESTATE_DISALLOWED;
                    quorum_exec_send_killnode(nodeid, VOTEQUORUM_REASON_KILL_REJOIN);
                } else {
                    log_printf(LOG_CRIT, "Node %d not joined to quorum because it has existing state", node->node_id);
                    node->state = NODESTATE_DISALLOWED;
                }
            }
        }
    }
    node->flags &= ~NODE_FLAGS_BEENDOWN;

    if (new_node || old_votes != node->votes || old_expected != node->expected_votes || old_state != node->state)
        recalculate_quorum(0, 0);

    LEAVE();
}

static void message_handler_req_exec_quorum_killnode (void *message, unsigned int nodeid)
{
    struct req_exec_quorum_killnode *req_exec_quorum_killnode = (struct req_exec_quorum_killnode *)message;

    if (req_exec_quorum_killnode->nodeid == corosync_api->totem_nodeid_get()) {
        log_printf(LOG_CRIT, "Killed by node %d: %s\n", nodeid, kill_reason(req_exec_quorum_killnode->reason));
        corosync_api->fatal_error(1, __FILE__, __LINE__);
        exit(1);
    }
}
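/*
 * The reconfigure handler below applies three cluster-wide parameter
 * changes: RECONFIG_PARAM_EXPECTED_VOTES caps every member's
 * expected_votes, RECONFIG_PARAM_NODE_VOTES rewrites one node's votes,
 * and RECONFIG_PARAM_LEAVING toggles the LEAVING state (set to 1 by a
 * leave request and back to 0 by leaving_timer_fn if the node never
 * actually shuts down).
 */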
static void message_handler_req_exec_quorum_reconfigure (void *message, unsigned int nodeid)
{
    struct req_exec_quorum_reconfigure *req_exec_quorum_reconfigure = (struct req_exec_quorum_reconfigure *)message;
    struct cluster_node *node;
    struct list_head *nodelist;

    log_printf(LOG_LEVEL_DEBUG, "got reconfigure message from cluster node %d\n", nodeid);

    node = find_node_by_nodeid(req_exec_quorum_reconfigure->nodeid);
    if (!node)
        return;

    switch (req_exec_quorum_reconfigure->param) {
    case RECONFIG_PARAM_EXPECTED_VOTES:
        list_iterate(nodelist, &cluster_members_list) {
            node = list_entry(nodelist, struct cluster_node, list);
            if (node->state == NODESTATE_MEMBER &&
                node->expected_votes > req_exec_quorum_reconfigure->value) {
                node->expected_votes = req_exec_quorum_reconfigure->value;
            }
        }
        send_expectedvotes_notification();
        recalculate_quorum(1, 0);  /* Allow decrease */
        break;

    case RECONFIG_PARAM_NODE_VOTES:
        node->votes = req_exec_quorum_reconfigure->value;
        recalculate_quorum(1, 0);  /* Allow decrease */
        break;

    case RECONFIG_PARAM_LEAVING:
        if (req_exec_quorum_reconfigure->value == 1 && node->state == NODESTATE_MEMBER)
            node->state = NODESTATE_LEAVING;
        if (req_exec_quorum_reconfigure->value == 0 && node->state == NODESTATE_LEAVING)
            node->state = NODESTATE_MEMBER;
        break;
    }
}

static int quorum_lib_init_fn (void *conn)
{
    struct quorum_pd *pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);

    ENTER();

    list_init (&pd->list);
    pd->conn = conn;

    LEAVE();
    return (0);
}

/*
 * Someone called votequorum_leave AGES ago!
 * Assume they forgot to shut down the node.
 */
static void leaving_timer_fn(void *arg)
{
    ENTER();

    if (us->state == NODESTATE_LEAVING)
        us->state = NODESTATE_MEMBER;

    /* Tell everyone else we made a mistake */
    quorum_exec_send_reconfigure(RECONFIG_PARAM_LEAVING, us->node_id, 0);

    LEAVE();
}

/* Message from the library */
static void message_handler_req_lib_votequorum_getinfo (void *conn, void *message)
{
    struct req_lib_votequorum_getinfo *req_lib_votequorum_getinfo = (struct req_lib_votequorum_getinfo *)message;
    struct res_lib_votequorum_getinfo res_lib_votequorum_getinfo;
    struct cluster_node *node;
    unsigned int highest_expected = 0;
    unsigned int total_votes = 0;
    cs_error_t error = CS_OK;

    log_printf(LOG_LEVEL_DEBUG, "got getinfo request on %p for node %d\n", conn, req_lib_votequorum_getinfo->nodeid);

    if (req_lib_votequorum_getinfo->nodeid) {
        node = find_node_by_nodeid(req_lib_votequorum_getinfo->nodeid);
    } else {
        node = us;
    }

    if (node) {
        struct cluster_node *iternode;
        struct list_head *nodelist;

        list_iterate(nodelist, &cluster_members_list) {
            iternode = list_entry(nodelist, struct cluster_node, list);

            if (iternode->state == NODESTATE_MEMBER) {
                highest_expected = max(highest_expected, iternode->expected_votes);
                total_votes += iternode->votes;
            }
        }

        if (quorum_device && quorum_device->state == NODESTATE_MEMBER) {
            total_votes += quorum_device->votes;
        }

        res_lib_votequorum_getinfo.votes = us->votes;
        res_lib_votequorum_getinfo.expected_votes = us->expected_votes;
        res_lib_votequorum_getinfo.highest_expected = highest_expected;
        res_lib_votequorum_getinfo.quorum = quorum;
        res_lib_votequorum_getinfo.total_votes = total_votes;
        res_lib_votequorum_getinfo.flags = 0;
        res_lib_votequorum_getinfo.nodeid = node->node_id;

        if (us->flags & NODE_FLAGS_HASSTATE)
            res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_FLAG_HASSTATE;
        if (quorum_flags & VOTEQUORUM_FLAG_FEATURE_TWONODE)
            res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_FLAG_TWONODE;
        if (cluster_is_quorate)
            res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_FLAG_QUORATE;
        if (us->flags & NODE_FLAGS_SEESDISALLOWED)
            res_lib_votequorum_getinfo.flags |= VOTEQUORUM_INFO_FLAG_DISALLOWED;
    } else {
        error = CS_ERR_NOT_EXIST;
    }

    res_lib_votequorum_getinfo.header.size = sizeof(res_lib_votequorum_getinfo);
    res_lib_votequorum_getinfo.header.id = MESSAGE_RES_VOTEQUORUM_GETINFO;
    res_lib_votequorum_getinfo.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_getinfo, sizeof(res_lib_votequorum_getinfo));
    log_printf(LOG_LEVEL_DEBUG, "getinfo response error: %d\n", error);
}

/* Message from the library */
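/*
 * The setexpected handler below rejects values that would make the
 * recomputed quorum nonsensical: with total_votes = 5, for example, any
 * request that drives quorum below 5 / 2 = 2 or above 5 comes back as
 * CS_ERR_INVALID_PARAM instead of being broadcast.
 */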
static void message_handler_req_lib_votequorum_setexpected (void *conn, void *message)
{
    struct req_lib_votequorum_setexpected *req_lib_votequorum_setexpected =
        (struct req_lib_votequorum_setexpected *)message;
    struct res_lib_votequorum_status res_lib_votequorum_status;
    cs_error_t error = CS_OK;
    unsigned int newquorum;
    unsigned int total_votes;

    ENTER();

    /*
     * If there are disallowed nodes, then we can't allow the user
     * to bypass them by fiddling with expected votes.
     */
    if (quorum_flags & VOTEQUORUM_FLAG_FEATURE_DISALLOWED && have_disallowed()) {
        error = CS_ERR_EXIST;
        goto error_exit;
    }

    /* Validate new expected votes */
    newquorum = calculate_quorum(1, req_lib_votequorum_setexpected->expected_votes, &total_votes);
    if (newquorum < total_votes / 2 || newquorum > total_votes) {
        error = CS_ERR_INVALID_PARAM;
        goto error_exit;
    }

    quorum_exec_send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, us->node_id,
                     req_lib_votequorum_setexpected->expected_votes);

    /* send status */
error_exit:
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

/* Message from the library */
static void message_handler_req_lib_votequorum_setvotes (void *conn, void *message)
{
    struct req_lib_votequorum_setvotes *req_lib_votequorum_setvotes =
        (struct req_lib_votequorum_setvotes *)message;
    struct res_lib_votequorum_status res_lib_votequorum_status;
    struct cluster_node *node;
    unsigned int newquorum;
    unsigned int total_votes;
    unsigned int saved_votes;
    cs_error_t error = CS_OK;

    ENTER();

    node = find_node_by_nodeid(req_lib_votequorum_setvotes->nodeid);
    if (!node) {
        error = CS_ERR_NAME_NOT_FOUND;
        goto error_exit;
    }

    /* Check votes is valid */
    saved_votes = node->votes;
    node->votes = req_lib_votequorum_setvotes->votes;

    newquorum = calculate_quorum(1, 0, &total_votes);

    if (newquorum < total_votes / 2 || newquorum > total_votes) {
        node->votes = saved_votes;
        error = CS_ERR_INVALID_PARAM;
        goto error_exit;
    }

    if (!req_lib_votequorum_setvotes->nodeid)
        req_lib_votequorum_setvotes->nodeid = corosync_api->totem_nodeid_get();

    quorum_exec_send_reconfigure(RECONFIG_PARAM_NODE_VOTES, req_lib_votequorum_setvotes->nodeid,
                     req_lib_votequorum_setvotes->votes);

error_exit:
    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static void message_handler_req_lib_votequorum_leaving (void *conn, void *message)
{
    struct res_lib_votequorum_status res_lib_votequorum_status;
    cs_error_t error = CS_OK;

    ENTER();

    quorum_exec_send_reconfigure(RECONFIG_PARAM_LEAVING, us->node_id, 1);

    /*
     * If we don't shut down in a sensible amount of time then cancel the
     * leave status.
     */
    if (leaving_timeout)
        corosync_api->timer_add_duration((unsigned long long)leaving_timeout * 1000000, NULL,
                         leaving_timer_fn, &leaving_timer);

    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static void quorum_device_timer_fn(void *arg)
{
    struct timeval now;

    ENTER();

    if (!quorum_device || quorum_device->state == NODESTATE_DEAD) {
        LEAVE();
        return;
    }

    gettimeofday(&now, NULL);
    if (quorum_device->last_hello.tv_sec + quorumdev_poll / 1000 < now.tv_sec) {
        quorum_device->state = NODESTATE_DEAD;
        log_printf(LOG_INFO, "lost contact with quorum device\n");
        recalculate_quorum(0, 0);
    } else {
        corosync_api->timer_add_duration((unsigned long long)quorumdev_poll * 1000000, quorum_device,
                         quorum_device_timer_fn, &quorum_device_timer);
    }

    LEAVE();
}

static void message_handler_req_lib_votequorum_qdisk_register (void *conn, void *message)
{
    struct req_lib_votequorum_qdisk_register *req_lib_votequorum_qdisk_register =
        (struct req_lib_votequorum_qdisk_register *)message;
    struct res_lib_votequorum_status res_lib_votequorum_status;
    cs_error_t error = CS_OK;

    ENTER();

    if (quorum_device) {
        error = CS_ERR_EXIST;
    } else {
        quorum_device = allocate_node(0);
        quorum_device->state = NODESTATE_DEAD;
        quorum_device->votes = req_lib_votequorum_qdisk_register->votes;
        strcpy(quorum_device_name, req_lib_votequorum_qdisk_register->name);
        list_add(&quorum_device->list, &cluster_members_list);
    }

    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static void message_handler_req_lib_votequorum_qdisk_unregister (void *conn, void *message)
{
    struct res_lib_votequorum_status res_lib_votequorum_status;
    cs_error_t error = CS_OK;

    ENTER();

    if (quorum_device) {
        struct cluster_node *node = quorum_device;

        quorum_device = NULL;
        list_del(&node->list);
        free(node);
        recalculate_quorum(0, 0);
    } else {
        error = CS_ERR_NOT_EXIST;
    }

    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static void message_handler_req_lib_votequorum_qdisk_poll (void *conn, void *message)
{
    struct req_lib_votequorum_qdisk_poll *req_lib_votequorum_qdisk_poll =
        (struct req_lib_votequorum_qdisk_poll *)message;
    struct res_lib_votequorum_status res_lib_votequorum_status;
    cs_error_t error = CS_OK;

    ENTER();

    if (quorum_device) {
        if (req_lib_votequorum_qdisk_poll->state) {
            gettimeofday(&quorum_device->last_hello, NULL);
            if (quorum_device->state == NODESTATE_DEAD) {
                quorum_device->state = NODESTATE_MEMBER;
                recalculate_quorum(0, 0);

                corosync_api->timer_add_duration((unsigned long long)quorumdev_poll * 1000000, quorum_device,
                                 quorum_device_timer_fn, &quorum_device_timer);
            }
        } else {
            if (quorum_device->state == NODESTATE_MEMBER) {
                quorum_device->state = NODESTATE_DEAD;
                recalculate_quorum(0, 0);
                corosync_api->timer_delete(quorum_device_timer);
            }
        }
    } else {
        error = CS_ERR_NOT_EXIST;
    }

    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static void message_handler_req_lib_votequorum_qdisk_getinfo (void *conn, void *message)
{
    struct res_lib_votequorum_qdisk_getinfo res_lib_votequorum_qdisk_getinfo;
    cs_error_t error = CS_OK;

    ENTER();

    if (quorum_device) {
        log_printf(LOG_LEVEL_DEBUG, "got qdisk_getinfo state %d\n", quorum_device->state);
        res_lib_votequorum_qdisk_getinfo.votes = quorum_device->votes;
        if (quorum_device->state == NODESTATE_MEMBER)
            res_lib_votequorum_qdisk_getinfo.state = 1;
        else
            res_lib_votequorum_qdisk_getinfo.state = 0;
        strcpy(res_lib_votequorum_qdisk_getinfo.name, quorum_device_name);
    } else {
        error = CS_ERR_NOT_EXIST;
    }

    /* send status */
    res_lib_votequorum_qdisk_getinfo.header.size = sizeof(res_lib_votequorum_qdisk_getinfo);
    res_lib_votequorum_qdisk_getinfo.header.id = MESSAGE_RES_VOTEQUORUM_GETINFO;
    res_lib_votequorum_qdisk_getinfo.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_qdisk_getinfo, sizeof(res_lib_votequorum_qdisk_getinfo));

    LEAVE();
}

static void message_handler_req_lib_votequorum_setstate (void *conn, void *message)
{
    struct res_lib_votequorum_status res_lib_votequorum_status;
    cs_error_t error = CS_OK;

    ENTER();

    us->flags |= NODE_FLAGS_HASSTATE;

    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static void message_handler_req_lib_votequorum_trackstart (void *conn, void *msg)
{
    struct req_lib_votequorum_trackstart *req_lib_votequorum_trackstart =
        (struct req_lib_votequorum_trackstart *)msg;
    struct res_lib_votequorum_status res_lib_votequorum_status;
    struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);

    ENTER();

    /*
     * If an immediate listing of the current cluster membership
     * is requested, generate membership list
     */
    if (req_lib_votequorum_trackstart->track_flags & CS_TRACK_CURRENT ||
        req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES) {
        log_printf(LOG_LEVEL_DEBUG, "sending initial status to %p\n", conn);
        send_quorum_notification(conn, req_lib_votequorum_trackstart->context);
    }

    /*
     * Record requests for tracking
     */
    if (req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES ||
        req_lib_votequorum_trackstart->track_flags & CS_TRACK_CHANGES_ONLY) {

        quorum_pd->track_flags = req_lib_votequorum_trackstart->track_flags;
        quorum_pd->tracking_enabled = 1;
        quorum_pd->tracking_context = req_lib_votequorum_trackstart->context;
        list_add (&quorum_pd->list, &trackers_list);
    }

    /* Send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = CS_OK;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}
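/*
 * A tracker therefore registers once with CS_TRACK_CHANGES and then
 * receives a MESSAGE_RES_VOTEQUORUM_NOTIFICATION dispatch from
 * send_quorum_notification every time recalculate_quorum runs, until it
 * calls trackstop below or its connection goes away (quorum_lib_exit_fn
 * unlinks it from trackers_list).
 */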
static void message_handler_req_lib_votequorum_trackstop (void *conn, void *msg)
{
    struct res_lib_votequorum_status res_lib_votequorum_status;
    struct quorum_pd *quorum_pd = (struct quorum_pd *)corosync_api->ipc_private_data_get (conn);
    int error = CS_OK;

    ENTER();

    if (quorum_pd->tracking_enabled) {
        error = CS_OK;
        quorum_pd->tracking_enabled = 0;
        list_del (&quorum_pd->list);
        list_init (&quorum_pd->list);
    } else {
        error = CS_ERR_NOT_EXIST;
    }

    /* send status */
    res_lib_votequorum_status.header.size = sizeof(res_lib_votequorum_status);
    res_lib_votequorum_status.header.id = MESSAGE_RES_VOTEQUORUM_STATUS;
    res_lib_votequorum_status.header.error = error;
    corosync_api->ipc_response_send(conn, &res_lib_votequorum_status, sizeof(res_lib_votequorum_status));

    LEAVE();
}

static const char *kill_reason(int reason)
{
    static char msg[1024];

    switch (reason) {
    case VOTEQUORUM_REASON_KILL_REJECTED:
        return "our membership application was rejected";

    case VOTEQUORUM_REASON_KILL_APPLICATION:
        return "we were killed by an application request";

    case VOTEQUORUM_REASON_KILL_REJOIN:
        return "we rejoined the cluster without a full restart";

    default:
        sprintf(msg, "we got kill message number %d", reason);
        return msg;
    }
}

static void reread_config(hdb_handle_t object_handle)
{
    unsigned int old_votes;
    unsigned int old_expected;

    old_votes = us->votes;
    old_expected = us->expected_votes;

    /*
     * Reload the configuration
     */
    read_quorum_config(object_handle);

    /*
     * Check for fundamental changes that we need to propagate
     */
    if (old_votes != us->votes) {
        quorum_exec_send_reconfigure(RECONFIG_PARAM_NODE_VOTES, us->node_id, us->votes);
    }
    if (old_expected != us->expected_votes) {
        quorum_exec_send_reconfigure(RECONFIG_PARAM_EXPECTED_VOTES, us->node_id, us->expected_votes);
    }
}

static void quorum_key_change_notify(object_change_type_t change_type,
                     hdb_handle_t parent_object_handle,
                     hdb_handle_t object_handle,
                     const void *object_name_pt, int object_name_len,
                     const void *key_name_pt, int key_len,
                     const void *key_value_pt, int key_value_len,
                     void *priv_data_pt)
{
    if (memcmp(object_name_pt, "quorum", object_name_len) == 0)
        reread_config(object_handle);
}

/* Called when the objdb is reloaded */
static void votequorum_objdb_reload_notify(
    objdb_reload_notify_type_t type, int flush,
    void *priv_data_pt)
{
    /*
     * A new quorum {} key might exist, cancel the
     * existing notification at the start of reload,
     * and start a new one on the new object when
     * it's all settled.
     */
    if (type == OBJDB_RELOAD_NOTIFY_START) {
        corosync_api->object_track_stop(
            quorum_key_change_notify,
            NULL,
            NULL,
            NULL,
            NULL);
    }

    if (type == OBJDB_RELOAD_NOTIFY_END || type == OBJDB_RELOAD_NOTIFY_FAILED) {
        hdb_handle_t find_handle;
        hdb_handle_t object_handle;

        corosync_api->object_find_create(OBJECT_PARENT_HANDLE, "quorum", strlen("quorum"), &find_handle);
        if (corosync_api->object_find_next(find_handle, &object_handle) == 0) {
            add_votequorum_config_notification(object_handle);
            reread_config(object_handle);
        } else {
            log_printf(LOG_LEVEL_ERROR, "votequorum objdb tracking stopped, cannot find quorum{} handle in objdb\n");
        }
    }
}

static void add_votequorum_config_notification(hdb_handle_t quorum_object_handle)
{
    corosync_api->object_track_start(quorum_object_handle,
                     1,
                     quorum_key_change_notify,
                     NULL,
                     NULL,
                     NULL,
                     NULL);

    /*
     * Reload notify must be on the parent object
     */
    corosync_api->object_track_start(OBJECT_PARENT_HANDLE,
                     1,
                     NULL,
                     NULL,
                     NULL,
                     votequorum_objdb_reload_notify,
                     NULL);
}
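/*
 * For reference, a minimal quorum stanza exercising the keys this file
 * tracks might look like the sketch below.  Only "two_node" is read in
 * the excerpt above; the other key names are assumptions and should be
 * checked against the shipped example configuration.
 *
 *   quorum {
 *       provider: corosync_votequorum
 *       expected_votes: 2
 *       two_node: 1
 *   }
 */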