Page Menu
Home
ClusterLabs Projects
Search
Configure Global Search
Log In
Files
F3155447
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
127 KB
Referenced Files
None
Subscribers
None
View Options
diff --git a/exec/main.c b/exec/main.c
index fde77da5..582f1e2c 100644
--- a/exec/main.c
+++ b/exec/main.c
@@ -1,1540 +1,1540 @@
/*
* Copyright (c) 2002-2006 MontaVista Software, Inc.
* Copyright (c) 2006-2009 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* \mainpage Corosync
*
* This is the doxygen generated developer documentation for the Corosync
* project. For more information about Corosync, please see the project
* web site, <a href="http://www.corosync.org">corosync.org</a>.
*
* \section license License
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
#include <pthread.h>
#include <assert.h>
#include <sys/types.h>
#include <sys/file.h>
#include <sys/poll.h>
#include <sys/uio.h>
#include <sys/mman.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/stat.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <signal.h>
#include <sched.h>
#include <time.h>
#include <semaphore.h>
#include <qb/qbdefs.h>
#include <qb/qblog.h>
#include <qb/qbloop.h>
#include <qb/qbutil.h>
#include <qb/qbipcs.h>
#include <corosync/swab.h>
#include <corosync/corotypes.h>
#include <corosync/corodefs.h>
#include <corosync/list.h>
#include <corosync/lcr/lcr_ifact.h>
#include <corosync/totem/totempg.h>
#include <corosync/engine/objdb.h>
#include <corosync/engine/config.h>
#include <corosync/engine/logsys.h>
#include "quorum.h"
#include "totemsrp.h"
#include "mainconfig.h"
#include "totemconfig.h"
#include "main.h"
#include "sync.h"
#include "syncv2.h"
#include "timer.h"
#include "util.h"
#include "apidef.h"
#include "service.h"
#include "schedwrk.h"
#include "evil.h"
#ifdef HAVE_SMALL_MEMORY_FOOTPRINT
#define IPC_LOGSYS_SIZE 1024*64
#else
#define IPC_LOGSYS_SIZE 8192*128
#endif
LOGSYS_DECLARE_SYSTEM ("corosync",
LOGSYS_MODE_OUTPUT_STDERR,
LOG_DAEMON,
LOG_INFO);
LOGSYS_DECLARE_SUBSYS ("MAIN");
#define SERVER_BACKLOG 5
static int sched_priority = 0;
static unsigned int service_count = 32;
static struct totem_logging_configuration totem_logging_configuration;
static int num_config_modules;
static struct config_iface_ver0 *config_modules[MAX_DYNAMIC_SERVICES];
static struct objdb_iface_ver0 *objdb = NULL;
static struct corosync_api_v1 *api = NULL;
static enum cs_sync_mode minimum_sync_mode;
static int sync_in_process = 1;
static qb_loop_t *corosync_poll_handle;
struct sched_param global_sched_param;
static hdb_handle_t object_memb_handle;
static corosync_timer_handle_t corosync_stats_timer_handle;
static const char *corosync_lock_file = LOCALSTATEDIR"/run/corosync.pid";
qb_loop_t *cs_poll_handle_get (void)
{
return (corosync_poll_handle);
}
int cs_poll_dispatch_add (qb_loop_t * handle,
int fd,
int events,
void *data,
int (*dispatch_fn) (int fd,
int revents,
void *data))
{
return qb_loop_poll_add(handle, QB_LOOP_MED, fd, events, data,
dispatch_fn);
}
int cs_poll_dispatch_delete(qb_loop_t * handle, int fd)
{
return qb_loop_poll_del(handle, fd);
}
void corosync_state_dump (void)
{
int i;
for (i = 0; i < SERVICE_HANDLER_MAXIMUM_COUNT; i++) {
if (ais_service[i] && ais_service[i]->exec_dump_fn) {
ais_service[i]->exec_dump_fn ();
}
}
}
static void unlink_all_completed (void)
{
api->timer_delete (corosync_stats_timer_handle);
qb_loop_stop (corosync_poll_handle);
}
void corosync_shutdown_request (void)
{
corosync_service_unlink_all (api, unlink_all_completed);
}
static int32_t sig_diag_handler (int num, void *data)
{
corosync_state_dump ();
qb_log_blackbox_write_to_file(LOCALSTATEDIR "/lib/corosync/fdata");
return 0;
}
static int32_t sig_exit_handler (int num, void *data)
{
corosync_service_unlink_all (api, unlink_all_completed);
return 0;
}
static void sigsegv_handler (int num)
{
(void)signal (SIGSEGV, SIG_DFL);
qb_log_blackbox_write_to_file(LOCALSTATEDIR "/lib/corosync/fdata");
qb_log_fini();
raise (SIGSEGV);
}
static void sigabrt_handler (int num)
{
(void)signal (SIGABRT, SIG_DFL);
qb_log_blackbox_write_to_file(LOCALSTATEDIR "/lib/corosync/fdata");
qb_log_fini();
raise (SIGABRT);
}
#define LOCALHOST_IP inet_addr("127.0.0.1")
-static hdb_handle_t corosync_group_handle;
+static void *corosync_group_handle;
static struct totempg_group corosync_group = {
.group = "a",
.group_len = 1
};
static void serialize_lock (void)
{
}
static void serialize_unlock (void)
{
}
static void corosync_sync_completed (void)
{
log_printf (LOGSYS_LEVEL_NOTICE,
"Completed service synchronization, ready to provide service.\n");
sync_in_process = 0;
cs_ipcs_sync_state_changed(sync_in_process);
}
static int corosync_sync_callbacks_retrieve (int sync_id,
struct sync_callbacks *callbacks)
{
unsigned int ais_service_index;
int res;
for (ais_service_index = 0;
ais_service_index < SERVICE_HANDLER_MAXIMUM_COUNT;
ais_service_index++) {
if (ais_service[ais_service_index] != NULL
&& (ais_service[ais_service_index]->sync_mode == CS_SYNC_V1
|| ais_service[ais_service_index]->sync_mode == CS_SYNC_V1_APIV2)) {
if (ais_service_index == sync_id) {
break;
}
}
}
/*
* Try to load backwards compat sync engines
*/
if (ais_service_index == SERVICE_HANDLER_MAXIMUM_COUNT) {
res = evil_callbacks_load (sync_id, callbacks);
return (res);
}
callbacks->name = ais_service[ais_service_index]->name;
callbacks->sync_init_api.sync_init_v1 = ais_service[ais_service_index]->sync_init;
callbacks->api_version = 1;
if (ais_service[ais_service_index]->sync_mode == CS_SYNC_V1_APIV2) {
callbacks->api_version = 2;
}
callbacks->sync_process = ais_service[ais_service_index]->sync_process;
callbacks->sync_activate = ais_service[ais_service_index]->sync_activate;
callbacks->sync_abort = ais_service[ais_service_index]->sync_abort;
return (0);
}
static int corosync_sync_v2_callbacks_retrieve (
int service_id,
struct sync_callbacks *callbacks)
{
int res;
if (minimum_sync_mode == CS_SYNC_V2 && service_id == CLM_SERVICE && ais_service[CLM_SERVICE] == NULL) {
res = evil_callbacks_load (service_id, callbacks);
return (res);
}
if (minimum_sync_mode == CS_SYNC_V2 && service_id == EVT_SERVICE && ais_service[EVT_SERVICE] == NULL) {
res = evil_callbacks_load (service_id, callbacks);
return (res);
}
if (ais_service[service_id] == NULL) {
return (-1);
}
if (minimum_sync_mode == CS_SYNC_V1 && ais_service[service_id]->sync_mode != CS_SYNC_V2) {
return (-1);
}
callbacks->name = ais_service[service_id]->name;
callbacks->api_version = 1;
if (ais_service[service_id]->sync_mode == CS_SYNC_V1_APIV2) {
callbacks->api_version = 2;
}
callbacks->sync_init_api.sync_init_v1 = ais_service[service_id]->sync_init;
callbacks->sync_process = ais_service[service_id]->sync_process;
callbacks->sync_activate = ais_service[service_id]->sync_activate;
callbacks->sync_abort = ais_service[service_id]->sync_abort;
return (0);
}
static struct memb_ring_id corosync_ring_id;
static void member_object_joined (unsigned int nodeid)
{
hdb_handle_t object_find_handle;
hdb_handle_t object_node_handle;
char * nodeint_str;
char nodeid_str[64];
unsigned int key_incr_dummy;
snprintf (nodeid_str, 64, "%d", nodeid);
objdb->object_find_create (
object_memb_handle,
nodeid_str,
strlen (nodeid_str),
&object_find_handle);
if (objdb->object_find_next (object_find_handle,
&object_node_handle) == 0) {
objdb->object_key_increment (object_node_handle,
"join_count", strlen("join_count"),
&key_incr_dummy);
objdb->object_key_replace (object_node_handle,
"status", strlen("status"),
"joined", strlen("joined"));
} else {
nodeint_str = (char*)api->totem_ifaces_print (nodeid);
objdb->object_create (object_memb_handle,
&object_node_handle,
nodeid_str, strlen (nodeid_str));
objdb->object_key_create_typed (object_node_handle,
"ip",
nodeint_str, strlen(nodeint_str),
OBJDB_VALUETYPE_STRING);
key_incr_dummy = 1;
objdb->object_key_create_typed (object_node_handle,
"join_count",
&key_incr_dummy, sizeof (key_incr_dummy),
OBJDB_VALUETYPE_UINT32);
objdb->object_key_create_typed (object_node_handle,
"status",
"joined", strlen("joined"),
OBJDB_VALUETYPE_STRING);
}
}
static void member_object_left (unsigned int nodeid)
{
hdb_handle_t object_find_handle;
hdb_handle_t object_node_handle;
char nodeid_str[64];
snprintf (nodeid_str, 64, "%u", nodeid);
objdb->object_find_create (
object_memb_handle,
nodeid_str,
strlen (nodeid_str),
&object_find_handle);
if (objdb->object_find_next (object_find_handle,
&object_node_handle) == 0) {
objdb->object_key_replace (object_node_handle,
"status", strlen("status"),
"left", strlen("left"));
}
}
static void confchg_fn (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id)
{
int i;
int abort_activate = 0;
if (sync_in_process == 1) {
abort_activate = 1;
}
sync_in_process = 1;
cs_ipcs_sync_state_changed(sync_in_process);
memcpy (&corosync_ring_id, ring_id, sizeof (struct memb_ring_id));
for (i = 0; i < left_list_entries; i++) {
member_object_left (left_list[i]);
}
for (i = 0; i < joined_list_entries; i++) {
member_object_joined (joined_list[i]);
}
/*
* Call configuration change for all services
*/
for (i = 0; i < service_count; i++) {
if (ais_service[i] && ais_service[i]->confchg_fn) {
ais_service[i]->confchg_fn (configuration_type,
member_list, member_list_entries,
left_list, left_list_entries,
joined_list, joined_list_entries, ring_id);
}
}
if (abort_activate) {
sync_v2_abort ();
}
if (minimum_sync_mode == CS_SYNC_V2 && configuration_type == TOTEM_CONFIGURATION_TRANSITIONAL) {
sync_v2_save_transitional (member_list, member_list_entries, ring_id);
}
if (minimum_sync_mode == CS_SYNC_V2 && configuration_type == TOTEM_CONFIGURATION_REGULAR) {
sync_v2_start (member_list, member_list_entries, ring_id);
}
}
static void priv_drop (void)
{
return; /* TODO: we are still not dropping privs */
}
static void corosync_tty_detach (void)
{
FILE *r;
/*
* Disconnect from TTY if this is not a debug run
*/
switch (fork ()) {
case -1:
corosync_exit_error (AIS_DONE_FORK);
break;
case 0:
/*
* child which is disconnected, run this process
*/
break;
default:
exit (0);
break;
}
/* Create new session */
(void)setsid();
/*
* Map stdin/out/err to /dev/null.
*/
r = freopen("/dev/null", "r", stdin);
if (r == NULL) {
corosync_exit_error (AIS_DONE_STD_TO_NULL_REDIR);
}
r = freopen("/dev/null", "a", stderr);
if (r == NULL) {
corosync_exit_error (AIS_DONE_STD_TO_NULL_REDIR);
}
r = freopen("/dev/null", "a", stdout);
if (r == NULL) {
corosync_exit_error (AIS_DONE_STD_TO_NULL_REDIR);
}
}
static void corosync_mlockall (void)
{
#if !defined(COROSYNC_BSD) || defined(COROSYNC_FREEBSD_GE_8)
int res;
#endif
struct rlimit rlimit;
rlimit.rlim_cur = RLIM_INFINITY;
rlimit.rlim_max = RLIM_INFINITY;
#ifndef COROSYNC_SOLARIS
setrlimit (RLIMIT_MEMLOCK, &rlimit);
#else
setrlimit (RLIMIT_VMEM, &rlimit);
#endif
#if defined(COROSYNC_BSD) && !defined(COROSYNC_FREEBSD_GE_8)
/* under FreeBSD < 8 a process with locked page cannot call dlopen
* code disabled until FreeBSD bug i386/93396 was solved
*/
log_printf (LOGSYS_LEVEL_WARNING, "Could not lock memory of service to avoid page faults\n");
#else
res = mlockall (MCL_CURRENT | MCL_FUTURE);
if (res == -1) {
LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
"Could not lock memory of service to avoid page faults");
};
#endif
}
static void corosync_totem_stats_updater (void *data)
{
totempg_stats_t * stats;
uint32_t mtt_rx_token;
uint32_t total_mtt_rx_token;
uint32_t avg_backlog_calc;
uint32_t total_backlog_calc;
uint32_t avg_token_holdtime;
uint32_t total_token_holdtime;
int t, prev;
int32_t token_count;
uint32_t firewall_enabled_or_nic_failure;
stats = api->totem_get_stats();
objdb->object_key_replace (stats->hdr.handle,
"msg_reserved", strlen("msg_reserved"),
&stats->msg_reserved, sizeof (stats->msg_reserved));
objdb->object_key_replace (stats->hdr.handle,
"msg_queue_avail", strlen("msg_queue_avail"),
&stats->msg_queue_avail, sizeof (stats->msg_queue_avail));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"orf_token_tx", strlen("orf_token_tx"),
&stats->mrp->srp->orf_token_tx, sizeof (stats->mrp->srp->orf_token_tx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"orf_token_rx", strlen("orf_token_rx"),
&stats->mrp->srp->orf_token_rx, sizeof (stats->mrp->srp->orf_token_rx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"memb_merge_detect_tx", strlen("memb_merge_detect_tx"),
&stats->mrp->srp->memb_merge_detect_tx, sizeof (stats->mrp->srp->memb_merge_detect_tx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"memb_merge_detect_rx", strlen("memb_merge_detect_rx"),
&stats->mrp->srp->memb_merge_detect_rx, sizeof (stats->mrp->srp->memb_merge_detect_rx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"memb_join_tx", strlen("memb_join_tx"),
&stats->mrp->srp->memb_join_tx, sizeof (stats->mrp->srp->memb_join_tx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"memb_join_rx", strlen("memb_join_rx"),
&stats->mrp->srp->memb_join_rx, sizeof (stats->mrp->srp->memb_join_rx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"mcast_tx", strlen("mcast_tx"),
&stats->mrp->srp->mcast_tx, sizeof (stats->mrp->srp->mcast_tx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"mcast_retx", strlen("mcast_retx"),
&stats->mrp->srp->mcast_retx, sizeof (stats->mrp->srp->mcast_retx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"mcast_rx", strlen("mcast_rx"),
&stats->mrp->srp->mcast_rx, sizeof (stats->mrp->srp->mcast_rx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"memb_commit_token_tx", strlen("memb_commit_token_tx"),
&stats->mrp->srp->memb_commit_token_tx, sizeof (stats->mrp->srp->memb_commit_token_tx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"memb_commit_token_rx", strlen("memb_commit_token_rx"),
&stats->mrp->srp->memb_commit_token_rx, sizeof (stats->mrp->srp->memb_commit_token_rx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"token_hold_cancel_tx", strlen("token_hold_cancel_tx"),
&stats->mrp->srp->token_hold_cancel_tx, sizeof (stats->mrp->srp->token_hold_cancel_tx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"token_hold_cancel_rx", strlen("token_hold_cancel_rx"),
&stats->mrp->srp->token_hold_cancel_rx, sizeof (stats->mrp->srp->token_hold_cancel_rx));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"operational_entered", strlen("operational_entered"),
&stats->mrp->srp->operational_entered, sizeof (stats->mrp->srp->operational_entered));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"operational_token_lost", strlen("operational_token_lost"),
&stats->mrp->srp->operational_token_lost, sizeof (stats->mrp->srp->operational_token_lost));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"gather_entered", strlen("gather_entered"),
&stats->mrp->srp->gather_entered, sizeof (stats->mrp->srp->gather_entered));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"gather_token_lost", strlen("gather_token_lost"),
&stats->mrp->srp->gather_token_lost, sizeof (stats->mrp->srp->gather_token_lost));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"commit_entered", strlen("commit_entered"),
&stats->mrp->srp->commit_entered, sizeof (stats->mrp->srp->commit_entered));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"commit_token_lost", strlen("commit_token_lost"),
&stats->mrp->srp->commit_token_lost, sizeof (stats->mrp->srp->commit_token_lost));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"recovery_entered", strlen("recovery_entered"),
&stats->mrp->srp->recovery_entered, sizeof (stats->mrp->srp->recovery_entered));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"recovery_token_lost", strlen("recovery_token_lost"),
&stats->mrp->srp->recovery_token_lost, sizeof (stats->mrp->srp->recovery_token_lost));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"consensus_timeouts", strlen("consensus_timeouts"),
&stats->mrp->srp->consensus_timeouts, sizeof (stats->mrp->srp->consensus_timeouts));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"rx_msg_dropped", strlen("rx_msg_dropped"),
&stats->mrp->srp->rx_msg_dropped, sizeof (stats->mrp->srp->rx_msg_dropped));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"continuous_gather", strlen("continuous_gather"),
&stats->mrp->srp->continuous_gather, sizeof (stats->mrp->srp->continuous_gather));
firewall_enabled_or_nic_failure = (stats->mrp->srp->continuous_gather > MAX_NO_CONT_GATHER ? 1 : 0);
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"firewall_enabled_or_nic_failure", strlen("firewall_enabled_or_nic_failure"),
&firewall_enabled_or_nic_failure, sizeof (firewall_enabled_or_nic_failure));
total_mtt_rx_token = 0;
total_token_holdtime = 0;
total_backlog_calc = 0;
token_count = 0;
t = stats->mrp->srp->latest_token;
while (1) {
if (t == 0)
prev = TOTEM_TOKEN_STATS_MAX - 1;
else
prev = t - 1;
if (prev == stats->mrp->srp->earliest_token)
break;
/* if tx == 0, then dropped token (not ours) */
if (stats->mrp->srp->token[t].tx != 0 ||
(stats->mrp->srp->token[t].rx - stats->mrp->srp->token[prev].rx) > 0 ) {
total_mtt_rx_token += (stats->mrp->srp->token[t].rx - stats->mrp->srp->token[prev].rx);
total_token_holdtime += (stats->mrp->srp->token[t].tx - stats->mrp->srp->token[t].rx);
total_backlog_calc += stats->mrp->srp->token[t].backlog_calc;
token_count++;
}
t = prev;
}
if (token_count) {
mtt_rx_token = (total_mtt_rx_token / token_count);
avg_backlog_calc = (total_backlog_calc / token_count);
avg_token_holdtime = (total_token_holdtime / token_count);
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"mtt_rx_token", strlen("mtt_rx_token"),
&mtt_rx_token, sizeof (mtt_rx_token));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"avg_token_workload", strlen("avg_token_workload"),
&avg_token_holdtime, sizeof (avg_token_holdtime));
objdb->object_key_replace (stats->mrp->srp->hdr.handle,
"avg_backlog_calc", strlen("avg_backlog_calc"),
&avg_backlog_calc, sizeof (avg_backlog_calc));
}
cs_ipcs_stats_update();
api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL,
corosync_totem_stats_updater,
&corosync_stats_timer_handle);
}
static void corosync_totem_stats_init (void)
{
totempg_stats_t * stats;
hdb_handle_t object_find_handle;
hdb_handle_t object_runtime_handle;
hdb_handle_t object_totem_handle;
uint32_t zero_32 = 0;
uint64_t zero_64 = 0;
stats = api->totem_get_stats();
objdb->object_find_create (
OBJECT_PARENT_HANDLE,
"runtime",
strlen ("runtime"),
&object_find_handle);
if (objdb->object_find_next (object_find_handle,
&object_runtime_handle) == 0) {
objdb->object_create (object_runtime_handle,
&object_totem_handle,
"totem", strlen ("totem"));
objdb->object_create (object_totem_handle,
&stats->hdr.handle,
"pg", strlen ("pg"));
objdb->object_create (stats->hdr.handle,
&stats->mrp->hdr.handle,
"mrp", strlen ("mrp"));
objdb->object_create (stats->mrp->hdr.handle,
&stats->mrp->srp->hdr.handle,
"srp", strlen ("srp"));
objdb->object_key_create_typed (stats->hdr.handle,
"msg_reserved", &stats->msg_reserved,
sizeof (stats->msg_reserved), OBJDB_VALUETYPE_UINT32);
objdb->object_key_create_typed (stats->hdr.handle,
"msg_queue_avail", &stats->msg_queue_avail,
sizeof (stats->msg_queue_avail), OBJDB_VALUETYPE_UINT32);
/* Members object */
objdb->object_create (stats->mrp->srp->hdr.handle,
&object_memb_handle,
"members", strlen ("members"));
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"orf_token_tx", &stats->mrp->srp->orf_token_tx,
sizeof (stats->mrp->srp->orf_token_tx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"orf_token_rx", &stats->mrp->srp->orf_token_rx,
sizeof (stats->mrp->srp->orf_token_rx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"memb_merge_detect_tx", &stats->mrp->srp->memb_merge_detect_tx,
sizeof (stats->mrp->srp->memb_merge_detect_tx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"memb_merge_detect_rx", &stats->mrp->srp->memb_merge_detect_rx,
sizeof (stats->mrp->srp->memb_merge_detect_rx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"memb_join_tx", &stats->mrp->srp->memb_join_tx,
sizeof (stats->mrp->srp->memb_join_tx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"memb_join_rx", &stats->mrp->srp->memb_join_rx,
sizeof (stats->mrp->srp->memb_join_rx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"mcast_tx", &stats->mrp->srp->mcast_tx,
sizeof (stats->mrp->srp->mcast_tx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"mcast_retx", &stats->mrp->srp->mcast_retx,
sizeof (stats->mrp->srp->mcast_retx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"mcast_rx", &stats->mrp->srp->mcast_rx,
sizeof (stats->mrp->srp->mcast_rx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"memb_commit_token_tx", &stats->mrp->srp->memb_commit_token_tx,
sizeof (stats->mrp->srp->memb_commit_token_tx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"memb_commit_token_rx", &stats->mrp->srp->memb_commit_token_rx,
sizeof (stats->mrp->srp->memb_commit_token_rx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"token_hold_cancel_tx", &stats->mrp->srp->token_hold_cancel_tx,
sizeof (stats->mrp->srp->token_hold_cancel_tx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"token_hold_cancel_rx", &stats->mrp->srp->token_hold_cancel_rx,
sizeof (stats->mrp->srp->token_hold_cancel_rx), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"operational_entered", &stats->mrp->srp->operational_entered,
sizeof (stats->mrp->srp->operational_entered), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"operational_token_lost", &stats->mrp->srp->operational_token_lost,
sizeof (stats->mrp->srp->operational_token_lost), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"gather_entered", &stats->mrp->srp->gather_entered,
sizeof (stats->mrp->srp->gather_entered), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"gather_token_lost", &stats->mrp->srp->gather_token_lost,
sizeof (stats->mrp->srp->gather_token_lost), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"commit_entered", &stats->mrp->srp->commit_entered,
sizeof (stats->mrp->srp->commit_entered), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"commit_token_lost", &stats->mrp->srp->commit_token_lost,
sizeof (stats->mrp->srp->commit_token_lost), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"recovery_entered", &stats->mrp->srp->recovery_entered,
sizeof (stats->mrp->srp->recovery_entered), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"recovery_token_lost", &stats->mrp->srp->recovery_token_lost,
sizeof (stats->mrp->srp->recovery_token_lost), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"consensus_timeouts", &stats->mrp->srp->consensus_timeouts,
sizeof (stats->mrp->srp->consensus_timeouts), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"mtt_rx_token", &zero_32,
sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"avg_token_workload", &zero_32,
sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"avg_backlog_calc", &zero_32,
sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"rx_msg_dropped", &zero_64,
sizeof (zero_64), OBJDB_VALUETYPE_UINT64);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"continuous_gather", &zero_32,
sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
objdb->object_key_create_typed (stats->mrp->srp->hdr.handle,
"firewall_enabled_or_nic_failure", &zero_32,
sizeof (zero_32), OBJDB_VALUETYPE_UINT32);
}
/* start stats timer */
api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL,
corosync_totem_stats_updater,
&corosync_stats_timer_handle);
}
static void deliver_fn (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required)
{
const struct qb_ipc_request_header *header;
int32_t service;
int32_t fn_id;
uint32_t id;
uint32_t key_incr_dummy;
header = msg;
if (endian_conversion_required) {
id = swab32 (header->id);
} else {
id = header->id;
}
/*
* Call the proper executive handler
*/
service = id >> 16;
fn_id = id & 0xffff;
if (ais_service[service] == NULL && service == EVT_SERVICE) {
evil_deliver_fn (nodeid, service, fn_id, msg,
endian_conversion_required);
}
if (!ais_service[service]) {
return;
}
if (fn_id >= ais_service[service]->exec_engine_count) {
log_printf(LOGSYS_LEVEL_WARNING, "discarded unknown message %d for service %d (max id %d)",
fn_id, service, ais_service[service]->exec_engine_count);
return;
}
objdb->object_key_increment (service_stats_handle[service][fn_id],
"rx", strlen("rx"),
&key_incr_dummy);
if (endian_conversion_required) {
assert(ais_service[service]->exec_engine[fn_id].exec_endian_convert_fn != NULL);
ais_service[service]->exec_engine[fn_id].exec_endian_convert_fn
((void *)msg);
}
ais_service[service]->exec_engine[fn_id].exec_handler_fn
(msg, nodeid);
}
void main_get_config_modules(struct config_iface_ver0 ***modules, int *num)
{
*modules = config_modules;
*num = num_config_modules;
}
int main_mcast (
const struct iovec *iovec,
unsigned int iov_len,
unsigned int guarantee)
{
const struct qb_ipc_request_header *req = iovec->iov_base;
int32_t service;
int32_t fn_id;
uint32_t key_incr_dummy;
service = req->id >> 16;
fn_id = req->id & 0xffff;
if (ais_service[service]) {
objdb->object_key_increment (service_stats_handle[service][fn_id],
"tx", strlen("tx"), &key_incr_dummy);
}
return (totempg_groups_mcast_joined (corosync_group_handle, iovec, iov_len, guarantee));
}
static qb_loop_timer_handle recheck_the_q_level_timer;
void corosync_recheck_the_q_level(void *data)
{
totempg_check_q_level(corosync_group_handle);
if (cs_ipcs_q_level_get() == TOTEM_Q_LEVEL_CRITICAL) {
qb_loop_timer_add(cs_poll_handle_get(), QB_LOOP_MED, 1*QB_TIME_NS_IN_MSEC,
NULL, corosync_recheck_the_q_level, &recheck_the_q_level_timer);
}
}
struct sending_allowed_private_data_struct {
int reserved_msgs;
};
int corosync_sending_allowed (
unsigned int service,
unsigned int id,
const void *msg,
void *sending_allowed_private_data)
{
struct sending_allowed_private_data_struct *pd =
(struct sending_allowed_private_data_struct *)sending_allowed_private_data;
struct iovec reserve_iovec;
struct qb_ipc_request_header *header = (struct qb_ipc_request_header *)msg;
int sending_allowed;
reserve_iovec.iov_base = (char *)header;
reserve_iovec.iov_len = header->size;
pd->reserved_msgs = totempg_groups_joined_reserve (
corosync_group_handle,
&reserve_iovec, 1);
if (pd->reserved_msgs == -1) {
return -EINVAL;
}
sending_allowed = QB_FALSE;
if (corosync_quorum_is_quorate() == 1 ||
ais_service[service]->allow_inquorate == CS_LIB_ALLOW_INQUORATE) {
// we are quorate
// now check flow control
if (ais_service[service]->lib_engine[id].flow_control == CS_LIB_FLOW_CONTROL_NOT_REQUIRED) {
sending_allowed = QB_TRUE;
} else if (pd->reserved_msgs && sync_in_process == 0) {
sending_allowed = QB_TRUE;
} else if (pd->reserved_msgs == 0) {
return -ENOBUFS;
} else /* (sync_in_process) */ {
return -EINPROGRESS;
}
} else {
return -EHOSTUNREACH;
}
return (sending_allowed);
}
void corosync_sending_allowed_release (void *sending_allowed_private_data)
{
struct sending_allowed_private_data_struct *pd =
(struct sending_allowed_private_data_struct *)sending_allowed_private_data;
if (pd->reserved_msgs == -1) {
return;
}
totempg_groups_joined_release (pd->reserved_msgs);
}
int message_source_is_local (const mar_message_source_t *source)
{
int ret = 0;
assert (source != NULL);
if (source->nodeid == totempg_my_nodeid_get ()) {
ret = 1;
}
return ret;
}
void message_source_set (
mar_message_source_t *source,
void *conn)
{
assert ((source != NULL) && (conn != NULL));
memset (source, 0, sizeof (mar_message_source_t));
source->nodeid = totempg_my_nodeid_get ();
source->conn = conn;
}
static void corosync_setscheduler (void)
{
#if defined(HAVE_PTHREAD_SETSCHEDPARAM) && defined(HAVE_SCHED_GET_PRIORITY_MAX) && defined(HAVE_SCHED_SETSCHEDULER)
int res;
sched_priority = sched_get_priority_max (SCHED_RR);
if (sched_priority != -1) {
global_sched_param.sched_priority = sched_priority;
res = sched_setscheduler (0, SCHED_RR, &global_sched_param);
if (res == -1) {
LOGSYS_PERROR(errno, LOGSYS_LEVEL_WARNING,
"Could not set SCHED_RR at priority %d",
global_sched_param.sched_priority);
global_sched_param.sched_priority = 0;
#ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET
qb_log_thread_priority_set (SCHED_OTHER, 0);
#endif
} else {
/*
* Turn on SCHED_RR in logsys system
*/
#ifdef HAVE_QB_LOG_THREAD_PRIORITY_SET
res = qb_log_thread_priority_set (SCHED_RR, sched_priority);
#else
res = -1;
#endif
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR,
"Could not set logsys thread priority."
" Can't continue because of priority inversions.");
corosync_exit_error (AIS_DONE_LOGSETUP);
}
}
} else {
LOGSYS_PERROR (errno, LOGSYS_LEVEL_WARNING,
"Could not get maximum scheduler priority");
sched_priority = 0;
}
#else
log_printf(LOGSYS_LEVEL_WARNING,
"The Platform is missing process priority setting features. Leaving at default.");
#endif
}
static void
_logsys_log_printf(int level, int subsys,
const char *function_name,
const char *file_name,
int file_line,
const char *format,
...) __attribute__((format(printf, 6, 7)));
static void
_logsys_log_printf(int level, int subsys,
const char *function_name,
const char *file_name,
int file_line,
const char *format, ...)
{
va_list ap;
char buf[QB_LOG_MAX_LEN];
size_t len;
va_start(ap, format);
qb_log_from_external_source_va(function_name, file_name,
format, level, file_line,
subsys, ap);
va_end(ap);
}
static void fplay_key_change_notify_fn (
object_change_type_t change_type,
hdb_handle_t parent_object_handle,
hdb_handle_t object_handle,
const void *object_name_pt, size_t object_name_len,
const void *key_name_pt, size_t key_len,
const void *key_value_pt, size_t key_value_len,
void *priv_data_pt)
{
if (key_len == strlen ("dump_flight_data") &&
memcmp ("dump_flight_data", key_name_pt, key_len) == 0) {
qb_log_blackbox_write_to_file (LOCALSTATEDIR "/lib/corosync/fdata");
}
if (key_len == strlen ("dump_state") &&
memcmp ("dump_state", key_name_pt, key_len) == 0) {
corosync_state_dump ();
}
}
static void corosync_fplay_control_init (void)
{
hdb_handle_t object_find_handle;
hdb_handle_t object_runtime_handle;
hdb_handle_t object_blackbox_handle;
objdb->object_find_create (OBJECT_PARENT_HANDLE,
"runtime", strlen ("runtime"),
&object_find_handle);
if (objdb->object_find_next (object_find_handle,
&object_runtime_handle) != 0) {
return;
}
objdb->object_create (object_runtime_handle,
&object_blackbox_handle,
"blackbox", strlen ("blackbox"));
objdb->object_key_create_typed (object_blackbox_handle,
"dump_flight_data", "no", strlen("no"),
OBJDB_VALUETYPE_STRING);
objdb->object_key_create_typed (object_blackbox_handle,
"dump_state", "no", strlen("no"),
OBJDB_VALUETYPE_STRING);
objdb->object_track_start (object_blackbox_handle,
OBJECT_TRACK_DEPTH_RECURSIVE,
fplay_key_change_notify_fn,
NULL, NULL, NULL, NULL);
}
static void main_service_ready (void)
{
int res;
/*
* This must occur after totempg is initialized because "this_ip" must be set
*/
res = corosync_service_defaults_link_and_init (api);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Could not initialize default services\n");
corosync_exit_error (AIS_DONE_INIT_SERVICES);
}
evil_init (api);
cs_ipcs_init();
corosync_totem_stats_init ();
corosync_fplay_control_init ();
if (minimum_sync_mode == CS_SYNC_V2) {
log_printf (LOGSYS_LEVEL_NOTICE, "Compatibility mode set to none. Using V2 of the synchronization engine.\n");
sync_v2_init (
corosync_sync_v2_callbacks_retrieve,
corosync_sync_completed);
} else
if (minimum_sync_mode == CS_SYNC_V1) {
log_printf (LOGSYS_LEVEL_NOTICE, "Compatibility mode set to whitetank. Using V1 and V2 of the synchronization engine.\n");
sync_register (
corosync_sync_callbacks_retrieve,
sync_v2_memb_list_determine,
sync_v2_memb_list_abort,
sync_v2_start);
sync_v2_init (
corosync_sync_v2_callbacks_retrieve,
corosync_sync_completed);
}
}
static enum e_ais_done corosync_flock (const char *lockfile, pid_t pid)
{
struct flock lock;
enum e_ais_done err;
char pid_s[17];
int fd_flag;
int lf;
err = AIS_DONE_EXIT;
lf = open (lockfile, O_WRONLY | O_CREAT, 0640);
if (lf == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't create lock file.\n");
return (AIS_DONE_AQUIRE_LOCK);
}
retry_fcntl:
lock.l_type = F_WRLCK;
lock.l_start = 0;
lock.l_whence = SEEK_SET;
lock.l_len = 0;
if (fcntl (lf, F_SETLK, &lock) == -1) {
switch (errno) {
case EINTR:
goto retry_fcntl;
break;
case EAGAIN:
case EACCES:
log_printf (LOGSYS_LEVEL_ERROR, "Another Corosync instance is already running.\n");
err = AIS_DONE_ALREADY_RUNNING;
goto error_close;
break;
default:
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't aquire lock. Error was %s\n",
strerror(errno));
err = AIS_DONE_AQUIRE_LOCK;
goto error_close;
break;
}
}
if (ftruncate (lf, 0) == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't truncate lock file. Error was %s\n",
strerror (errno));
err = AIS_DONE_AQUIRE_LOCK;
goto error_close_unlink;
}
memset (pid_s, 0, sizeof (pid_s));
snprintf (pid_s, sizeof (pid_s) - 1, "%u\n", pid);
retry_write:
if (write (lf, pid_s, strlen (pid_s)) != strlen (pid_s)) {
if (errno == EINTR) {
goto retry_write;
} else {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't write pid to lock file. "
"Error was %s\n", strerror (errno));
err = AIS_DONE_AQUIRE_LOCK;
goto error_close_unlink;
}
}
if ((fd_flag = fcntl (lf, F_GETFD, 0)) == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't get close-on-exec flag from lock file. "
"Error was %s\n", strerror (errno));
err = AIS_DONE_AQUIRE_LOCK;
goto error_close_unlink;
}
fd_flag |= FD_CLOEXEC;
if (fcntl (lf, F_SETFD, fd_flag) == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't set close-on-exec flag to lock file. "
"Error was %s\n", strerror (errno));
err = AIS_DONE_AQUIRE_LOCK;
goto error_close_unlink;
}
return (err);
error_close_unlink:
unlink (lockfile);
error_close:
close (lf);
return (err);
}
int main (int argc, char **argv, char **envp)
{
const char *error_string;
struct totem_config totem_config;
hdb_handle_t objdb_handle;
hdb_handle_t config_handle;
unsigned int config_version = 0;
void *objdb_p;
struct config_iface_ver0 *config;
void *config_p;
const char *config_iface_init;
char *config_iface;
char *iface;
char *strtok_save_pt;
int res, ch;
int background, setprio;
struct stat stat_out;
char corosync_lib_dir[PATH_MAX];
hdb_handle_t object_runtime_handle;
enum e_ais_done flock_err;
/* default configuration
*/
background = 1;
setprio = 0;
while ((ch = getopt (argc, argv, "fprv")) != EOF) {
switch (ch) {
case 'f':
background = 0;
logsys_config_mode_set (NULL, LOGSYS_MODE_OUTPUT_STDERR|LOGSYS_MODE_THREADED|LOGSYS_MODE_FORK);
break;
case 'p':
break;
case 'r':
setprio = 1;
break;
case 'v':
printf ("Corosync Cluster Engine, version '%s'\n", VERSION);
printf ("Copyright (c) 2006-2009 Red Hat, Inc.\n");
return EXIT_SUCCESS;
break;
default:
fprintf(stderr, \
"usage:\n"\
" -f : Start application in foreground.\n"\
" -p : Does nothing. \n"\
" -r : Set round robin realtime scheduling \n"\
" -v : Display version and SVN revision of Corosync and exit.\n");
return EXIT_FAILURE;
}
}
/*
* Set round robin realtime scheduling with priority 99
* Lock all memory to avoid page faults which may interrupt
* application healthchecking
*/
if (setprio) {
corosync_setscheduler ();
}
corosync_mlockall ();
log_printf (LOGSYS_LEVEL_NOTICE, "Corosync Cluster Engine ('%s'): started and ready to provide service.\n", VERSION);
log_printf (LOGSYS_LEVEL_INFO, "Corosync built-in features:" PACKAGE_FEATURES "\n");
corosync_poll_handle = qb_loop_create ();
qb_loop_signal_add(corosync_poll_handle, QB_LOOP_LOW,
SIGUSR2, NULL, sig_diag_handler, NULL);
qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH,
SIGINT, NULL, sig_exit_handler, NULL);
qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH,
SIGQUIT, NULL, sig_exit_handler, NULL);
qb_loop_signal_add(corosync_poll_handle, QB_LOOP_HIGH,
SIGTERM, NULL, sig_exit_handler, NULL);
(void)signal (SIGSEGV, sigsegv_handler);
(void)signal (SIGABRT, sigabrt_handler);
#if MSG_NOSIGNAL != 0
(void)signal (SIGPIPE, SIG_IGN);
#endif
/*
* Load the object database interface
*/
res = lcr_ifact_reference (
&objdb_handle,
"objdb",
0,
&objdb_p,
0);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't open configuration object database component.\n");
corosync_exit_error (AIS_DONE_OBJDB);
}
objdb = (struct objdb_iface_ver0 *)objdb_p;
objdb->objdb_init ();
/*
* Initialize the corosync_api_v1 definition
*/
apidef_init (objdb);
api = apidef_get ();
num_config_modules = 0;
/*
* Bootstrap in the default configuration parser or use
* the corosync default built in parser if the configuration parser
* isn't overridden
*/
config_iface_init = getenv("COROSYNC_DEFAULT_CONFIG_IFACE");
if (!config_iface_init) {
config_iface_init = "corosync_parser";
}
/* Make a copy so we can deface it with strtok */
if ((config_iface = strdup(config_iface_init)) == NULL) {
log_printf (LOGSYS_LEVEL_ERROR, "exhausted virtual memory");
corosync_exit_error (AIS_DONE_OBJDB);
}
iface = strtok_r(config_iface, ":", &strtok_save_pt);
while (iface)
{
res = lcr_ifact_reference (
&config_handle,
iface,
config_version,
&config_p,
0);
config = (struct config_iface_ver0 *)config_p;
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Corosync Executive couldn't open configuration component '%s'\n", iface);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
res = config->config_readconfig(objdb, &error_string);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
log_printf (LOGSYS_LEVEL_NOTICE, "%s", error_string);
config_modules[num_config_modules++] = config;
iface = strtok_r(NULL, ":", &strtok_save_pt);
}
free(config_iface);
res = corosync_main_config_read (objdb, &error_string);
if (res == -1) {
/*
* if we are here, we _must_ flush the logsys queue
* and try to inform that we couldn't read the config.
* this is a desperate attempt before certain death
* and there is no guarantee that we can print to stderr
* nor that logsys is sending the messages where we expect.
*/
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
fprintf(stderr, "%s", error_string);
syslog (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
/*
* Make sure required directory is present
*/
sprintf (corosync_lib_dir, "%s/lib/corosync", LOCALSTATEDIR);
res = stat (corosync_lib_dir, &stat_out);
if ((res == -1) || (res == 0 && !S_ISDIR(stat_out.st_mode))) {
log_printf (LOGSYS_LEVEL_ERROR, "Required directory not present %s. Please create it.\n", corosync_lib_dir);
corosync_exit_error (AIS_DONE_DIR_NOT_PRESENT);
}
res = totem_config_read (objdb, &totem_config, &error_string);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
res = totem_config_keyread (objdb, &totem_config, &error_string);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
res = totem_config_validate (&totem_config, &error_string);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
totem_config.totem_logging_configuration = totem_logging_configuration;
totem_config.totem_logging_configuration.log_subsys_id = _logsys_subsys_create("TOTEM", "totem");
totem_config.totem_logging_configuration.log_level_security = LOGSYS_LEVEL_WARNING;
totem_config.totem_logging_configuration.log_level_error = LOGSYS_LEVEL_ERROR;
totem_config.totem_logging_configuration.log_level_warning = LOGSYS_LEVEL_WARNING;
totem_config.totem_logging_configuration.log_level_notice = LOGSYS_LEVEL_NOTICE;
totem_config.totem_logging_configuration.log_level_debug = LOGSYS_LEVEL_DEBUG;
totem_config.totem_logging_configuration.log_printf = _logsys_log_printf;
logsys_config_apply();
res = corosync_main_config_compatibility_read (objdb,
&minimum_sync_mode,
&error_string);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
res = corosync_main_config_compatibility_read (objdb,
&minimum_sync_mode,
&error_string);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "%s", error_string);
corosync_exit_error (AIS_DONE_MAINCONFIGREAD);
}
/* create the main runtime object */
objdb->object_create (OBJECT_PARENT_HANDLE,
&object_runtime_handle,
"runtime", strlen ("runtime"));
/*
* Now we are fully initialized.
*/
if (background) {
corosync_tty_detach ();
}
qb_log_thread_start();
if ((flock_err = corosync_flock (corosync_lock_file, getpid ())) != AIS_DONE_EXIT) {
corosync_exit_error (flock_err);
}
/*
* if totempg_initialize doesn't have root priveleges, it cannot
* bind to a specific interface. This only matters if
* there is more then one interface in a system, so
* in this case, only a warning is printed
*/
/*
* Join multicast group and setup delivery
* and configuration change functions
*/
totempg_initialize (
corosync_poll_handle,
&totem_config);
totempg_service_ready_register (
main_service_ready);
totempg_groups_initialize (
&corosync_group_handle,
deliver_fn,
confchg_fn);
totempg_groups_join (
corosync_group_handle,
&corosync_group,
1);
/*
* Drop root privleges to user 'ais'
* TODO: Don't really need full root capabilities;
* needed capabilities are:
* CAP_NET_RAW (bindtodevice)
* CAP_SYS_NICE (setscheduler)
* CAP_IPC_LOCK (mlockall)
*/
priv_drop ();
schedwrk_init (
serialize_lock,
serialize_unlock);
/*
* Start main processing loop
*/
qb_loop_run (corosync_poll_handle);
/*
* Exit was requested
*/
totempg_finalize ();
/*
* Remove pid lock file
*/
unlink (corosync_lock_file);
corosync_exit_error (AIS_DONE_EXIT);
return EXIT_SUCCESS;
}
diff --git a/exec/sync.c b/exec/sync.c
index b9cc84a5..ce991299 100644
--- a/exec/sync.c
+++ b/exec/sync.c
@@ -1,487 +1,487 @@
/*
* Copyright (c) 2005-2006 MontaVista Software, Inc.
* Copyright (c) 2006-2007, 2009 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <corosync/corotypes.h>
#include <corosync/swab.h>
#include <corosync/totem/totempg.h>
#include <corosync/totem/totem.h>
#include <corosync/lcr/lcr_ifact.h>
#include <corosync/engine/logsys.h>
#include <qb/qbipc_common.h>
#include "quorum.h"
#include "sync.h"
LOGSYS_DECLARE_SUBSYS ("SYNC");
#define MESSAGE_REQ_SYNC_BARRIER 0
struct barrier_data {
unsigned int nodeid;
int completed;
};
static const struct memb_ring_id *sync_ring_id;
static int (*sync_callbacks_retrieve) (int sync_id, struct sync_callbacks *callack);
static void (*sync_started) (
const struct memb_ring_id *ring_id);
static void (*sync_aborted) (void);
static struct sync_callbacks sync_callbacks;
static int sync_processing = 0;
static void (*sync_next_start) (
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id);
static int sync_recovery_index = 0;
static void *sync_callback_token_handle = 0;
static struct barrier_data barrier_data_confchg[PROCESSOR_COUNT_MAX];
static size_t barrier_data_confchg_entries;
static struct barrier_data barrier_data_process[PROCESSOR_COUNT_MAX];
static unsigned int my_member_list[PROCESSOR_COUNT_MAX];
static unsigned int my_trans_list[PROCESSOR_COUNT_MAX];
static unsigned int my_member_list_entries;
static unsigned int my_trans_list_entries;
static int sync_barrier_send (const struct memb_ring_id *ring_id);
static int sync_start_process (enum totem_callback_token_type type,
const void *data);
static void sync_service_init (struct memb_ring_id *ring_id);
static int sync_service_process (enum totem_callback_token_type type,
const void *data);
static void sync_deliver_fn (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required);
static void sync_confchg_fn (
enum totem_configuration_type configuration_type,
const unsigned int *member_list,
size_t member_list_entries,
const unsigned int *left_list,
size_t left_list_entries,
const unsigned int *joined_list,
size_t joined_list_entries,
const struct memb_ring_id *ring_id);
static void sync_primary_callback_fn (
const unsigned int *view_list,
size_t view_list_entries,
const struct memb_ring_id *ring_id);
static struct totempg_group sync_group = {
.group = "sync",
.group_len = 4
};
-static hdb_handle_t sync_group_handle;
+static void *sync_group_handle;
struct req_exec_sync_barrier_start {
struct qb_ipc_request_header header;
struct memb_ring_id ring_id;
};
/*
* Send a barrier data structure
*/
static int sync_barrier_send (const struct memb_ring_id *ring_id)
{
struct req_exec_sync_barrier_start req_exec_sync_barrier_start;
struct iovec iovec;
int res;
req_exec_sync_barrier_start.header.size = sizeof (struct req_exec_sync_barrier_start);
req_exec_sync_barrier_start.header.id = MESSAGE_REQ_SYNC_BARRIER;
memcpy (&req_exec_sync_barrier_start.ring_id, ring_id,
sizeof (struct memb_ring_id));
iovec.iov_base = (char *)&req_exec_sync_barrier_start;
iovec.iov_len = sizeof (req_exec_sync_barrier_start);
res = totempg_groups_mcast_joined (sync_group_handle, &iovec, 1, TOTEMPG_AGREED);
return (res);
}
static void sync_start_init (const struct memb_ring_id *ring_id)
{
totempg_callback_token_create (
&sync_callback_token_handle,
TOTEM_CALLBACK_TOKEN_SENT,
0, /* don't delete after callback */
sync_start_process,
ring_id);
}
static void sync_service_init (struct memb_ring_id *ring_id)
{
if (sync_callbacks.api_version == 1) {
sync_callbacks.sync_init_api.sync_init_v1 (my_member_list,
my_member_list_entries, ring_id);
} else {
sync_callbacks.sync_init_api.sync_init_v2 (my_trans_list,
my_trans_list_entries,
my_member_list, my_member_list_entries, ring_id);
}
totempg_callback_token_destroy (&sync_callback_token_handle);
/*
* Create the token callback for the processing
*/
totempg_callback_token_create (
&sync_callback_token_handle,
TOTEM_CALLBACK_TOKEN_SENT,
0, /* don't delete after callback */
sync_service_process,
ring_id);
}
static int sync_start_process (enum totem_callback_token_type type,
const void *data)
{
int res;
const struct memb_ring_id *ring_id = data;
res = sync_barrier_send (ring_id);
if (res == 0) {
/*
* Delete the token callback for the barrier
*/
totempg_callback_token_destroy (&sync_callback_token_handle);
}
return (0);
}
static void sync_callbacks_load (void)
{
int res;
for (;;) {
res = sync_callbacks_retrieve (sync_recovery_index,
&sync_callbacks);
/*
* No more service handlers have sync callbacks at this time
` */
if (res == -1) {
sync_processing = 0;
break;
}
sync_recovery_index += 1;
if (sync_callbacks.sync_init_api.sync_init_v1) {
break;
}
}
}
static int sync_service_process (enum totem_callback_token_type type,
const void *data)
{
int res;
const struct memb_ring_id *ring_id = data;
/*
* If process operation not from this ring id, then ignore it and stop
* processing
*/
if (memcmp (ring_id, sync_ring_id, sizeof (struct memb_ring_id)) != 0) {
return (0);
}
/*
* If process returns 0, then its time to activate
* and start the next service's synchronization
*/
res = sync_callbacks.sync_process ();
if (res != 0) {
return (0);
}
totempg_callback_token_destroy (&sync_callback_token_handle);
sync_start_init (ring_id);
return (0);
}
int sync_register (
int (*callbacks_retrieve) (
int sync_id,
struct sync_callbacks *callbacks),
void (*started) (
const struct memb_ring_id *ring_id),
void (*aborted) (void),
void (*next_start) (
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id))
{
unsigned int res;
res = totempg_groups_initialize (
&sync_group_handle,
sync_deliver_fn,
sync_confchg_fn);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR,
"Couldn't initialize groups interface.\n");
return (-1);
}
res = totempg_groups_join (
sync_group_handle,
&sync_group,
1);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Couldn't join group.\n");
return (-1);
}
sync_callbacks_retrieve = callbacks_retrieve;
sync_next_start = next_start;
sync_started = started;
sync_aborted = aborted;
return (0);
}
static void sync_primary_callback_fn (
const unsigned int *view_list,
size_t view_list_entries,
const struct memb_ring_id *ring_id)
{
int i;
/*
* Execute configuration change for synchronization service
*/
sync_processing = 1;
totempg_callback_token_destroy (&sync_callback_token_handle);
sync_recovery_index = 0;
memset (&barrier_data_confchg, 0, sizeof (barrier_data_confchg));
for (i = 0; i < view_list_entries; i++) {
barrier_data_confchg[i].nodeid = view_list[i];
barrier_data_confchg[i].completed = 0;
}
memcpy (barrier_data_process, barrier_data_confchg,
sizeof (barrier_data_confchg));
barrier_data_confchg_entries = view_list_entries;
sync_start_init (sync_ring_id);
}
static struct memb_ring_id deliver_ring_id;
static void sync_endian_convert (struct req_exec_sync_barrier_start
*req_exec_sync_barrier_start)
{
totemip_copy_endian_convert(&req_exec_sync_barrier_start->ring_id.rep,
&req_exec_sync_barrier_start->ring_id.rep);
req_exec_sync_barrier_start->ring_id.seq = swab64 (req_exec_sync_barrier_start->ring_id.seq);
}
static void sync_deliver_fn (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required)
{
struct req_exec_sync_barrier_start *req_exec_sync_barrier_start =
(struct req_exec_sync_barrier_start *)msg;
unsigned int barrier_completed;
int i;
log_printf (LOGSYS_LEVEL_DEBUG, "confchg entries %lu\n",
(unsigned long int) barrier_data_confchg_entries);
if (endian_conversion_required) {
sync_endian_convert (req_exec_sync_barrier_start);
}
barrier_completed = 1;
memcpy (&deliver_ring_id, &req_exec_sync_barrier_start->ring_id,
sizeof (struct memb_ring_id));
/*
* Is this barrier from this configuration, if not, ignore it
*/
if (memcmp (&req_exec_sync_barrier_start->ring_id, sync_ring_id,
sizeof (struct memb_ring_id)) != 0) {
return;
}
/*
* Set completion for source_addr's address
*/
for (i = 0; i < barrier_data_confchg_entries; i++) {
if (nodeid == barrier_data_process[i].nodeid) {
barrier_data_process[i].completed = 1;
log_printf (LOGSYS_LEVEL_DEBUG,
"Barrier Start Received From %d\n",
barrier_data_process[i].nodeid);
break;
}
}
/*
* Test if barrier is complete
*/
for (i = 0; i < barrier_data_confchg_entries; i++) {
log_printf (LOGSYS_LEVEL_DEBUG,
"Barrier completion status for nodeid %d = %d. \n",
barrier_data_process[i].nodeid,
barrier_data_process[i].completed);
if (barrier_data_process[i].completed == 0) {
barrier_completed = 0;
}
}
if (barrier_completed) {
log_printf (LOGSYS_LEVEL_DEBUG,
"Synchronization barrier completed\n");
}
/*
* This sync is complete so activate and start next service sync
*/
if (barrier_completed && sync_callbacks.sync_activate) {
sync_callbacks.sync_activate ();
log_printf (LOGSYS_LEVEL_DEBUG,
"Committing synchronization for (%s)\n",
sync_callbacks.name);
}
/*
* Start synchronization if the barrier has completed
*/
if (barrier_completed) {
memcpy (barrier_data_process, barrier_data_confchg,
sizeof (barrier_data_confchg));
sync_callbacks_load();
/*
* if sync service found, execute it
*/
if (sync_processing && sync_callbacks.sync_init_api.sync_init_v1) {
log_printf (LOGSYS_LEVEL_DEBUG,
"Synchronization actions starting for (%s)\n",
sync_callbacks.name);
sync_service_init (&deliver_ring_id);
}
if (sync_processing == 0) {
sync_next_start (my_member_list, my_member_list_entries, sync_ring_id);
}
}
return;
}
static void sync_confchg_fn (
enum totem_configuration_type configuration_type,
const unsigned int *member_list,
size_t member_list_entries,
const unsigned int *left_list,
size_t left_list_entries,
const unsigned int *joined_list,
size_t joined_list_entries,
const struct memb_ring_id *ring_id)
{
sync_ring_id = ring_id;
if (configuration_type != TOTEM_CONFIGURATION_REGULAR) {
memcpy (my_trans_list, member_list, member_list_entries *
sizeof (unsigned int));
my_trans_list_entries = member_list_entries;
return;
}
memcpy (my_member_list, member_list, member_list_entries * sizeof (unsigned int));
my_member_list_entries = member_list_entries;
sync_aborted ();
if (sync_processing && sync_callbacks.sync_abort != NULL) {
sync_callbacks.sync_abort ();
sync_callbacks.sync_activate = NULL;
}
sync_started (
ring_id);
sync_primary_callback_fn (
member_list,
member_list_entries,
ring_id);
}
diff --git a/exec/syncv2.c b/exec/syncv2.c
index f9eebacf..8a966152 100644
--- a/exec/syncv2.c
+++ b/exec/syncv2.c
@@ -1,624 +1,624 @@
/*
* Copyright (c) 2009 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <config.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <sys/uio.h>
#include <unistd.h>
#include <fcntl.h>
#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <time.h>
#include <unistd.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <corosync/corotypes.h>
#include <corosync/swab.h>
#include <corosync/totem/totempg.h>
#include <corosync/totem/totem.h>
#include <corosync/lcr/lcr_ifact.h>
#include <corosync/engine/logsys.h>
#include <qb/qbipc_common.h>
#include "schedwrk.h"
#include "quorum.h"
#include "sync.h"
#include "syncv2.h"
LOGSYS_DECLARE_SUBSYS ("SYNCV2");
#define MESSAGE_REQ_SYNC_BARRIER 0
#define MESSAGE_REQ_SYNC_SERVICE_BUILD 1
#define MESSAGE_REQ_SYNC_MEMB_DETERMINE 2
enum sync_process_state {
INIT,
PROCESS,
ACTIVATE
};
enum sync_state {
SYNC_SERVICELIST_BUILD,
SYNC_PROCESS,
SYNC_BARRIER
};
struct service_entry {
int service_id;
int api_version;
union sync_init_api sync_init_api;
void (*sync_abort) (void);
int (*sync_process) (void);
void (*sync_activate) (void);
enum sync_process_state state;
char name[128];
};
struct processor_entry {
int nodeid;
int received;
};
struct req_exec_memb_determine_message {
struct qb_ipc_request_header header __attribute__((aligned(8)));
struct memb_ring_id ring_id __attribute__((aligned(8)));
};
struct req_exec_service_build_message {
struct qb_ipc_request_header header __attribute__((aligned(8)));
struct memb_ring_id ring_id __attribute__((aligned(8)));
int service_list_entries __attribute__((aligned(8)));
int service_list[128] __attribute__((aligned(8)));
};
struct req_exec_barrier_message {
struct qb_ipc_request_header header __attribute__((aligned(8)));
struct memb_ring_id ring_id __attribute__((aligned(8)));
};
static enum sync_state my_state = SYNC_BARRIER;
static struct memb_ring_id my_ring_id;
static struct memb_ring_id my_memb_determine_ring_id;
static int my_memb_determine = 0;
static unsigned int my_memb_determine_list[PROCESSOR_COUNT_MAX];
static unsigned int my_memb_determine_list_entries = 0;
static int my_processing_idx = 0;
static hdb_handle_t my_schedwrk_handle;
static struct processor_entry my_processor_list[PROCESSOR_COUNT_MAX];
static unsigned int my_member_list[PROCESSOR_COUNT_MAX];
static unsigned int my_trans_list[PROCESSOR_COUNT_MAX];
static size_t my_member_list_entries = 0;
static size_t my_trans_list_entries = 0;
static int my_processor_list_entries = 0;
static struct service_entry my_service_list[128];
static int my_service_list_entries = 0;
static const struct memb_ring_id sync_ring_id;
static struct service_entry my_initial_service_list[PROCESSOR_COUNT_MAX];
static int my_initial_service_list_entries;
static void (*sync_synchronization_completed) (void);
static void sync_deliver_fn (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required);
static int schedwrk_processor (const void *context);
static void sync_process_enter (void);
static struct totempg_group sync_group = {
.group = "syncv2",
.group_len = 6
};
-static hdb_handle_t sync_group_handle;
+static void *sync_group_handle;
int sync_v2_init (
int (*sync_callbacks_retrieve) (
int service_id,
struct sync_callbacks *callbacks),
void (*synchronization_completed) (void))
{
unsigned int res;
int i;
struct sync_callbacks sync_callbacks;
res = totempg_groups_initialize (
&sync_group_handle,
sync_deliver_fn,
NULL);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR,
"Couldn't initialize groups interface.");
return (-1);
}
res = totempg_groups_join (
sync_group_handle,
&sync_group,
1);
if (res == -1) {
log_printf (LOGSYS_LEVEL_ERROR, "Couldn't join group.\n");
return (-1);
}
sync_synchronization_completed = synchronization_completed;
for (i = 0; i < 64; i++) {
res = sync_callbacks_retrieve (i, &sync_callbacks);
if (res == -1) {
continue;
}
if (sync_callbacks.sync_init_api.sync_init_v1 == NULL) {
continue;
}
my_initial_service_list[my_initial_service_list_entries].state =
INIT;
my_initial_service_list[my_initial_service_list_entries].service_id = i;
strcpy (my_initial_service_list[my_initial_service_list_entries].name,
sync_callbacks.name);
my_initial_service_list[my_initial_service_list_entries].api_version = sync_callbacks.api_version;
my_initial_service_list[my_initial_service_list_entries].sync_init_api = sync_callbacks.sync_init_api;
my_initial_service_list[my_initial_service_list_entries].sync_process = sync_callbacks.sync_process;
my_initial_service_list[my_initial_service_list_entries].sync_abort = sync_callbacks.sync_abort;
my_initial_service_list[my_initial_service_list_entries].sync_activate = sync_callbacks.sync_activate;
my_initial_service_list_entries += 1;
}
return (0);
}
static void sync_barrier_handler (unsigned int nodeid, const void *msg)
{
const struct req_exec_barrier_message *req_exec_barrier_message = msg;
int i;
int barrier_reached = 1;
if (memcmp (&my_ring_id, &req_exec_barrier_message->ring_id,
sizeof (struct memb_ring_id)) != 0) {
log_printf (LOGSYS_LEVEL_DEBUG, "barrier for old ring - discarding\n");
return;
}
for (i = 0; i < my_processor_list_entries; i++) {
if (my_processor_list[i].nodeid == nodeid) {
my_processor_list[i].received = 1;
}
}
for (i = 0; i < my_processor_list_entries; i++) {
if (my_processor_list[i].received == 0) {
barrier_reached = 0;
}
}
if (barrier_reached) {
log_printf (LOGSYS_LEVEL_DEBUG, "Committing synchronization for %s\n",
my_service_list[my_processing_idx].name);
my_service_list[my_processing_idx].state = ACTIVATE;
my_service_list[my_processing_idx].sync_activate ();
my_processing_idx += 1;
if (my_service_list_entries == my_processing_idx) {
my_memb_determine_list_entries = 0;
sync_synchronization_completed ();
} else {
sync_process_enter ();
}
}
}
static void dummy_sync_init (
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id)
{
}
static void dummy_sync_abort (void)
{
}
static int dummy_sync_process (void)
{
return (0);
}
static void dummy_sync_activate (void)
{
}
static int service_entry_compare (const void *a, const void *b)
{
const struct service_entry *service_entry_a = a;
const struct service_entry *service_entry_b = b;
return (service_entry_a->service_id > service_entry_b->service_id);
}
static void sync_memb_determine (unsigned int nodeid, const void *msg)
{
const struct req_exec_memb_determine_message *req_exec_memb_determine_message = msg;
int found = 0;
int i;
if (memcmp (&req_exec_memb_determine_message->ring_id,
&my_memb_determine_ring_id, sizeof (struct memb_ring_id)) != 0) {
log_printf (LOGSYS_LEVEL_DEBUG, "memb determine for old ring - discarding\n");
return;
}
my_memb_determine = 1;
for (i = 0; i < my_memb_determine_list_entries; i++) {
if (my_memb_determine_list[i] == nodeid) {
found = 1;
}
}
if (found == 0) {
my_memb_determine_list[my_memb_determine_list_entries] = nodeid;
my_memb_determine_list_entries += 1;
}
}
static void sync_service_build_handler (unsigned int nodeid, const void *msg)
{
const struct req_exec_service_build_message *req_exec_service_build_message = msg;
int i, j;
int barrier_reached = 1;
int found;
int qsort_trigger = 0;
if (memcmp (&my_ring_id, &req_exec_service_build_message->ring_id,
sizeof (struct memb_ring_id)) != 0) {
log_printf (LOGSYS_LEVEL_DEBUG, "service build for old ring - discarding\n");
return;
}
for (i = 0; i < req_exec_service_build_message->service_list_entries; i++) {
found = 0;
for (j = 0; j < my_service_list_entries; j++) {
if (req_exec_service_build_message->service_list[i] ==
my_service_list[j].service_id) {
found = 1;
break;
}
}
if (found == 0) {
my_service_list[my_service_list_entries].state =
INIT;
my_service_list[my_service_list_entries].service_id =
req_exec_service_build_message->service_list[i];
sprintf (my_service_list[my_service_list_entries].name,
"External Service (id = %d)\n",
req_exec_service_build_message->service_list[i]);
my_service_list[my_service_list_entries].api_version = 1;
my_service_list[my_service_list_entries].sync_init_api.sync_init_v1 =
dummy_sync_init;
my_service_list[my_service_list_entries].sync_abort =
dummy_sync_abort;
my_service_list[my_service_list_entries].sync_process =
dummy_sync_process;
my_service_list[my_service_list_entries].sync_activate =
dummy_sync_activate;
my_service_list_entries += 1;
qsort_trigger = 1;
}
}
if (qsort_trigger) {
qsort (my_service_list, my_service_list_entries,
sizeof (struct service_entry), service_entry_compare);
}
for (i = 0; i < my_processor_list_entries; i++) {
if (my_processor_list[i].nodeid == nodeid) {
my_processor_list[i].received = 1;
}
}
for (i = 0; i < my_processor_list_entries; i++) {
if (my_processor_list[i].received == 0) {
barrier_reached = 0;
}
}
if (barrier_reached) {
sync_process_enter ();
}
}
static void sync_deliver_fn (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required)
{
struct qb_ipc_request_header *header = (struct qb_ipc_request_header *)msg;
switch (header->id) {
case MESSAGE_REQ_SYNC_BARRIER:
sync_barrier_handler (nodeid, msg);
break;
case MESSAGE_REQ_SYNC_SERVICE_BUILD:
sync_service_build_handler (nodeid, msg);
break;
case MESSAGE_REQ_SYNC_MEMB_DETERMINE:
sync_memb_determine (nodeid, msg);
break;
}
}
static void memb_determine_message_transmit (void)
{
struct iovec iovec;
struct req_exec_memb_determine_message req_exec_memb_determine_message;
req_exec_memb_determine_message.header.size = sizeof (struct req_exec_memb_determine_message);
req_exec_memb_determine_message.header.id = MESSAGE_REQ_SYNC_MEMB_DETERMINE;
memcpy (&req_exec_memb_determine_message.ring_id,
&my_memb_determine_ring_id,
sizeof (struct memb_ring_id));
iovec.iov_base = (char *)&req_exec_memb_determine_message;
iovec.iov_len = sizeof (req_exec_memb_determine_message);
(void)totempg_groups_mcast_joined (sync_group_handle,
&iovec, 1, TOTEMPG_AGREED);
}
static void barrier_message_transmit (void)
{
struct iovec iovec;
struct req_exec_barrier_message req_exec_barrier_message;
req_exec_barrier_message.header.size = sizeof (struct req_exec_barrier_message);
req_exec_barrier_message.header.id = MESSAGE_REQ_SYNC_BARRIER;
memcpy (&req_exec_barrier_message.ring_id, &my_ring_id,
sizeof (struct memb_ring_id));
iovec.iov_base = (char *)&req_exec_barrier_message;
iovec.iov_len = sizeof (req_exec_barrier_message);
(void)totempg_groups_mcast_joined (sync_group_handle,
&iovec, 1, TOTEMPG_AGREED);
}
static void service_build_message_transmit (struct req_exec_service_build_message *service_build_message)
{
struct iovec iovec;
service_build_message->header.size = sizeof (struct req_exec_service_build_message);
service_build_message->header.id = MESSAGE_REQ_SYNC_SERVICE_BUILD;
memcpy (&service_build_message->ring_id, &my_ring_id,
sizeof (struct memb_ring_id));
iovec.iov_base = (void *)service_build_message;
iovec.iov_len = sizeof (struct req_exec_service_build_message);
(void)totempg_groups_mcast_joined (sync_group_handle,
&iovec, 1, TOTEMPG_AGREED);
}
static void sync_barrier_enter (void)
{
my_state = SYNC_BARRIER;
barrier_message_transmit ();
}
static void sync_process_enter (void)
{
int i;
my_state = SYNC_PROCESS;
/*
* No syncv2 services
*/
if (my_service_list_entries == 0) {
my_state = SYNC_SERVICELIST_BUILD;
my_memb_determine_list_entries = 0;
sync_synchronization_completed ();
return;
}
for (i = 0; i < my_processor_list_entries; i++) {
my_processor_list[i].received = 0;
}
schedwrk_create (&my_schedwrk_handle,
schedwrk_processor,
NULL);
}
static void sync_servicelist_build_enter (
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id)
{
struct req_exec_service_build_message service_build;
int i;
my_state = SYNC_SERVICELIST_BUILD;
for (i = 0; i < member_list_entries; i++) {
my_processor_list[i].nodeid = member_list[i];
my_processor_list[i].received = 0;
}
my_processor_list_entries = member_list_entries;
memcpy (my_member_list, member_list,
member_list_entries * sizeof (unsigned int));
my_member_list_entries = member_list_entries;
my_processing_idx = 0;
memcpy (my_service_list, my_initial_service_list,
sizeof (struct service_entry) *
my_initial_service_list_entries);
my_service_list_entries = my_initial_service_list_entries;
for (i = 0; i < my_initial_service_list[i].service_id; i++) {
service_build.service_list[i] =
my_initial_service_list[i].service_id;
}
service_build.service_list_entries = i;
service_build_message_transmit (&service_build);
}
static int schedwrk_processor (const void *context)
{
int res = 0;
if (my_service_list[my_processing_idx].state == INIT) {
my_service_list[my_processing_idx].state = PROCESS;
if (my_service_list[my_processing_idx].api_version == 1) {
my_service_list[my_processing_idx].sync_init_api.sync_init_v1 (my_member_list,
my_member_list_entries,
&my_ring_id);
} else {
unsigned int old_trans_list[PROCESSOR_COUNT_MAX];
size_t old_trans_list_entries = 0;
int o, m;
memcpy (old_trans_list, my_trans_list, my_trans_list_entries *
sizeof (unsigned int));
old_trans_list_entries = my_trans_list_entries;
my_trans_list_entries = 0;
for (o = 0; o < old_trans_list_entries; o++) {
for (m = 0; m < my_member_list_entries; m++) {
if (old_trans_list[o] == my_member_list[m]) {
my_trans_list[my_trans_list_entries] = my_member_list[m];
my_trans_list_entries++;
break;
}
}
}
my_service_list[my_processing_idx].sync_init_api.sync_init_v2 (my_trans_list,
my_trans_list_entries, my_member_list,
my_member_list_entries,
&my_ring_id);
}
}
if (my_service_list[my_processing_idx].state == PROCESS) {
my_service_list[my_processing_idx].state = PROCESS;
res = my_service_list[my_processing_idx].sync_process ();
if (res == 0) {
sync_barrier_enter();
} else {
return (-1);
}
}
return (0);
}
void sync_v2_start (
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id)
{
ENTER();
memcpy (&my_ring_id, ring_id, sizeof (struct memb_ring_id));
if (my_memb_determine) {
my_memb_determine = 0;
sync_servicelist_build_enter (my_memb_determine_list,
my_memb_determine_list_entries, ring_id);
} else {
sync_servicelist_build_enter (member_list, member_list_entries,
ring_id);
}
}
void sync_v2_save_transitional (
const unsigned int *member_list,
size_t member_list_entries,
const struct memb_ring_id *ring_id)
{
ENTER();
memcpy (my_trans_list, member_list, member_list_entries *
sizeof (unsigned int));
my_trans_list_entries = member_list_entries;
}
void sync_v2_abort (void)
{
ENTER();
if (my_state == SYNC_PROCESS) {
schedwrk_destroy (my_schedwrk_handle);
my_service_list[my_processing_idx].sync_abort ();
}
/* this will cause any "old" barrier messages from causing
* problems.
*/
memset (&my_ring_id, 0, sizeof (struct memb_ring_id));
}
void sync_v2_memb_list_determine (const struct memb_ring_id *ring_id)
{
ENTER();
memcpy (&my_memb_determine_ring_id, ring_id,
sizeof (struct memb_ring_id));
memb_determine_message_transmit ();
}
void sync_v2_memb_list_abort (void)
{
ENTER();
my_memb_determine_list_entries = 0;
memset (&my_memb_determine_ring_id, 0, sizeof (struct memb_ring_id));
}
diff --git a/exec/totempg.c b/exec/totempg.c
index c5ba01c8..a3eee15b 100644
--- a/exec/totempg.c
+++ b/exec/totempg.c
@@ -1,1531 +1,1460 @@
/*
* Copyright (c) 2003-2005 MontaVista Software, Inc.
* Copyright (c) 2005 OSDL.
* Copyright (c) 2006-2009 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
* Author: Mark Haverkamp (markh@osdl.org)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
* FRAGMENTATION AND PACKING ALGORITHM:
*
* Assemble the entire message into one buffer
* if full fragment
* store fragment into lengths list
* for each full fragment
* multicast fragment
* set length and fragment fields of pg mesage
* store remaining multicast into head of fragmentation data and set lens field
*
* If a message exceeds the maximum packet size allowed by the totem
* single ring protocol, the protocol could lose forward progress.
* Statically calculating the allowed data amount doesn't work because
* the amount of data allowed depends on the number of fragments in
* each message. In this implementation, the maximum fragment size
* is dynamically calculated for each fragment added to the message.
* It is possible for a message to be two bytes short of the maximum
* packet size. This occurs when a message or collection of
* messages + the mcast header + the lens are two bytes short of the
* end of the packet. Since another len field consumes two bytes, the
* len field would consume the rest of the packet without room for data.
*
* One optimization would be to forgo the final len field and determine
* it from the size of the udp datagram. Then this condition would no
* longer occur.
*/
/*
* ASSEMBLY AND UNPACKING ALGORITHM:
*
* copy incoming packet into assembly data buffer indexed by current
* location of end of fragment
*
* if not fragmented
* deliver all messages in assembly data buffer
* else
* if msg_count > 1 and fragmented
* deliver all messages except last message in assembly data buffer
* copy last fragmented section to start of assembly data buffer
* else
* if msg_count = 1 and fragmented
* do nothing
*
*/
#include <config.h>
#ifdef HAVE_ALLOCA_H
#include <alloca.h>
#endif
#include <netinet/in.h>
#include <sys/uio.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <pthread.h>
#include <errno.h>
#include <limits.h>
#include <corosync/swab.h>
-#include <corosync/hdb.h>
#include <corosync/list.h>
#include <qb/qbloop.h>
#include <qb/qbipcs.h>
#include <corosync/totem/totempg.h>
#define LOGSYS_UTILS_ONLY 1
#include <corosync/engine/logsys.h>
#include "totemmrp.h"
#include "totemsrp.h"
#define min(a,b) ((a) < (b)) ? a : b
struct totempg_mcast_header {
short version;
short type;
};
#if !(defined(__i386__) || defined(__x86_64__))
/*
* Need align on architectures different then i386 or x86_64
*/
#define TOTEMPG_NEED_ALIGN 1
#endif
/*
* totempg_mcast structure
*
* header: Identify the mcast.
* fragmented: Set if this message continues into next message
* continuation: Set if this message is a continuation from last message
* msg_count Indicates how many packed messages are contained
* in the mcast.
* Also, the size of each packed message and the messages themselves are
* appended to the end of this structure when sent.
*/
struct totempg_mcast {
struct totempg_mcast_header header;
unsigned char fragmented;
unsigned char continuation;
unsigned short msg_count;
/*
* short msg_len[msg_count];
*/
/*
* data for messages
*/
};
/*
* Maximum packet size for totem pg messages
*/
#define TOTEMPG_PACKET_SIZE (totempg_totem_config->net_mtu - \
sizeof (struct totempg_mcast))
/*
* Local variables used for packing small messages
*/
static unsigned short mcast_packed_msg_lens[FRAME_SIZE_MAX];
static int mcast_packed_msg_count = 0;
static int totempg_reserved = 1;
static unsigned int totempg_size_limit;
static totem_queue_level_changed_fn totem_queue_level_changed = NULL;
static uint32_t totempg_threaded_mode = 0;
/*
* Function and data used to log messages
*/
static int totempg_log_level_security;
static int totempg_log_level_error;
static int totempg_log_level_warning;
static int totempg_log_level_notice;
static int totempg_log_level_debug;
static int totempg_subsys_id;
static void (*totempg_log_printf) (
int level,
int subsys,
const char *function,
const char *file,
int line,
const char *format, ...) __attribute__((format(printf, 6, 7)));
struct totem_config *totempg_totem_config;
static totempg_stats_t totempg_stats;
enum throw_away_mode {
THROW_AWAY_INACTIVE,
THROW_AWAY_ACTIVE
};
struct assembly {
unsigned int nodeid;
unsigned char data[MESSAGE_SIZE_MAX];
int index;
unsigned char last_frag_num;
enum throw_away_mode throw_away_mode;
struct list_head list;
};
static void assembly_deref (struct assembly *assembly);
static int callback_token_received_fn (enum totem_callback_token_type type,
const void *data);
DECLARE_LIST_INIT(assembly_list_inuse);
DECLARE_LIST_INIT(assembly_list_free);
+DECLARE_LIST_INIT(totempg_groups_list);
+
/*
* Staging buffer for packed messages. Messages are staged in this buffer
* before sending. Multiple messages may fit which cuts down on the
* number of mcasts sent. If a message doesn't completely fit, then
* the mcast header has a fragment bit set that says that there are more
* data to follow. fragment_size is an index into the buffer. It indicates
* the size of message data and where to place new message data.
* fragment_contuation indicates whether the first packed message in
* the buffer is a continuation of a previously packed fragment.
*/
static unsigned char *fragmentation_data;
static int fragment_size = 0;
static int fragment_continuation = 0;
static struct iovec iov_delv;
-static unsigned int totempg_max_handle = 0;
-
struct totempg_group_instance {
void (*deliver_fn) (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required);
void (*confchg_fn) (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id);
struct totempg_group *groups;
int groups_cnt;
int32_t q_level;
+
+ struct list_head list;
};
DECLARE_HDB_DATABASE (totempg_groups_instance_database,NULL);
static unsigned char next_fragment = 1;
static pthread_mutex_t totempg_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t callback_token_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mcast_msg_mutex = PTHREAD_MUTEX_INITIALIZER;
#define log_printf(level, format, args...) \
do { \
totempg_log_printf(level, \
totempg_subsys_id, \
__FUNCTION__, __FILE__, __LINE__, \
format, ##args); \
} while (0);
static int msg_count_send_ok (int msg_count);
static int byte_count_send_ok (int byte_count);
static struct assembly *assembly_ref (unsigned int nodeid)
{
struct assembly *assembly;
struct list_head *list;
/*
* Search inuse list for node id and return assembly buffer if found
*/
for (list = assembly_list_inuse.next;
list != &assembly_list_inuse;
list = list->next) {
assembly = list_entry (list, struct assembly, list);
if (nodeid == assembly->nodeid) {
return (assembly);
}
}
/*
* Nothing found in inuse list get one from free list if available
*/
if (list_empty (&assembly_list_free) == 0) {
assembly = list_entry (assembly_list_free.next, struct assembly, list);
list_del (&assembly->list);
list_add (&assembly->list, &assembly_list_inuse);
assembly->nodeid = nodeid;
assembly->index = 0;
assembly->last_frag_num = 0;
assembly->throw_away_mode = THROW_AWAY_INACTIVE;
return (assembly);
}
/*
* Nothing available in inuse or free list, so allocate a new one
*/
assembly = malloc (sizeof (struct assembly));
/*
* TODO handle memory allocation failure here
*/
assert (assembly);
assembly->nodeid = nodeid;
assembly->data[0] = 0;
assembly->index = 0;
assembly->last_frag_num = 0;
assembly->throw_away_mode = THROW_AWAY_INACTIVE;
list_init (&assembly->list);
list_add (&assembly->list, &assembly_list_inuse);
return (assembly);
}
static void assembly_deref (struct assembly *assembly)
{
list_del (&assembly->list);
list_add (&assembly->list, &assembly_list_free);
}
static inline void app_confchg_fn (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id)
{
int i;
struct totempg_group_instance *instance;
struct assembly *assembly;
- unsigned int res;
+ struct list_head *list;
/*
* For every leaving processor, add to free list
* This also has the side effect of clearing out the dataset
* In the leaving processor's assembly buffer.
*/
for (i = 0; i < left_list_entries; i++) {
assembly = assembly_ref (left_list[i]);
list_del (&assembly->list);
list_add (&assembly->list, &assembly_list_free);
}
- for (i = 0; i <= totempg_max_handle; i++) {
- res = hdb_handle_get (&totempg_groups_instance_database,
- hdb_nocheck_convert (i), (void *)&instance);
-
- if (res == 0) {
- if (instance->confchg_fn) {
- instance->confchg_fn (
- configuration_type,
- member_list,
- member_list_entries,
- left_list,
- left_list_entries,
- joined_list,
- joined_list_entries,
- ring_id);
- }
- hdb_handle_put (&totempg_groups_instance_database,
- hdb_nocheck_convert (i));
+ for (list = totempg_groups_list.next;
+ list != &totempg_groups_list;
+ list = list->next) {
+
+ instance = list_entry (list, struct totempg_group_instance, list);
+
+ if (instance->confchg_fn) {
+ instance->confchg_fn (
+ configuration_type,
+ member_list,
+ member_list_entries,
+ left_list,
+ left_list_entries,
+ joined_list,
+ joined_list_entries,
+ ring_id);
}
}
}
static inline void group_endian_convert (
void *msg,
int msg_len)
{
unsigned short *group_len;
int i;
char *aligned_msg;
#ifdef TOTEMPG_NEED_ALIGN
/*
* Align data structure for not i386 or x86_64
*/
if ((size_t)msg % 4 != 0) {
aligned_msg = alloca(msg_len);
memcpy(aligned_msg, msg, msg_len);
} else {
aligned_msg = msg;
}
#else
aligned_msg = msg;
#endif
group_len = (unsigned short *)aligned_msg;
group_len[0] = swab16(group_len[0]);
for (i = 1; i < group_len[0] + 1; i++) {
group_len[i] = swab16(group_len[i]);
}
if (aligned_msg != msg) {
memcpy(msg, aligned_msg, msg_len);
}
}
static inline int group_matches (
struct iovec *iovec,
unsigned int iov_len,
struct totempg_group *groups_b,
unsigned int group_b_cnt,
unsigned int *adjust_iovec)
{
unsigned short *group_len;
char *group_name;
int i;
int j;
#ifdef TOTEMPG_NEED_ALIGN
struct iovec iovec_aligned = { NULL, 0 };
#endif
assert (iov_len == 1);
#ifdef TOTEMPG_NEED_ALIGN
/*
* Align data structure for not i386 or x86_64
*/
if ((size_t)iovec->iov_base % 4 != 0) {
iovec_aligned.iov_base = alloca(iovec->iov_len);
memcpy(iovec_aligned.iov_base, iovec->iov_base, iovec->iov_len);
iovec_aligned.iov_len = iovec->iov_len;
iovec = &iovec_aligned;
}
#endif
group_len = (unsigned short *)iovec->iov_base;
group_name = ((char *)iovec->iov_base) +
sizeof (unsigned short) * (group_len[0] + 1);
/*
* Calculate amount to adjust the iovec by before delivering to app
*/
*adjust_iovec = sizeof (unsigned short) * (group_len[0] + 1);
for (i = 1; i < group_len[0] + 1; i++) {
*adjust_iovec += group_len[i];
}
/*
* Determine if this message should be delivered to this instance
*/
for (i = 1; i < group_len[0] + 1; i++) {
for (j = 0; j < group_b_cnt; j++) {
if ((group_len[i] == groups_b[j].group_len) &&
(memcmp (groups_b[j].group, group_name, group_len[i]) == 0)) {
return (1);
}
}
group_name += group_len[i];
}
return (0);
}
static inline void app_deliver_fn (
unsigned int nodeid,
void *msg,
unsigned int msg_len,
int endian_conversion_required)
{
- int i;
struct totempg_group_instance *instance;
struct iovec stripped_iovec;
unsigned int adjust_iovec;
- unsigned int res;
struct iovec *iovec;
+ struct list_head *list;
struct iovec aligned_iovec = { NULL, 0 };
if (endian_conversion_required) {
group_endian_convert (msg, msg_len);
}
/*
* TODO: segmentation/assembly need to be redesigned to provide aligned access
* in all cases to avoid memory copies on non386 archs. Probably broke backwars
* compatibility
*/
#ifdef TOTEMPG_NEED_ALIGN
/*
* Align data structure for not i386 or x86_64
*/
aligned_iovec.iov_base = alloca(msg_len);
aligned_iovec.iov_len = msg_len;
memcpy(aligned_iovec.iov_base, msg, msg_len);
#else
aligned_iovec.iov_base = msg;
aligned_iovec.iov_len = msg_len;
#endif
iovec = &aligned_iovec;
- for (i = 0; i <= totempg_max_handle; i++) {
- res = hdb_handle_get (&totempg_groups_instance_database,
- hdb_nocheck_convert (i), (void *)&instance);
+ for (list = totempg_groups_list.next;
+ list != &totempg_groups_list;
+ list = list->next) {
- if (res == 0) {
- if (group_matches (iovec, 1, instance->groups, instance->groups_cnt, &adjust_iovec)) {
- stripped_iovec.iov_len = iovec->iov_len - adjust_iovec;
- stripped_iovec.iov_base = (char *)iovec->iov_base + adjust_iovec;
+ instance = list_entry (list, struct totempg_group_instance, list);
+ if (group_matches (iovec, 1, instance->groups, instance->groups_cnt, &adjust_iovec)) {
+ stripped_iovec.iov_len = iovec->iov_len - adjust_iovec;
+ stripped_iovec.iov_base = (char *)iovec->iov_base + adjust_iovec;
#ifdef TOTEMPG_NEED_ALIGN
+ /*
+ * Align data structure for not i386 or x86_64
+ */
+ if ((char *)iovec->iov_base + adjust_iovec % 4 != 0) {
/*
- * Align data structure for not i386 or x86_64
+ * Deal with misalignment
*/
- if ((char *)iovec->iov_base + adjust_iovec % 4 != 0) {
- /*
- * Deal with misalignment
- */
- stripped_iovec.iov_base =
- alloca (stripped_iovec.iov_len);
- memcpy (stripped_iovec.iov_base,
- (char *)iovec->iov_base + adjust_iovec,
- stripped_iovec.iov_len);
- }
-#endif
- instance->deliver_fn (
- nodeid,
- stripped_iovec.iov_base,
- stripped_iovec.iov_len,
- endian_conversion_required);
+ stripped_iovec.iov_base =
+ alloca (stripped_iovec.iov_len);
+ memcpy (stripped_iovec.iov_base,
+ (char *)iovec->iov_base + adjust_iovec,
+ stripped_iovec.iov_len);
}
-
- hdb_handle_put (&totempg_groups_instance_database, hdb_nocheck_convert(i));
+#endif
+ instance->deliver_fn (
+ nodeid,
+ stripped_iovec.iov_base,
+ stripped_iovec.iov_len,
+ endian_conversion_required);
}
}
}
static void totempg_confchg_fn (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id)
{
// TODO optimize this
app_confchg_fn (configuration_type,
member_list, member_list_entries,
left_list, left_list_entries,
joined_list, joined_list_entries,
ring_id);
}
static void totempg_deliver_fn (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required)
{
struct totempg_mcast *mcast;
unsigned short *msg_lens;
int i;
struct assembly *assembly;
char header[FRAME_SIZE_MAX];
int msg_count;
int continuation;
int start;
const char *data;
int datasize;
assembly = assembly_ref (nodeid);
assert (assembly);
/*
* Assemble the header into one block of data and
* assemble the packet contents into one block of data to simplify delivery
*/
mcast = (struct totempg_mcast *)msg;
if (endian_conversion_required) {
mcast->msg_count = swab16 (mcast->msg_count);
}
msg_count = mcast->msg_count;
datasize = sizeof (struct totempg_mcast) +
msg_count * sizeof (unsigned short);
memcpy (header, msg, datasize);
data = msg;
msg_lens = (unsigned short *) (header + sizeof (struct totempg_mcast));
if (endian_conversion_required) {
for (i = 0; i < mcast->msg_count; i++) {
msg_lens[i] = swab16 (msg_lens[i]);
}
}
memcpy (&assembly->data[assembly->index], &data[datasize],
msg_len - datasize);
/*
* If the last message in the buffer is a fragment, then we
* can't deliver it. We'll first deliver the full messages
* then adjust the assembly buffer so we can add the rest of the
* fragment when it arrives.
*/
msg_count = mcast->fragmented ? mcast->msg_count - 1 : mcast->msg_count;
continuation = mcast->continuation;
iov_delv.iov_base = (void *)&assembly->data[0];
iov_delv.iov_len = assembly->index + msg_lens[0];
/*
* Make sure that if this message is a continuation, that it
* matches the sequence number of the previous fragment.
* Also, if the first packed message is a continuation
* of a previous message, but the assembly buffer
* is empty, then we need to discard it since we can't
* assemble a complete message. Likewise, if this message isn't a
* continuation and the assembly buffer is empty, we have to discard
* the continued message.
*/
start = 0;
if (assembly->throw_away_mode == THROW_AWAY_ACTIVE) {
/* Throw away the first msg block */
if (mcast->fragmented == 0 || mcast->fragmented == 1) {
assembly->throw_away_mode = THROW_AWAY_INACTIVE;
assembly->index += msg_lens[0];
iov_delv.iov_base = (void *)&assembly->data[assembly->index];
iov_delv.iov_len = msg_lens[1];
start = 1;
}
} else
if (assembly->throw_away_mode == THROW_AWAY_INACTIVE) {
if (continuation == assembly->last_frag_num) {
assembly->last_frag_num = mcast->fragmented;
for (i = start; i < msg_count; i++) {
app_deliver_fn(nodeid, iov_delv.iov_base, iov_delv.iov_len,
endian_conversion_required);
assembly->index += msg_lens[i];
iov_delv.iov_base = (void *)&assembly->data[assembly->index];
if (i < (msg_count - 1)) {
iov_delv.iov_len = msg_lens[i + 1];
}
}
} else {
assembly->throw_away_mode = THROW_AWAY_ACTIVE;
}
}
if (mcast->fragmented == 0) {
/*
* End of messages, dereference assembly struct
*/
assembly->last_frag_num = 0;
assembly->index = 0;
assembly_deref (assembly);
} else {
/*
* Message is fragmented, keep around assembly list
*/
if (mcast->msg_count > 1) {
memmove (&assembly->data[0],
&assembly->data[assembly->index],
msg_lens[msg_count]);
assembly->index = 0;
}
assembly->index += msg_lens[msg_count];
}
}
/*
* Totem Process Group Abstraction
* depends on poll abstraction, POSIX, IPV4
*/
void *callback_token_received_handle;
int callback_token_received_fn (enum totem_callback_token_type type,
const void *data)
{
struct totempg_mcast mcast;
struct iovec iovecs[3];
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&mcast_msg_mutex);
}
if (mcast_packed_msg_count == 0) {
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
}
return (0);
}
if (totemmrp_avail() == 0) {
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
}
return (0);
}
mcast.header.version = 0;
mcast.header.type = 0;
mcast.fragmented = 0;
/*
* Was the first message in this buffer a continuation of a
* fragmented message?
*/
mcast.continuation = fragment_continuation;
fragment_continuation = 0;
mcast.msg_count = mcast_packed_msg_count;
iovecs[0].iov_base = (void *)&mcast;
iovecs[0].iov_len = sizeof (struct totempg_mcast);
iovecs[1].iov_base = (void *)mcast_packed_msg_lens;
iovecs[1].iov_len = mcast_packed_msg_count * sizeof (unsigned short);
iovecs[2].iov_base = (void *)&fragmentation_data[0];
iovecs[2].iov_len = fragment_size;
(void)totemmrp_mcast (iovecs, 3, 0);
mcast_packed_msg_count = 0;
fragment_size = 0;
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
}
return (0);
}
/*
* Initialize the totem process group abstraction
*/
int totempg_initialize (
qb_loop_t *poll_handle,
struct totem_config *totem_config)
{
int res;
totempg_totem_config = totem_config;
totempg_log_level_security = totem_config->totem_logging_configuration.log_level_security;
totempg_log_level_error = totem_config->totem_logging_configuration.log_level_error;
totempg_log_level_warning = totem_config->totem_logging_configuration.log_level_warning;
totempg_log_level_notice = totem_config->totem_logging_configuration.log_level_notice;
totempg_log_level_debug = totem_config->totem_logging_configuration.log_level_debug;
totempg_log_printf = totem_config->totem_logging_configuration.log_printf;
totempg_subsys_id = totem_config->totem_logging_configuration.log_subsys_id;
fragmentation_data = malloc (TOTEMPG_PACKET_SIZE);
if (fragmentation_data == 0) {
return (-1);
}
totemsrp_net_mtu_adjust (totem_config);
res = totemmrp_initialize (
poll_handle,
totem_config,
&totempg_stats,
totempg_deliver_fn,
totempg_confchg_fn);
totemmrp_callback_token_create (
&callback_token_received_handle,
TOTEM_CALLBACK_TOKEN_RECEIVED,
0,
callback_token_received_fn,
0);
totempg_size_limit = (totemmrp_avail() - 1) *
(totempg_totem_config->net_mtu -
sizeof (struct totempg_mcast) - 16);
+ list_init (&totempg_groups_list);
+
return (res);
}
void totempg_finalize (void)
{
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
totemmrp_finalize ();
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
}
/*
* Multicast a message
*/
static int mcast_msg (
struct iovec *iovec_in,
unsigned int iov_len,
int guarantee)
{
int res = 0;
struct totempg_mcast mcast;
struct iovec iovecs[3];
struct iovec iovec[64];
int i;
int dest, src;
int max_packet_size = 0;
int copy_len = 0;
int copy_base = 0;
int total_size = 0;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&mcast_msg_mutex);
}
totemmrp_event_signal (TOTEM_EVENT_NEW_MSG, 1);
/*
* Remove zero length iovectors from the list
*/
assert (iov_len < 64);
for (dest = 0, src = 0; src < iov_len; src++) {
if (iovec_in[src].iov_len) {
memcpy (&iovec[dest++], &iovec_in[src],
sizeof (struct iovec));
}
}
iov_len = dest;
max_packet_size = TOTEMPG_PACKET_SIZE -
(sizeof (unsigned short) * (mcast_packed_msg_count + 1));
mcast_packed_msg_lens[mcast_packed_msg_count] = 0;
/*
* Check if we would overwrite new message queue
*/
for (i = 0; i < iov_len; i++) {
total_size += iovec[i].iov_len;
}
if (byte_count_send_ok (total_size + sizeof(unsigned short) *
(mcast_packed_msg_count)) == 0) {
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
}
return(-1);
}
mcast.header.version = 0;
for (i = 0; i < iov_len; ) {
mcast.fragmented = 0;
mcast.continuation = fragment_continuation;
copy_len = iovec[i].iov_len - copy_base;
/*
* If it all fits with room left over, copy it in.
* We need to leave at least sizeof(short) + 1 bytes in the
* fragment_buffer on exit so that max_packet_size + fragment_size
* doesn't exceed the size of the fragment_buffer on the next call.
*/
if ((copy_len + fragment_size) <
(max_packet_size - sizeof (unsigned short))) {
memcpy (&fragmentation_data[fragment_size],
(char *)iovec[i].iov_base + copy_base, copy_len);
fragment_size += copy_len;
mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len;
next_fragment = 1;
copy_len = 0;
copy_base = 0;
i++;
continue;
/*
* If it just fits or is too big, then send out what fits.
*/
} else {
unsigned char *data_ptr;
copy_len = min(copy_len, max_packet_size - fragment_size);
if( copy_len == max_packet_size )
data_ptr = (unsigned char *)iovec[i].iov_base + copy_base;
else {
data_ptr = fragmentation_data;
memcpy (&fragmentation_data[fragment_size],
(unsigned char *)iovec[i].iov_base + copy_base, copy_len);
}
memcpy (&fragmentation_data[fragment_size],
(unsigned char *)iovec[i].iov_base + copy_base, copy_len);
mcast_packed_msg_lens[mcast_packed_msg_count] += copy_len;
/*
* if we're not on the last iovec or the iovec is too large to
* fit, then indicate a fragment. This also means that the next
* message will have the continuation of this one.
*/
if ((i < (iov_len - 1)) ||
((copy_base + copy_len) < iovec[i].iov_len)) {
if (!next_fragment) {
next_fragment++;
}
fragment_continuation = next_fragment;
mcast.fragmented = next_fragment++;
assert(fragment_continuation != 0);
assert(mcast.fragmented != 0);
} else {
fragment_continuation = 0;
}
/*
* assemble the message and send it
*/
mcast.msg_count = ++mcast_packed_msg_count;
iovecs[0].iov_base = (void *)&mcast;
iovecs[0].iov_len = sizeof(struct totempg_mcast);
iovecs[1].iov_base = (void *)mcast_packed_msg_lens;
iovecs[1].iov_len = mcast_packed_msg_count *
sizeof(unsigned short);
iovecs[2].iov_base = (void *)data_ptr;
iovecs[2].iov_len = max_packet_size;
assert (totemmrp_avail() > 0);
res = totemmrp_mcast (iovecs, 3, guarantee);
if (res == -1) {
goto error_exit;
}
/*
* Recalculate counts and indexes for the next.
*/
mcast_packed_msg_lens[0] = 0;
mcast_packed_msg_count = 0;
fragment_size = 0;
max_packet_size = TOTEMPG_PACKET_SIZE - (sizeof(unsigned short));
/*
* If the iovec all fit, go to the next iovec
*/
if ((copy_base + copy_len) == iovec[i].iov_len) {
copy_len = 0;
copy_base = 0;
i++;
/*
* Continue with the rest of the current iovec.
*/
} else {
copy_base += copy_len;
}
}
}
/*
* Bump only if we added message data. This may be zero if
* the last buffer just fit into the fragmentation_data buffer
* and we were at the last iovec.
*/
if (mcast_packed_msg_lens[mcast_packed_msg_count]) {
mcast_packed_msg_count++;
}
error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
}
return (res);
}
/*
* Determine if a message of msg_size could be queued
*/
static int msg_count_send_ok (
int msg_count)
{
int avail = 0;
avail = totemmrp_avail ();
totempg_stats.msg_queue_avail = avail;
return ((avail - totempg_reserved) > msg_count);
}
static int byte_count_send_ok (
int byte_count)
{
unsigned int msg_count = 0;
int avail = 0;
avail = totemmrp_avail ();
msg_count = (byte_count / (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16)) + 1;
return (avail >= msg_count);
}
static int send_reserve (
int msg_size)
{
unsigned int msg_count = 0;
msg_count = (msg_size / (totempg_totem_config->net_mtu - sizeof (struct totempg_mcast) - 16)) + 1;
totempg_reserved += msg_count;
totempg_stats.msg_reserved = totempg_reserved;
return (msg_count);
}
static void send_release (
int msg_count)
{
totempg_reserved -= msg_count;
totempg_stats.msg_reserved = totempg_reserved;
}
int totempg_callback_token_create (
void **handle_out,
enum totem_callback_token_type type,
int delete,
int (*callback_fn) (enum totem_callback_token_type type, const void *),
const void *data)
{
unsigned int res;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&callback_token_mutex);
}
res = totemmrp_callback_token_create (handle_out, type, delete,
callback_fn, data);
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&callback_token_mutex);
}
return (res);
}
void totempg_callback_token_destroy (
void *handle_out)
{
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&callback_token_mutex);
}
totemmrp_callback_token_destroy (handle_out);
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&callback_token_mutex);
}
}
/*
* vi: set autoindent tabstop=4 shiftwidth=4 :
*/
int totempg_groups_initialize (
- hdb_handle_t *handle,
+ void **totempg_groups_instance,
void (*deliver_fn) (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required),
void (*confchg_fn) (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id))
{
struct totempg_group_instance *instance;
- unsigned int res;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
- res = hdb_handle_create (&totempg_groups_instance_database,
- sizeof (struct totempg_group_instance), handle);
- if (res != 0) {
+
+ instance = malloc (sizeof (struct totempg_group_instance));
+ if (instance == NULL) {
goto error_exit;
}
- if (*handle > totempg_max_handle) {
- totempg_max_handle = *handle;
- }
-
- res = hdb_handle_get (&totempg_groups_instance_database, *handle,
- (void *)&instance);
- if (res != 0) {
- goto error_destroy;
- }
-
instance->deliver_fn = deliver_fn;
instance->confchg_fn = confchg_fn;
instance->groups = 0;
instance->groups_cnt = 0;
instance->q_level = QB_LOOP_MED;
-
-
- hdb_handle_put (&totempg_groups_instance_database, *handle);
+ list_init (&instance->list);
+ list_add (&instance->list, &totempg_groups_list);
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
+ *totempg_groups_instance = instance;
return (0);
-error_destroy:
- hdb_handle_destroy (&totempg_groups_instance_database, *handle);
error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
return (-1);
}
int totempg_groups_join (
- hdb_handle_t handle,
+ void *totempg_groups_instance,
const struct totempg_group *groups,
size_t group_cnt)
{
- struct totempg_group_instance *instance;
+ struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
struct totempg_group *new_groups;
unsigned int res;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
- res = hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance);
- if (res != 0) {
- goto error_exit;
- }
-
new_groups = realloc (instance->groups,
sizeof (struct totempg_group) *
(instance->groups_cnt + group_cnt));
if (new_groups == 0) {
res = ENOMEM;
goto error_exit;
}
memcpy (&new_groups[instance->groups_cnt],
groups, group_cnt * sizeof (struct totempg_group));
instance->groups = new_groups;
instance->groups_cnt += group_cnt;
- hdb_handle_put (&totempg_groups_instance_database, handle);
-
error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
return (res);
}
int totempg_groups_leave (
- hdb_handle_t handle,
+ void *totempg_groups_instance,
const struct totempg_group *groups,
size_t group_cnt)
{
- struct totempg_group_instance *instance;
- unsigned int res;
-
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
- res = hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance);
- if (res != 0) {
- goto error_exit;
- }
- hdb_handle_put (&totempg_groups_instance_database, handle);
-
-error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
- return (res);
+ return (0);
}
#define MAX_IOVECS_FROM_APP 32
#define MAX_GROUPS_PER_MSG 32
int totempg_groups_mcast_joined (
- hdb_handle_t handle,
+ void *totempg_groups_instance,
const struct iovec *iovec,
unsigned int iov_len,
int guarantee)
{
- struct totempg_group_instance *instance;
+ struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
unsigned short group_len[MAX_GROUPS_PER_MSG + 1];
struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP];
int i;
unsigned int res;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
- res = hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance);
- if (res != 0) {
- goto error_exit;
- }
-
/*
* Build group_len structure and the iovec_mcast structure
*/
group_len[0] = instance->groups_cnt;
for (i = 0; i < instance->groups_cnt; i++) {
group_len[i + 1] = instance->groups[i].group_len;
iovec_mcast[i + 1].iov_len = instance->groups[i].group_len;
iovec_mcast[i + 1].iov_base = (void *) instance->groups[i].group;
}
iovec_mcast[0].iov_len = (instance->groups_cnt + 1) * sizeof (unsigned short);
iovec_mcast[0].iov_base = group_len;
for (i = 0; i < iov_len; i++) {
iovec_mcast[i + instance->groups_cnt + 1].iov_len = iovec[i].iov_len;
iovec_mcast[i + instance->groups_cnt + 1].iov_base = iovec[i].iov_base;
}
res = mcast_msg (iovec_mcast, iov_len + instance->groups_cnt + 1, guarantee);
- hdb_handle_put (&totempg_groups_instance_database, handle);
-error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
return (res);
}
-static void check_q_level(struct totempg_group_instance *instance)
+static void check_q_level(
+ void *totempg_groups_instance)
{
int32_t old_level;
int32_t percent_used = 0;
+ struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
old_level = instance->q_level;
percent_used = 100 - (totemmrp_avail () * 100 / 800); /*(1024*1024/1500)*/
if (percent_used > 90 && instance->q_level != TOTEM_Q_LEVEL_CRITICAL) {
instance->q_level = TOTEM_Q_LEVEL_CRITICAL;
} else if (percent_used < 30 && instance->q_level != TOTEM_Q_LEVEL_LOW) {
instance->q_level = TOTEM_Q_LEVEL_LOW;
} else if (percent_used > 40 && percent_used < 60 && instance->q_level != TOTEM_Q_LEVEL_GOOD) {
instance->q_level = TOTEM_Q_LEVEL_GOOD;
} else if (percent_used > 70 && percent_used < 80 && instance->q_level != TOTEM_Q_LEVEL_HIGH) {
instance->q_level = TOTEM_Q_LEVEL_HIGH;
}
if (totem_queue_level_changed && old_level != instance->q_level) {
totem_queue_level_changed(instance->q_level);
}
}
-void totempg_check_q_level(qb_handle_t handle)
+void totempg_check_q_level(
+ void *totempg_groups_instance)
{
- struct totempg_group_instance *instance;
+ struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
- if (hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance) != 0) {
- return;
- }
check_q_level(instance);
-
- hdb_handle_put (&totempg_groups_instance_database, handle);
}
int totempg_groups_joined_reserve (
- hdb_handle_t handle,
+ void *totempg_groups_instance,
const struct iovec *iovec,
unsigned int iov_len)
{
- struct totempg_group_instance *instance;
+ struct totempg_group_instance *instance = (struct totempg_group_instance *)totempg_groups_instance;
unsigned int size = 0;
unsigned int i;
- unsigned int res;
unsigned int reserved = 0;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
pthread_mutex_lock (&mcast_msg_mutex);
}
- res = hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance);
- if (res != 0) {
- goto error_exit;
- }
for (i = 0; i < instance->groups_cnt; i++) {
size += instance->groups[i].group_len;
}
for (i = 0; i < iov_len; i++) {
size += iovec[i].iov_len;
}
check_q_level(instance);
if (size >= totempg_size_limit) {
reserved = -1;
- goto error_put;
+ goto error_exit;
}
reserved = send_reserve (size);
if (msg_count_send_ok (reserved) == 0) {
send_release (reserved);
reserved = 0;
}
-error_put:
- hdb_handle_put (&totempg_groups_instance_database, handle);
error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
pthread_mutex_unlock (&totempg_mutex);
}
return (reserved);
}
int totempg_groups_joined_release (int msg_count)
{
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
pthread_mutex_lock (&mcast_msg_mutex);
}
send_release (msg_count);
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&mcast_msg_mutex);
pthread_mutex_unlock (&totempg_mutex);
}
return 0;
}
int totempg_groups_mcast_groups (
- hdb_handle_t handle,
+ void *totempg_groups_instance,
int guarantee,
const struct totempg_group *groups,
size_t groups_cnt,
const struct iovec *iovec,
unsigned int iov_len)
{
- struct totempg_group_instance *instance;
unsigned short group_len[MAX_GROUPS_PER_MSG + 1];
struct iovec iovec_mcast[MAX_GROUPS_PER_MSG + 1 + MAX_IOVECS_FROM_APP];
int i;
unsigned int res;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
- res = hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance);
- if (res != 0) {
- goto error_exit;
- }
/*
* Build group_len structure and the iovec_mcast structure
*/
group_len[0] = groups_cnt;
for (i = 0; i < groups_cnt; i++) {
group_len[i + 1] = groups[i].group_len;
iovec_mcast[i + 1].iov_len = groups[i].group_len;
iovec_mcast[i + 1].iov_base = (void *) groups[i].group;
}
iovec_mcast[0].iov_len = (groups_cnt + 1) * sizeof (unsigned short);
iovec_mcast[0].iov_base = group_len;
for (i = 0; i < iov_len; i++) {
iovec_mcast[i + groups_cnt + 1].iov_len = iovec[i].iov_len;
iovec_mcast[i + groups_cnt + 1].iov_base = iovec[i].iov_base;
}
res = mcast_msg (iovec_mcast, iov_len + groups_cnt + 1, guarantee);
- hdb_handle_put (&totempg_groups_instance_database, handle);
-
-error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
return (res);
}
/*
* Returns -1 if error, 0 if can't send, 1 if can send the message
*/
int totempg_groups_send_ok_groups (
- hdb_handle_t handle,
+ void *totempg_groups_instance,
const struct totempg_group *groups,
size_t groups_cnt,
const struct iovec *iovec,
unsigned int iov_len)
{
- struct totempg_group_instance *instance;
unsigned int size = 0;
unsigned int i;
unsigned int res;
if (totempg_threaded_mode == 1) {
pthread_mutex_lock (&totempg_mutex);
}
- res = hdb_handle_get (&totempg_groups_instance_database, handle,
- (void *)&instance);
- if (res != 0) {
- goto error_exit;
- }
for (i = 0; i < groups_cnt; i++) {
size += groups[i].group_len;
}
for (i = 0; i < iov_len; i++) {
size += iovec[i].iov_len;
}
res = msg_count_send_ok (size);
- hdb_handle_put (&totempg_groups_instance_database, handle);
-error_exit:
if (totempg_threaded_mode == 1) {
pthread_mutex_unlock (&totempg_mutex);
}
return (res);
}
int totempg_ifaces_get (
unsigned int nodeid,
struct totem_ip_address *interfaces,
char ***status,
unsigned int *iface_count)
{
int res;
res = totemmrp_ifaces_get (
nodeid,
interfaces,
status,
iface_count);
return (res);
}
void totempg_event_signal (enum totem_event_type type, int value)
{
totemmrp_event_signal (type, value);
}
void* totempg_get_stats (void)
{
return &totempg_stats;
}
int totempg_crypto_set (
unsigned int type)
{
int res;
res = totemmrp_crypto_set (
type);
return (res);
}
int totempg_ring_reenable (void)
{
int res;
res = totemmrp_ring_reenable ();
return (res);
}
const char *totempg_ifaces_print (unsigned int nodeid)
{
static char iface_string[256 * INTERFACE_MAX];
char one_iface[64];
struct totem_ip_address interfaces[INTERFACE_MAX];
char **status;
unsigned int iface_count;
unsigned int i;
int res;
iface_string[0] = '\0';
res = totempg_ifaces_get (nodeid, interfaces, &status, &iface_count);
if (res == -1) {
return ("no interface found for nodeid");
}
for (i = 0; i < iface_count; i++) {
sprintf (one_iface, "r(%d) ip(%s) ",
i, totemip_print (&interfaces[i]));
strcat (iface_string, one_iface);
}
return (iface_string);
}
unsigned int totempg_my_nodeid_get (void)
{
return (totemmrp_my_nodeid_get());
}
int totempg_my_family_get (void)
{
return (totemmrp_my_family_get());
}
extern void totempg_service_ready_register (
void (*totem_service_ready) (void))
{
totemmrp_service_ready_register (totem_service_ready);
}
void totempg_queue_level_register_callback (totem_queue_level_changed_fn fn)
{
totem_queue_level_changed = fn;
}
extern int totempg_member_add (
const struct totem_ip_address *member,
int ring_no);
extern int totempg_member_remove (
const struct totem_ip_address *member,
int ring_no);
void totempg_threaded_mode_enable (void)
{
totempg_threaded_mode = 1;
totemmrp_threaded_mode_enable ();
}
diff --git a/include/corosync/totem/totempg.h b/include/corosync/totem/totempg.h
index 30ccfe46..9fda82ea 100644
--- a/include/corosync/totem/totempg.h
+++ b/include/corosync/totem/totempg.h
@@ -1,190 +1,188 @@
/*
* Copyright (c) 2003-2005 MontaVista Software, Inc.
* Copyright (c) 2006-2007, 2009 Red Hat, Inc.
*
* All rights reserved.
*
* Author: Steven Dake (sdake@redhat.com)
*
* This software licensed under BSD license, the text of which follows:
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* - Redistributions of source code must retain the above copyright notice,
* this list of conditions and the following disclaimer.
* - Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
* - Neither the name of the MontaVista Software, Inc. nor the names of its
* contributors may be used to endorse or promote products derived from this
* software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
* THE POSSIBILITY OF SUCH DAMAGE.
*/
/**
* @file
* Totem Single Ring Protocol
*
* depends on poll abstraction, POSIX, IPV4
*/
#ifndef TOTEMPG_H_DEFINED
#define TOTEMPG_H_DEFINED
#ifdef __cplusplus
extern "C" {
#endif
#include <netinet/in.h>
#include "totem.h"
-#include <corosync/hdb.h>
#include <qb/qbloop.h>
struct totempg_group {
const void *group;
size_t group_len;
};
#define TOTEMPG_AGREED 0
#define TOTEMPG_SAFE 1
/**
* Initialize the totem process groups abstraction
*/
extern int totempg_initialize (
qb_loop_t* poll_handle,
struct totem_config *totem_config
);
extern void totempg_finalize (void);
extern int totempg_callback_token_create (void **handle_out,
enum totem_callback_token_type type,
int delete,
int (*callback_fn) (enum totem_callback_token_type type, const void *),
const void *data);
extern void totempg_callback_token_destroy (void *handle);
/**
* Initialize a groups instance
*/
extern int totempg_groups_initialize (
- hdb_handle_t *handle,
+ void **instance,
void (*deliver_fn) (
unsigned int nodeid,
const void *msg,
unsigned int msg_len,
int endian_conversion_required),
void (*confchg_fn) (
enum totem_configuration_type configuration_type,
const unsigned int *member_list, size_t member_list_entries,
const unsigned int *left_list, size_t left_list_entries,
const unsigned int *joined_list, size_t joined_list_entries,
const struct memb_ring_id *ring_id));
-extern int totempg_groups_finalize (
- hdb_handle_t handle);
+extern int totempg_groups_finalize (void *instance);
extern int totempg_groups_join (
- hdb_handle_t handle,
+ void *instance,
const struct totempg_group *groups,
size_t group_cnt);
extern int totempg_groups_leave (
- hdb_handle_t handle,
+ void *instance,
const struct totempg_group *groups,
size_t group_cnt);
extern int totempg_groups_mcast_joined (
- hdb_handle_t handle,
+ void *instance,
const struct iovec *iovec,
unsigned int iov_len,
int guarantee);
extern int totempg_groups_joined_reserve (
- hdb_handle_t handle,
+ void *instance,
const struct iovec *iovec,
unsigned int iov_len);
extern int totempg_groups_joined_release (
int msg_count);
extern int totempg_groups_mcast_groups (
- hdb_handle_t handle,
+ void *instance,
int guarantee,
const struct totempg_group *groups,
size_t groups_cnt,
const struct iovec *iovec,
unsigned int iov_len);
extern int totempg_groups_send_ok_groups (
- hdb_handle_t handle,
+ void *instance,
const struct totempg_group *groups,
size_t groups_cnt,
const struct iovec *iovec,
unsigned int iov_len);
extern int totempg_ifaces_get (
unsigned int nodeid,
struct totem_ip_address *interfaces,
char ***status,
unsigned int *iface_count);
extern void* totempg_get_stats (void);
void totempg_event_signal (enum totem_event_type type, int value);
extern const char *totempg_ifaces_print (unsigned int nodeid);
extern unsigned int totempg_my_nodeid_get (void);
extern int totempg_my_family_get (void);
extern int totempg_crypto_set (unsigned int type);
extern int totempg_ring_reenable (void);
extern void totempg_service_ready_register (
void (*totem_service_ready) (void));
extern int totempg_member_add (
const struct totem_ip_address *member,
int ring_no);
extern int totempg_member_remove (
const struct totem_ip_address *member,
int ring_no);
enum totem_q_level {
TOTEM_Q_LEVEL_LOW,
TOTEM_Q_LEVEL_GOOD,
TOTEM_Q_LEVEL_HIGH,
TOTEM_Q_LEVEL_CRITICAL
};
-void totempg_check_q_level(hdb_handle_t handle);
+void totempg_check_q_level(void *instance);
typedef void (*totem_queue_level_changed_fn) (enum totem_q_level level);
extern void totempg_queue_level_register_callback (totem_queue_level_changed_fn);
#ifdef __cplusplus
}
#endif
#endif /* TOTEMPG_H_DEFINED */
File Metadata
Details
Attached
Mime Type
text/x-diff
Expires
Wed, Feb 26, 10:41 PM (12 h, 23 m ago)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
1465834
Default Alt Text
(127 KB)
Attached To
Mode
rC Corosync
Attached
Detach File
Event Timeline
Log In to Comment