No OneTemporary
Actions

Size

92 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/fence/fenced/cpg.c b/fence/fenced/cpg.c
	index 5b6c8269c..af0cb5a02 100644
	--- a/fence/fenced/cpg.c
	+++ b/fence/fenced/cpg.c
	@@ -1,1554 +1,1575 @@
	#include "fd.h"
	#include "config.h"

	/* Three-part protocol version that fd_send_message() writes into every
	   outgoing header; deliver_cb() rejects a message whose version[0]
	   differs from protocol_active[0]. */
	static unsigned int protocol_active[3] = {1, 0, 0};
	/* Daemon-wide cpg handle and member list. They are not referenced in
	   the part of the file visible here -- presumably used by daemon cpg
	   code further down (TODO confirm). */
	static cpg_handle_t cpg_handle_daemon;
	static struct cpg_address daemon_member_list[MAX_NODES];
	static int daemon_member_list_entries;

	/* One node as seen by a single struct change. add_change() allocates
	   one of these for each entry of the cpg member list (linked on
	   cg->members) and for each entry of the left list (linked on
	   cg->removed). */
	struct member {
	struct list_head list; /* links into cg->members or cg->removed */
	int nodeid;
	int start; /* 1 if we received a start message for this change */
	int added; /* 1 if added by this change */
	int failed; /* 1 if failed in this change */
	int disallowed; /* set by receive_start() when an added node reports a non-zero started_count */
	uint32_t start_flags; /* hd->flags copied from the node's start message */
	};

	/* fd_info and id_info: for syncing state in start message */

	/* Fixed-size part of a start/complete message body. send_info() places
	   it right after the fd_header and follows it with id_info_count
	   id_info records. send_info() stores every field little-endian and
	   fd_info_in() converts the fields back in place. */
	struct fd_info {
	uint32_t fd_info_size; /* sender's sizeof(struct fd_info) */
	uint32_t id_info_size; /* sender's sizeof(struct id_info); receivers step through the records by this */
	uint32_t id_info_count; /* number of id_info records that follow */

	uint32_t started_count; /* sender's fd->started_count; zero means "never started" */

	/* counts of the change the message was sent for; match_change()
	   compares them against the cg->*_count fields */
	int member_count;
	int joined_count;
	int remove_count;
	int failed_count;
	};

	/* id_info.flags bit: send_info() sets it when the node is on the
	   members list of the change the message describes. */
	#define IDI_NODEID_IS_MEMBER 0x00000001

	/* Per-node record carried in start/complete messages (one per
	   node_history entry of the sender) and, singly, in victim_done
	   messages. Fields are little-endian on the wire; id_info_in()
	   converts them in place. */
	struct id_info {
	int nodeid;
	uint32_t flags; /* IDI_* bits */

	/* the following syncs info to make queries useful from all nodes */

	int fence_external_node;
	int fence_master;
	int fence_how;
	uint64_t fence_time;
	uint64_t fence_external_time;
	};

	/* Convert a received fd_info from little-endian wire order to host
	   order, in place. Every field is a 32-bit quantity. */
	static void fd_info_in(struct fd_info *fi)
	{
		/* counts describing the change */
		fi->failed_count = le32_to_cpu(fi->failed_count);
		fi->remove_count = le32_to_cpu(fi->remove_count);
		fi->joined_count = le32_to_cpu(fi->joined_count);
		fi->member_count = le32_to_cpu(fi->member_count);

		/* sender state */
		fi->started_count = le32_to_cpu(fi->started_count);

		/* layout of the id_info records that follow */
		fi->id_info_count = le32_to_cpu(fi->id_info_count);
		fi->id_info_size = le32_to_cpu(fi->id_info_size);
		fi->fd_info_size = le32_to_cpu(fi->fd_info_size);
	}

	/* Convert one received id_info record from little-endian wire order to
	   host order, in place. */
	static void id_info_in(struct id_info *id)
	{
		/* 64-bit timestamps */
		id->fence_external_time = le64_to_cpu(id->fence_external_time);
		id->fence_time = le64_to_cpu(id->fence_time);

		/* 32-bit fields */
		id->fence_how = le32_to_cpu(id->fence_how);
		id->fence_master = le32_to_cpu(id->fence_master);
		id->fence_external_node = le32_to_cpu(id->fence_external_node);
		id->flags = le32_to_cpu(id->flags);
		id->nodeid = le32_to_cpu(id->nodeid);
	}

	static void ids_in(struct fd_info fi, struct id_info ids)
	{
	struct id_info *id;
	int i;

	id = ids;
	for (i = 0; i < fi->id_info_count; i++) {
	id_info_in(id);
	id = (struct id_info )((char )id + fi->id_info_size);
	}
	}

	/* Map an FD_MSG_* message type to a short name for log output;
	   unrecognised types yield "unknown". */
	static char *msg_name(int type)
	{
		if (type == FD_MSG_START)
			return "start";
		if (type == FD_MSG_VICTIM_DONE)
			return "victim_done";
		if (type == FD_MSG_COMPLETE)
			return "complete";
		if (type == FD_MSG_EXTERNAL)
			return "external";
		return "unknown";
	}

	/* Multicast buf (len bytes) to the cpg identified by h.

	   The call is repeated, with a 1 ms sleep in between, for as long as
	   cpg_mcast_joined() returns CPG_ERR_TRY_AGAIN; every 100th retry is
	   logged as an error. type is only used to name the message in logs.

	   Returns 0 on success, -1 on any other cpg error. */
	static int _send_message(cpg_handle_t h, void *buf, int len, int type)
	{
		struct iovec iov = { .iov_base = buf, .iov_len = len };
		cpg_error_t error;
		int retries = 0;

		for (;;) {
			error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
			if (error != CPG_ERR_TRY_AGAIN)
				break;

			retries++;
			usleep(1000);
			if (retries % 100 == 0) {
				log_error("cpg_mcast_joined retry %d %s",
					  retries, msg_name(type));
			}
		}

		if (error != CPG_OK) {
			log_error("cpg_mcast_joined error %d handle %llx %s",
				  error, (unsigned long long)h, msg_name(type));
			return -1;
		}

		if (retries) {
			log_debug("cpg_mcast_joined retried %d %s",
				  retries, msg_name(type));
		}

		return 0;
	}

	/* header fields caller needs to set: type, to_nodeid, flags, msgdata */

	static void fd_send_message(struct fd fd, char buf, int len)
	{
	struct fd_header hd = (struct fd_header ) buf;
	int type = hd->type;

	hd->version[0] = cpu_to_le16(protocol_active[0]);
	hd->version[1] = cpu_to_le16(protocol_active[1]);
	hd->version[2] = cpu_to_le16(protocol_active[2]);
	hd->type = cpu_to_le16(hd->type);
	hd->nodeid = cpu_to_le32(our_nodeid);
	hd->to_nodeid = cpu_to_le32(hd->to_nodeid);
	hd->flags = cpu_to_le32(hd->flags);
	hd->msgdata = cpu_to_le32(hd->msgdata);

	_send_message(fd->cpg_handle, buf, len, type);
	}

	static struct member find_memb(struct change cg, int nodeid)
	{
	struct member *memb;

	list_for_each_entry(memb, &cg->members, list) {
	if (memb->nodeid == nodeid)
	return memb;
	}
	return NULL;
	}

	/* Return the fence domain on the global domains list whose cpg_handle
	   equals h, or NULL when there is none. */
	static struct fd *find_fd_handle(cpg_handle_t h)
	{
		struct fd *cur;
		struct fd *match = NULL;

		list_for_each_entry(cur, &domains, list) {
			if (cur->cpg_handle == h) {
				match = cur;
				break;
			}
		}
		return match;
	}

	/* Return the fence domain on the global domains list whose cpg_client
	   index equals ci, or NULL when there is none. */
	static struct fd *find_fd_ci(int ci)
	{
		struct fd *cur;
		struct fd *match = NULL;

		list_for_each_entry(cur, &domains, list) {
			if (cur->cpg_client == ci) {
				match = cur;
				break;
			}
		}
		return match;
	}

	/* Free a change together with every member entry on its members and
	   removed lists. cg must be a valid change; it is not unlinked from
	   any list it may itself be on -- callers do that first. */
	void free_cg(struct change *cg)
	{
		struct member *memb, *safe;

		list_for_each_entry_safe(memb, safe, &cg->members, list) {
			list_del(&memb->list);
			free(memb);
		}
		list_for_each_entry_safe(memb, safe, &cg->removed, list) {
			list_del(&memb->list);
			free(memb);
		}
		free(cg);
	}

	static struct node_history get_node_history(struct fd fd, int nodeid)
	{
	struct node_history *node;

	list_for_each_entry(node, &fd->node_history, list) {
	if (node->nodeid == nodeid)
	return node;
	}
	return NULL;
	}

	/* Make sure fd has a node_history entry for nodeid. An existing entry
	   is left untouched; otherwise a zeroed entry is appended to
	   fd->node_history. An allocation failure is logged and the entry is
	   simply not created. */
	static void node_history_init(struct fd *fd, int nodeid)
	{
		struct node_history *node;

		if (get_node_history(fd, nodeid))
			return;

		node = malloc(sizeof(*node));
		if (!node) {
			/* later node_history_*() calls will report "no nodeid" */
			log_error("node_history_init no mem nodeid %d", nodeid);
			return;
		}
		memset(node, 0, sizeof(*node));

		node->nodeid = nodeid;
		list_add_tail(&node->list, &fd->node_history);
	}

	/* Stamp the current time as add_time in nodeid's history entry;
	   logs an error when the domain has no entry for the node. */
	static void node_history_start(struct fd *fd, int nodeid)
	{
		struct node_history *nh = get_node_history(fd, nodeid);

		if (nh) {
			nh->add_time = time(NULL);
			return;
		}

		log_error("node_history_start no nodeid %d", nodeid);
	}

	/* Stamp the current time as left_time in nodeid's history entry;
	   logs an error when the domain has no entry for the node. */
	static void node_history_left(struct fd *fd, int nodeid)
	{
		struct node_history *nh = get_node_history(fd, nodeid);

		if (nh) {
			nh->left_time = time(NULL);
			return;
		}

		log_error("node_history_left no nodeid %d", nodeid);
	}

	/* Stamp the current time as fail_time in nodeid's history entry and
	   set check_quorum, which check_quorum_done() later examines. Logs an
	   error when the domain has no entry for the node. */
	static void node_history_fail(struct fd *fd, int nodeid)
	{
		struct node_history *nh = get_node_history(fd, nodeid);

		if (nh) {
			nh->fail_time = time(NULL);
			nh->check_quorum = 1;
			return;
		}

		log_error("node_history_fail no nodeid %d", nodeid);
	}

	/* Record who fenced victim, how, and when (by the master's clock).
	   The master calls this itself when it fences the victim; the other
	   domain members call it when the master's status message arrives.
	   Logs an error when the domain has no history entry for victim. */

	void node_history_fence(struct fd *fd, int victim, int master, int how,
				uint64_t mastertime)
	{
		struct node_history *nh = get_node_history(fd, victim);

		if (nh) {
			nh->fence_how = how;
			nh->fence_time = mastertime;
			nh->fence_master = master;
			return;
		}

		log_error("node_history_fence no nodeid %d", victim);
	}

	/* Record that nodeid was fenced from outside fenced.

	   The fence_node command first fences the victim through
	   libfence:fence_node(victim) and should then report it with
	   libfenced:fence_external(victim). That report is broadcast to all
	   domain members, each of which updates its node_history entry for the
	   victim here. recover.c:fence_victims() can then check whether a
	   victim was externally fenced since its last add_time and skip
	   fencing it again. This is not watertight: in some circumstances
	   fenced may still fence the node a second time.

	   from is the nodeid that sent the report. */

	static void node_history_fence_external(struct fd *fd, int nodeid, int from)
	{
		struct node_history *nh = get_node_history(fd, nodeid);

		if (nh) {
			nh->fence_external_node = from;
			nh->fence_external_time = time(NULL);
			return;
		}

		log_error("node_history_fence_external no nodeid %d", nodeid);
	}

	static void save_history(struct fd fd, struct fd_info fi, struct id_info *ids)
	{
	struct node_history *node;
	struct id_info *id;
	int i;

	id = ids;

	for (i = 0; i < fi->id_info_count; i++) {
	node = get_node_history(fd, id->nodeid);
	if (!node)
	goto next;

	if (!node->fence_time && id->fence_time) {
	node->fence_master = id->fence_master;
	node->fence_time = id->fence_time;
	node->fence_how = id->fence_how;
	log_debug("save_history %d master %d time %llu how %d",
	node->nodeid, node->fence_master,
	(unsigned long long)node->fence_time,
	node->fence_how);
	}

	if (!node->fence_external_time && id->fence_external_time) {
	node->fence_external_time = id->fence_external_time;
	node->fence_external_node = id->fence_external_node;
	log_debug("save_history %d ext node %d ext time %llu",
	node->nodeid, node->fence_external_node,
	(unsigned long long)node->fence_external_time);
	}
	next:
	id = (struct id_info )((char )id + fi->id_info_size);
	}
	}

	/* call this from libfenced:fenced_external() */

	/* Broadcast an FD_MSG_EXTERNAL message to the domain. The message is
	   a bare fd_header whose msgdata carries the victim's nodeid;
	   receive_external() handles it on each member. On allocation failure
	   nothing is sent. */
	void send_external(struct fd *fd, int victim)
	{
	struct fd_header *hd;
	char *buf;
	int len;

	len = sizeof(struct fd_header);

	buf = malloc(len);
	if (!buf) {
	+ log_error("send_external no mem len %d", len);
	return;
	}
	memset(buf, 0, len);

	/* fd_send_message() fills in the remaining header fields */
	hd = (struct fd_header *)buf;
	hd->type = FD_MSG_EXTERNAL;
	hd->msgdata = victim;

	- log_debug("send_external %u", victim);
	+ log_debug("send_external victim nodeid %u", victim);

	fd_send_message(fd, buf, len);

	free(buf);
	}

	/* now, if the victim dies and the fence domain sees it fail,
	   it will be added as an fd victim, but fence_victims() will
	   call is_fenced_external() which will see that it's already
	   fenced and bypass fencing it again */

	/* Handle an FD_MSG_EXTERNAL message: record in our node history that
	   node hd->msgdata was fenced externally, as reported by hd->nodeid.
	   NOTE(review): fd and hd are dereferenced with "->" below, so the
	   parameters are presumably "struct fd *fd, struct fd_header *hd";
	   the '*' characters look stripped by the rendering of this diff --
	   confirm against the repository. */
	static void receive_external(struct fd fd, struct fd_header hd, int len)
	{
	- log_debug("receive_external from %d len %d victim %d",
	+ log_debug("receive_external from %d len %d victim nodeid %d",
	hd->nodeid, len, hd->msgdata);

	node_history_fence_external(fd, hd->msgdata, hd->nodeid);
	}

	/* Return 1 when nodeid's recorded external fence time is later than
	   its add_time, 0 otherwise. A node without a history entry is logged
	   as an error and reported as not fenced (0). */
	int is_fenced_external(struct fd *fd, int nodeid)
	{
		struct node_history *nh = get_node_history(fd, nodeid);

		if (!nh) {
			log_error("is_fenced_external no nodeid %d", nodeid);
			return 0;
		}

		return nh->fence_external_time > nh->add_time;
	}

	/* completed victim must be removed from victims list before calling this
	   because we count the number of entries on the victims list for remaining */

	/* Broadcast an FD_MSG_VICTIM_DONE message for victim. The body is an
	   fd_header (msgdata = seq of the first change on fd->changes)
	   followed by one id_info carrying the victim's nodeid, our nodeid as
	   fence_master, and the fence_time/fence_how from the victim's
	   node_history entry. */
	void send_victim_done(struct fd *fd, int victim)
	{
	struct change *cg = list_first_entry(&fd->changes, struct change, list);
	struct fd_header *hd;
	struct id_info *id;
	struct node_history *node;
	char *buf;
	int len;

	len = sizeof(struct fd_header) + sizeof(struct id_info);

	buf = malloc(len);
	if (!buf) {
	log_error("send_victim_done no mem len %d", len);
	return;
	}
	memset(buf, 0, len);

	hd = (struct fd_header *)buf;
	hd->type = FD_MSG_VICTIM_DONE;
	hd->msgdata = cg->seq;

	if (fd->init_complete)
	hd->flags \|= FD_MFLG_COMPLETE;

	node = get_node_history(fd, victim);
	if (!node) {
	- log_error("send_victim_done %d no node struct", victim);
	+ log_error("send_victim_done no nodeid %d", victim);
	/* NOTE(review): buf is not freed on this return path */
	return;
	}

	/* id_info fields are converted here; fd_send_message() converts the header */
	id = (struct id_info *)(buf + sizeof(struct fd_header));
	id->nodeid = cpu_to_le32(victim);
	id->fence_master = cpu_to_le32(our_nodeid);
	id->fence_time = cpu_to_le64(node->fence_time);
	id->fence_how = cpu_to_le32(node->fence_how);

	- log_debug("send_victim_done %u flags %x victim %d",
	+ log_debug("send_victim_done cg %u flags %x victim nodeid %d",
	cg->seq, hd->flags, victim);

	fd_send_message(fd, buf, len);

	free(buf);
	}

	/* Handle an FD_MSG_VICTIM_DONE message: unless we sent it ourselves,
	   record the fencing details from the attached id_info in our node
	   history and drop the matching node from fd->victims.
	   NOTE(review): fd and hd are used with "->" and the casts below lack
	   '*'; the pointer declarators look stripped by the rendering of this
	   diff -- confirm against the repository. */
	static void receive_victim_done(struct fd fd, struct fd_header hd, int len)
	{
	struct node *node;
	uint32_t seq = hd->msgdata;
	int found;
	struct id_info *id;

	log_debug("receive_victim_done %d:%u flags %x len %d", hd->nodeid, seq,
	hd->flags, len);

	/* check that hd->nodeids is fd->master ? */

	/* I don't think there's any problem with the master removing the
	   victim when it's done instead of waiting to remove it when it
	   receives its own victim_done message, like the other nodes do */

	if (hd->nodeid == our_nodeid)
	return;

	/* the id_info record follows the header; len is not checked against
	   the combined size before it is read */
	id = (struct id_info )((char )hd + sizeof(struct fd_header));
	id_info_in(id);

	found = 0;
	list_for_each_entry(node, &fd->victims, list) {
	if (node->nodeid == id->nodeid) {
	- log_debug("receive_victim_done remove %d how %d",
	+ log_debug("receive_victim_done remove nodeid %d how %d",
	id->nodeid, id->fence_how);
	node_history_fence(fd, id->nodeid, id->fence_master,
	id->fence_how, id->fence_time);
	list_del(&node->list);
	free(node);
	found = 1;
	/* node was just freed: leave the loop before the iterator advances */
	break;
	}
	}

	if (!found)
	- log_debug("receive_victim_done victim %d not found from %d",
	+ log_debug("receive_victim_done no nodeid %d from %d",
	id->nodeid, hd->nodeid);
	}

	/* Decide whether the quorum wait condition is satisfied.

	   cman_quorate is not trusted until cman has seen the same failures
	   we have. So every node flagged check_quorum must first have stopped
	   being a cman member; only then is cman_quorate consulted. This
	   assumes we get here and finish the checks before any failed node
	   can rejoin and become a cman member again (if that does not hold,
	   join/fail timestamps might be needed).

	   Returns 1 when all flagged nodes are gone from cman and the cluster
	   is quorate, 0 otherwise. */
	static int check_quorum_done(struct fd *fd)
	{
		struct node_history *nh;
		int pending = 0;

		list_for_each_entry(nh, &fd->node_history, list) {
			if (!nh->check_quorum)
				continue;

			if (is_cman_member(nh->nodeid)) {
				log_debug("check_quorum %d is_cman_member",
					  nh->nodeid);
				pending++;
			} else {
				nh->check_quorum = 0;
			}
		}

		if (pending)
			return 0;

		if (cman_quorate) {
			log_debug("check_quorum done");
			return 1;
		}

		log_debug("check_quorum not quorate");
		return 0;
	}

	/* Return 1 when every precondition for sending our start message is
	   met, else 0. The only condition checked is check_quorum_done(). */
	static int wait_conditions_done(struct fd *fd)
	{
		return check_quorum_done(fd) ? 1 : 0;
	}

	/* Return 1 once a start message has been recorded for every member of
	   the first change on fd->changes, 0 while any are outstanding. */
	static int wait_messages_done(struct fd *fd)
	{
		struct change *cg = list_first_entry(&fd->changes, struct change, list);
		struct member *m;
		int total = 0;
		int missing = 0;

		list_for_each_entry(m, &cg->members, list) {
			total++;
			if (!m->start)
				missing++;
		}

		if (!missing) {
			log_debug("wait_messages_done got all %d", total);
			return 1;
		}

		log_debug("wait_messages_done need %d of %d", missing, total);
		return 0;
	}

	/* Retire the change that just completed. The first change on
	   fd->changes replaces fd->started_change (the previous one is freed),
	   fd->started_count is advanced, and every change still queued on
	   fd->changes is freed. */
	static void cleanup_changes(struct fd *fd)
	{
		struct change *cur = list_first_entry(&fd->changes, struct change, list);
		struct change *tmp;

		list_del(&cur->list);
		if (fd->started_change)
			free_cg(fd->started_change);
		fd->started_change = cur;

		/* zero started_count means "never started", so skip over it */
		if (++fd->started_count == 0)
			fd->started_count = 1;

		list_for_each_entry_safe(cur, tmp, &fd->changes, list) {
			list_del(&cur->list);
			free_cg(cur);
		}
	}

	/* Choose fd->master from the members of the first change on
	   fd->changes: the lowest nodeid among members whose start message
	   carried FD_MFLG_COMPLETE, or, when no member has that flag, the
	   lowest nodeid overall. A value of 0 is used as "none seen yet", so
	   nodeids are expected to be non-zero. */
	static void set_master(struct fd *fd)
	{
		struct change *cg = list_first_entry(&fd->changes, struct change, list);
		struct member *memb;
		int low = 0, complete = 0;

		list_for_each_entry(memb, &cg->members, list) {
			if (!low || memb->nodeid < low)
				low = memb->nodeid;

			if (!(memb->start_flags & FD_MFLG_COMPLETE))
				continue;

			if (!complete || memb->nodeid < complete)
				complete = memb->nodeid;
		}

		log_debug("set_master from %d to %s node %d", fd->master,
			  complete ? "complete" : "low",
			  complete ? complete : low);

		fd->master = complete ? complete : low;
	}

	/* Find the id_info record for nodeid in a message's record array.

	   ids:    first record
	   count:  number of records
	   size:   byte stride between records (the sender's id_info_size)
	   nodeid: node to look for

	   Returns the matching record, or NULL if none has that nodeid. */
	static struct id_info *get_id_struct(struct id_info *ids, int count, int size,
					     int nodeid)
	{
		struct id_info *id = ids;
		int i;

		for (i = 0; i < count; i++) {
			if (id->nodeid == nodeid)
				return id;
			id = (struct id_info *)((char *)id + size);
		}
		return NULL;
	}

	/* do the change details in the message match the details of the given change */

	/* Returns 1 when the start/complete message (hd, fi, ids) describes
	   change cg, 0 otherwise. The checks, in order: we are flagged as a
	   member in the message, the sender is a member of cg, the four
	   counts agree, and every node flagged as a member in the message is
	   on cg->members. fd is not used in the body shown here.
	   NOTE(review): cg, fd, fi and ids are used as pointers below; the
	   '*' declarators and casts look stripped by the rendering of this
	   diff -- confirm against the repository. */
	static int match_change(struct fd fd, struct change cg, struct fd_header *hd,
	struct fd_info fi, struct id_info ids)
	{
	struct id_info *id;
	struct member *memb;
	uint32_t seq = hd->msgdata;
	int i, members_mismatch;

	/* We can ignore messages if we're not in the list of members.
	   The one known time this will happen is after we've joined
	   the cpg, we can get messages for changes prior to the change
	   in which we're added. */

	id = get_id_struct(ids, fi->id_info_count, fi->id_info_size,our_nodeid);

	if (!id \|\| !(id->flags & IDI_NODEID_IS_MEMBER)) {
	- log_debug("match_change fail %d:%u we are not in members",
	- hd->nodeid, seq);
	+ log_debug("match_change %d:%u skip cg %u we are not in members",
	+ hd->nodeid, seq, cg->seq);
	return 0;
	}

	memb = find_memb(cg, hd->nodeid);
	if (!memb) {
	- log_debug("match_change fail %d:%u sender not member",
	- hd->nodeid, seq);
	+ log_debug("match_change %d:%u skip cg %u sender not member",
	+ hd->nodeid, seq, cg->seq);
	return 0;
	}

	/* verify this is the right change by matching the counts
	   and the nodeids of the current members */

	if (fi->member_count != cg->member_count \|\|
	fi->joined_count != cg->joined_count \|\|
	fi->remove_count != cg->remove_count \|\|
	fi->failed_count != cg->failed_count) {
	- log_debug("match_change fail %d:%u expect counts "
	- "%d %d %d %d", hd->nodeid, seq,
	+ log_debug("match_change %d:%u skip cg %u expect counts "
	+ "%d %d %d %d", hd->nodeid, seq, cg->seq,
	cg->member_count, cg->joined_count,
	cg->remove_count, cg->failed_count);
	return 0;
	}

	members_mismatch = 0;
	id = ids;

	/* every node the message flags as a member must be on cg->members */
	for (i = 0; i < fi->id_info_count; i++) {
	if (id->flags & IDI_NODEID_IS_MEMBER) {
	memb = find_memb(cg, id->nodeid);
	if (!memb) {
	- log_debug("match_change fail %d:%u memb %d",
	- hd->nodeid, seq, id->nodeid);
	+ log_debug("match_change %d:%u skip cg %u "
	+ "no memb %d", hd->nodeid, seq,
	+ cg->seq, id->nodeid);
	members_mismatch = 1;
	break;
	}
	}
	id = (struct id_info )((char )id + fi->id_info_size);
	}

	if (members_mismatch)
	return 0;

	- log_debug("match_change done %d:%u", hd->nodeid, seq);
	+ log_debug("match_change %d:%u matches cg %u", hd->nodeid, seq, cg->seq);
	return 1;
	}

	/* Unfortunately, there's no really simple way to match a message with the
	specific change that it was sent for. We hope that by passing all the
	details of the change in the message, we will be able to uniquely match the
	it to the correct change. */

	/* A start message will usually be for the first (current) change on our list.
	In some cases it will be for a non-current change, and we can ignore it:

	1. A,B,C get confchg1 adding C
	2. C sends start for confchg1
	3. A,B,C get confchg2 adding D
	4. A,B,C,D recv start from C for confchg1 - ignored
	5. C,D send start for confchg2
	6. A,B send start for confchg2
	7. A,B,C,D recv all start messages for confchg2, and start kernel

	In step 4, how do the nodes know whether the start message from C is
	for confchg1 or confchg2? Hopefully by comparing the counts and members. */

	static struct change find_change(struct fd fd, struct fd_header *hd,
	struct fd_info fi, struct id_info ids)
	{
	struct change *cg;

	list_for_each_entry_reverse(cg, &fd->changes, list) {
	if (!match_change(fd, cg, hd, fi, ids))
	continue;
	return cg;
	}

	log_debug("find_change %d:%u no match", hd->nodeid, hd->msgdata);
	return NULL;
	}

	/* Return 1 if any change queued on fd->changes lists nodeid as a
	   member added by that change, else 0. */
	static int is_added(struct fd *fd, int nodeid)
	{
		struct change *cur;
		struct member *m;
		int result = 0;

		list_for_each_entry(cur, &fd->changes, list) {
			m = find_memb(cur, nodeid);
			if (m && m->added) {
				result = 1;
				break;
			}
		}
		return result;
	}

	/* Handle an FD_MSG_START message: convert the body to host order,
	   find the queued change it belongs to, and mark the sender as
	   started in that change (or as disallowed, see below). Messages that
	   match no queued change are dropped.
	   NOTE(review): fd and hd are used with "->" and the casts below lack
	   '*'; the pointer declarators look stripped by the rendering of this
	   diff -- confirm against the repository. */
	static void receive_start(struct fd fd, struct fd_header hd, int len)
	{
	struct change *cg;
	struct member *memb;
	struct fd_info *fi;
	struct id_info *ids;
	uint32_t seq = hd->msgdata;
	int added;

	log_debug("receive_start %d:%u len %d", hd->nodeid, seq, len);

	/* body layout: fd_header, fd_info, then the id_info records */
	fi = (struct fd_info )((char )hd + sizeof(struct fd_header));
	ids = (struct id_info )((char )fi + sizeof(struct fd_info));

	fd_info_in(fi);
	ids_in(fi, ids);

	cg = find_change(fd, hd, fi, ids);
	if (!cg)
	return;

	memb = find_memb(cg, hd->nodeid);
	if (!memb) {
	/* this should never happen since match_change checks it */
	log_error("receive_start no member %d", hd->nodeid);
	return;
	}

	memb->start_flags = hd->flags;

	added = is_added(fd, hd->nodeid);

	if (added && fi->started_count) {
	log_error("receive_start %d:%u add node with started_count %u",
	hd->nodeid, seq, fi->started_count);

	- /* observe this scheme working before using it; I'm not sure
	- that a joining node won't ever see an existing node as added
	- under normal circumstances */
	- /*
	- memb->disallowed = 1;
	- return;
	- */
	+ /* This is how we deal with cpg's that are partitioned and
	+ then merge back together. When the merge happens, the
	+ cpg on each side will see nodes from the other side being
	+ added, and neither side will have zero started_count. So,
	+ both sides will ignore start messages from the other side.
	+ This causes the domain on each side to continue waiting
	+ for the missing start messages indefinitely. To unblock
	+ things, all nodes from one side of the former partition
	+ need to fail. */
	+
	+ /* This method of detecting a merge of a partitioned cpg
	+ assumes a joining node won't ever see an existing node
	+ as "added" under normal circumstances. */
	+
	+ memb->disallowed = 1;
	+ return;
	}

	node_history_start(fd, hd->nodeid);
	memb->start = 1;

	/* save any fencing history from this message that we don't have */
	save_history(fd, fi, ids);
	}

	/* Handle an FD_MSG_COMPLETE message. Ignored once fd->init_complete
	   is set. Otherwise, if the message flags us as a member, set
	   init_complete and free every node on fd->victims.
	   NOTE(review): fd, hd, ids, id, node and safe are all used as
	   pointers below; the '*' declarators and casts look stripped by the
	   rendering of this diff -- confirm against the repository. */
	static void receive_complete(struct fd fd, struct fd_header hd, int len)
	{
	struct fd_info *fi;
	struct id_info ids, id;
	uint32_t seq = hd->msgdata;
	struct node node, safe;

	log_debug("receive_complete %d:%u len %d", hd->nodeid, seq, len);

	if (fd->init_complete)
	return;

	/* same body layout as a start message */
	fi = (struct fd_info )((char )hd + sizeof(struct fd_header));
	ids = (struct id_info )((char )fi + sizeof(struct fd_info));

	fd_info_in(fi);
	ids_in(fi, ids);

	id = get_id_struct(ids, fi->id_info_count, fi->id_info_size,our_nodeid);

	if (!id \|\| !(id->flags & IDI_NODEID_IS_MEMBER)) {
	log_debug("receive_complete %d:%u we are not in members",
	hd->nodeid, seq);
	return;
	}

	fd->init_complete = 1;

	/* we may have victims from init which we can clear now */
	list_for_each_entry_safe(node, safe, &fd->victims, list) {
	- log_debug("receive_complete clear victim %d init %d",
	+ log_debug("receive_complete clear victim nodeid %d init %d",
	node->nodeid, node->init_victim);
	list_del(&node->list);
	free(node);
	}
	}

	static int count_ids(struct fd *fd)
	{
	struct node_history *node;
	int count = 0;

	list_for_each_entry(node, &fd->node_history, list)
	count++;

	return count;
	}

	/* Build and broadcast a start or complete message (type is
	   FD_MSG_START or FD_MSG_COMPLETE) describing the first change on
	   fd->changes. Body: fd_header, fd_info, then one id_info per
	   node_history entry. On allocation failure nothing is sent. */
	static void send_info(struct fd *fd, int type)
	{
	struct change *cg;
	struct fd_header *hd;
	struct fd_info *fi;
	struct id_info *id;
	struct node_history *node;
	char *buf;
	uint32_t flags;
	int len, id_count;

	cg = list_first_entry(&fd->changes, struct change, list);

	id_count = count_ids(fd);

	len = sizeof(struct fd_header) + sizeof(struct fd_info) +
	id_count * sizeof(struct id_info);

	buf = malloc(len);
	if (!buf) {
	log_error("send_info len %d no mem", len);
	return;
	}
	memset(buf, 0, len);

	/* NOTE(review): as rendered, sizeof(hd) is the size of a pointer and
	   the two casts below lack '*'. This looks like extraction damage to
	   "(struct fd_info *)(buf + sizeof(*hd))" and the id_info equivalent
	   -- confirm against the repository. */
	hd = (struct fd_header *)buf;
	fi = (struct fd_info )(buf + sizeof(hd));
	id = (struct id_info )(buf + sizeof(hd) + sizeof(*fi));

	/* fill in header (fd_send_message handles part of header) */

	hd->type = type;
	hd->msgdata = cg->seq;
	if (cg->we_joined)
	hd->flags \|= FD_MFLG_JOINING;
	if (fd->init_complete)
	hd->flags \|= FD_MFLG_COMPLETE;

	/* fill in fd_info */

	fi->fd_info_size = cpu_to_le32(sizeof(struct fd_info));
	fi->id_info_size = cpu_to_le32(sizeof(struct id_info));
	fi->id_info_count = cpu_to_le32(id_count);
	fi->started_count = cpu_to_le32(fd->started_count);
	fi->member_count = cpu_to_le32(cg->member_count);
	fi->joined_count = cpu_to_le32(cg->joined_count);
	fi->remove_count = cpu_to_le32(cg->remove_count);
	fi->failed_count = cpu_to_le32(cg->failed_count);

	/* fill in id_info entries */

	list_for_each_entry(node, &fd->node_history, list) {
	flags = 0;
	if (find_memb(cg, node->nodeid))
	flags = IDI_NODEID_IS_MEMBER;

	id->flags = cpu_to_le32(flags);
	id->nodeid = cpu_to_le32(node->nodeid);
	id->fence_external_node= cpu_to_le32(node->fence_external_node);
	id->fence_master = cpu_to_le32(node->fence_master);
	id->fence_how = cpu_to_le32(node->fence_how);
	id->fence_time = cpu_to_le64(node->fence_time);
	id->fence_external_time= cpu_to_le64(node->fence_external_time);
	id++;
	}

	- log_debug("send_%s %u flags %x counts %u %d %d %d %d",
	+ log_debug("send_%s cg %u flags %x counts %u %d %d %d %d",
	type == FD_MSG_START ? "start" : "complete",
	cg->seq, hd->flags, fd->started_count, cg->member_count,
	cg->joined_count, cg->remove_count, cg->failed_count);

	fd_send_message(fd, buf, len);

	free(buf);
	}

	/* Broadcast our start message for the current change; thin wrapper
	   around send_info() with type FD_MSG_START. */
	static void send_start(struct fd *fd)
	{
		const int type = FD_MSG_START;

		send_info(fd, type);
	}

	/* A complete message has the same content as a start message. A new
	   (incomplete) node watches for a complete message that lists it as a
	   member; on seeing one it can clear any init_victims and set
	   init_complete for future cycles. */

	static void send_complete(struct fd *fd)
	{
		const int type = FD_MSG_COMPLETE;

		send_info(fd, type);
	}

	/* FIXME: better to just look in victims list for any nodes with init_victim? */

	/* Return 1 if any change queued on fd->changes has a non-zero
	   joined_count, else 0. */
	static int nodes_added(struct fd *fd)
	{
		struct change *cur;
		int any = 0;

		list_for_each_entry(cur, &fd->changes, list) {
			if (cur->joined_count) {
				any = 1;
				break;
			}
		}
		return any;
	}

	/* If we're being added by the current change, we'll have an empty victims
	   list, while other previous members may already have nodes in their
	   victims list. So, we need to assume that any node in cluster.conf that's
	   not a cluster member when we're added to the fd is already a victim.
	   We can go back on that assumption, and clear out any presumed victims, when
	   we see a message from a previous member saying that there are no current
	   victims. */

	/* Add every failed member on cg->removed to fd->victims, skipping
	   nodes that are already victims. Stops at the first node for which
	   get_new_node() returns NULL.
	   NOTE(review): fd and cg are used with "->" below, so the parameters
	   are presumably pointers whose '*' was stripped by the rendering of
	   this diff -- confirm against the repository. */
	static void add_victims(struct fd fd, struct change cg)
	{
	struct member *memb;
	struct node *node;

	list_for_each_entry(memb, &cg->removed, list) {
	if (!memb->failed)
	continue;
	+ if (is_victim(fd, memb->nodeid)) {
	+ /* Only one scenario I know of where this happens:
	+ when a partitioned cpg merges and then the
	+ disallowed node is killed. The original
	+ partition makes the node a victim, and killing
	+ it after a merge will find it already a victim. */
	+ log_debug("add_victims nodeid %d already victim",
	+ memb->nodeid);
	+ continue;
	+ }
	node = get_new_node(fd, memb->nodeid);
	if (!node)
	return;
	list_add(&node->list, &fd->victims);
	- log_debug("add node %d to victims", node->nodeid);
	+ log_debug("add nodeid %d to victims", node->nodeid);
	}
	}

	/* Once start messages from all members are in, a master is picked to
	   do the fencing (the low nodeid with state "COMPLETE"). Each time the
	   master successfully fences a victim it sends a status message so
	   that all members remove that node from their victims list.

	   When all victims following a change (or set of changes) have been
	   dealt with, the master sends a complete message naming the members
	   of the group for the change it finished processing. A joining node
	   that sees itself as a member in that complete message knows it can
	   clear all init_victims from startup init, and sets init_complete so
	   that it volunteers as master in the next round by setting the
	   COMPLETE flag.

	   Once the master begins fencing victims it does not process new
	   changes until it is done. Non-master members keep processing changes
	   while the master is fencing, but wait for the master to catch up in
	   WAIT_MESSAGES. If the master fails, the others no longer wait for
	   it. */

	/* Advance the state machine of the first change queued on fd, if any. */
	static void apply_changes(struct fd *fd)
	{
		struct change *cg;

		if (list_empty(&fd->changes))
			return;
		cg = list_first_entry(&fd->changes, struct change, list);

		if (cg->state == CGST_WAIT_CONDITIONS) {
			if (!wait_conditions_done(fd))
				return;

			send_start(fd);
			cg->state = CGST_WAIT_MESSAGES;
		} else if (cg->state == CGST_WAIT_MESSAGES) {
			if (!wait_messages_done(fd))
				return;

			set_master(fd);
			if (fd->master != our_nodeid) {
				defer_fencing(fd);
			} else {
				delay_fencing(fd, nodes_added(fd));
				fence_victims(fd);
				send_complete(fd);
			}

			cleanup_changes(fd);
			fd->joining_group = 0;
		} else {
			log_error("apply_changes invalid state %d", cg->state);
		}
	}

	/* Run apply_changes() on every fence domain that has queued changes.
	   The safe list iterator is used, so the current domain may be
	   unlinked while it is being processed. */
	void process_fd_changes(void)
	{
		struct fd *fd, *safe;

		list_for_each_entry_safe(fd, safe, &domains, list) {
			if (!list_empty(&fd->changes))
				apply_changes(fd);
		}
	}

	/* Turn one cpg configuration change into a struct change and push it
	   onto the front of fd->changes.

	   member_list/left_list/joined_list (with their entry counts) are the
	   lists delivered to confchg_cb(). On success the new change is
	   stored in *cg_out and 0 is returned. On failure the partly built
	   change is freed and -ENOMEM or -ENOENT is returned; *cg_out is not
	   written.

	   Side effects: node_history entries are updated for removed nodes
	   and created for joined nodes, and a node that left with
	   CPG_REASON_PROCDOWN is kicked from the cluster. */
	static int add_change(struct fd *fd,
	struct cpg_address *member_list, int member_list_entries,
	struct cpg_address *left_list, int left_list_entries,
	struct cpg_address *joined_list, int joined_list_entries,
	struct change **cg_out)
	{
	struct change *cg;
	struct member *memb;
	int i, error;

	cg = malloc(sizeof(struct change));
	if (!cg)
	/* NOTE(review): this path reaches free_cg(cg) with cg == NULL, and
	   free_cg() dereferences cg */
	goto fail_nomem;
	memset(cg, 0, sizeof(struct change));
	INIT_LIST_HEAD(&cg->members);
	INIT_LIST_HEAD(&cg->removed);
	cg->seq = ++fd->change_seq;
	cg->state = CGST_WAIT_CONDITIONS;

	cg->member_count = member_list_entries;
	cg->joined_count = joined_list_entries;
	cg->remove_count = left_list_entries;

	/* one member entry per current cpg member */
	for (i = 0; i < member_list_entries; i++) {
	memb = malloc(sizeof(struct member));
	if (!memb)
	goto fail_nomem;
	memset(memb, 0, sizeof(struct member));
	memb->nodeid = member_list[i].nodeid;
	list_add_tail(&memb->list, &cg->members);
	}

	/* one entry per departed node; node or process death counts as failed */
	for (i = 0; i < left_list_entries; i++) {
	memb = malloc(sizeof(struct member));
	if (!memb)
	goto fail_nomem;
	memset(memb, 0, sizeof(struct member));
	memb->nodeid = left_list[i].nodeid;
	if (left_list[i].reason == CPG_REASON_NODEDOWN \|\|
	left_list[i].reason == CPG_REASON_PROCDOWN) {
	memb->failed = 1;
	cg->failed_count++;
	}
	list_add_tail(&memb->list, &cg->removed);

	if (memb->failed)
	node_history_fail(fd, memb->nodeid);
	else
	node_history_left(fd, memb->nodeid);

	- log_debug("add_change %u nodeid %d remove reason %d",
	+ log_debug("add_change cg %u remove nodeid %d reason %d",
	cg->seq, memb->nodeid, left_list[i].reason);

	if (left_list[i].reason == CPG_REASON_PROCDOWN)
	kick_node_from_cluster(memb->nodeid);
	}

	/* flag the members that joined in this change */
	for (i = 0; i < joined_list_entries; i++) {
	memb = find_memb(cg, joined_list[i].nodeid);
	if (!memb) {
	log_error("no member %d", joined_list[i].nodeid);
	error = -ENOENT;
	goto fail;
	}
	memb->added = 1;

	if (memb->nodeid == our_nodeid)
	cg->we_joined = 1;
	else
	node_history_init(fd, memb->nodeid);

	- log_debug("add_change %u nodeid %d joined", cg->seq,
	+ log_debug("add_change cg %u joined nodeid %d", cg->seq,
	memb->nodeid);
	}

	/* when we are the joiner, start a history entry for every member */
	if (cg->we_joined)
	list_for_each_entry(memb, &cg->members, list)
	node_history_init(fd, memb->nodeid);

	- log_debug("add_change %u member %d joined %d remove %d failed %d",
	- cg->seq, cg->member_count, cg->joined_count, cg->remove_count,
	- cg->failed_count);
	+ log_debug("add_change cg %u counts member %d joined %d remove %d "
	+ "failed %d", cg->seq, cg->member_count, cg->joined_count,
	+ cg->remove_count, cg->failed_count);

	list_add(&cg->list, &fd->changes);
	*cg_out = cg;
	return 0;

	fail_nomem:
	log_error("no memory");
	error = -ENOMEM;
	fail:
	free_cg(cg);
	return error;
	}

	/* add a victim for each node in complete list (represents all nodes in
	   cluster.conf) that is not a cman member (and not already a victim) */

	/* Empties fd->complete: each node is unlinked and then either moved
	   to fd->victims with init_victim set (not a cman member, not on
	   cg->members, not already a victim) or freed.
	   NOTE(review): fd, cg, node and safe are used as pointers below; the
	   '*' declarators look stripped by the rendering of this diff --
	   confirm against the repository. */
	static void add_victims_init(struct fd fd, struct change cg)
	{
	struct node node, safe;

	list_for_each_entry_safe(node, safe, &fd->complete, list) {
	list_del(&node->list);

	if (!is_cman_member(node->nodeid) &&
	!find_memb(cg, node->nodeid) &&
	!is_victim(fd, node->nodeid)) {
	node->init_victim = 1;
	list_add(&node->list, &fd->victims);
	- log_debug("add_victims_init %d", node->nodeid);
	+ log_debug("add_victims_init nodeid %d", node->nodeid);
	} else {
	free(node);
	}
	}
	}

	/* Return 1 if our_nodeid appears among the first left_list_entries
	   entries of left_list, else 0. */
	static int we_left(struct cpg_address *left_list, int left_list_entries)
	{
		int idx = 0;

		while (idx < left_list_entries) {
			if (left_list[idx].nodeid == our_nodeid)
				return 1;
			idx++;
		}
		return 0;
	}

	/* cpg configuration-change callback for a fence domain.

	   If we are leaving and this change removes us, the domain is torn
	   down. Otherwise the change is queued with add_change() and the
	   victims list is updated from it. A change for a handle with no
	   fence domain, or one add_change() rejects, is dropped. */
	static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
			       struct cpg_address *member_list, int member_list_entries,
			       struct cpg_address *left_list, int left_list_entries,
			       struct cpg_address *joined_list, int joined_list_entries)
	{
		struct change *cg;
		struct fd *fd = find_fd_handle(handle);

		if (!fd) {
			log_error("confchg_cb no fence domain for cpg %s",
				  group_name->value);
			return;
		}

		if (fd->leaving_group && we_left(left_list, left_list_entries)) {
			/* we called cpg_leave(), and this should be the final
			   cpg callback we receive */
			log_debug("confchg for our leave");
			cpg_finalize(fd->cpg_handle);
			client_dead(fd->cpg_client);
			list_del(&fd->list);
			free_fd(fd);
			return;
		}

		if (add_change(fd, member_list, member_list_entries,
			       left_list, left_list_entries,
			       joined_list, joined_list_entries, &cg))
			return;

		/* failed nodes in this change become victims */
		add_victims(fd, cg);

		/* As a joining domain member with no previous state, we need to
		   assume non-member nodes are already victims; these initial
		   victims are cleared if we get a "complete" message from the
		   master. But, if we're the master, we do end up fencing these
		   init nodes. */
		if (cg->we_joined)
			add_victims_init(fd, cg);
	}

	/* cpg message-delivery callback for a fence domain cpg.
	   Converts the fd_header at the start of data from little-endian to
	   host order in place, rejects messages whose major protocol version
	   or sender nodeid do not match, and dispatches on the message type.

	   handle:     cpg handle the message arrived on (used to find the domain)
	   group_name: cpg name, only used for logging
	   nodeid:     sender as reported by cpg
	   pid:        unused
	   data, len:  message buffer and its length in bytes */
	static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
			       uint32_t nodeid, uint32_t pid, void *data, int len)
	{
		struct fd *fd;
		struct fd_header *hd;

		fd = find_fd_handle(handle);
		if (!fd) {
			log_error("deliver_cb no fd for cpg %s", group_name->value);
			return;
		}

		/* never touch header fields of a message too short to hold them */
		if (len < (int)sizeof(struct fd_header)) {
			log_error("deliver_cb short message len %d from %u",
				  len, nodeid);
			return;
		}

		hd = (struct fd_header *)data;

		hd->version[0] = le16_to_cpu(hd->version[0]);
		hd->version[1] = le16_to_cpu(hd->version[1]);
		hd->version[2] = le16_to_cpu(hd->version[2]);
		hd->type = le16_to_cpu(hd->type);
		hd->nodeid = le32_to_cpu(hd->nodeid);
		hd->to_nodeid = le32_to_cpu(hd->to_nodeid);
		hd->global_id = le32_to_cpu(hd->global_id);
		hd->flags = le32_to_cpu(hd->flags);
		hd->msgdata = le32_to_cpu(hd->msgdata);

		/* only the major version has to agree */
		if (hd->version[0] != protocol_active[0]) {
			log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
				  nodeid, hd->version[0], hd->version[1],
				  hd->version[2], protocol_active[0],
				  protocol_active[1], protocol_active[2]);
			return;
		}

		/* the sender named in the header must be the node cpg says sent it */
		if (hd->nodeid != nodeid) {
			log_error("bad msg nodeid %d %d", hd->nodeid, nodeid);
			return;
		}

		switch (hd->type) {
		case FD_MSG_START:
			receive_start(fd, hd, len);
			break;
		case FD_MSG_VICTIM_DONE:
			receive_victim_done(fd, hd, len);
			break;
		case FD_MSG_COMPLETE:
			receive_complete(fd, hd, len);
			break;
		case FD_MSG_EXTERNAL:
			receive_external(fd, hd, len);
			break;
		default:
			log_error("unknown msg type %d", hd->type);
		}
	}

	/* callback table passed to cpg_initialize() by fd_join() for a fence
	domain cpg */
	static cpg_callbacks_t cpg_callbacks = {
	.cpg_deliver_fn = deliver_cb,
	.cpg_confchg_fn = confchg_cb,
	};

	/* Work function for a fence domain's cpg client slot: dispatch all
	   pending cpg callbacks for the domain owning client index ci, then
	   apply the resulting changes. */
	static void process_fd_cpg(int ci)
	{
		cpg_error_t result;
		struct fd *domain = find_fd_ci(ci);

		if (!domain) {
			log_error("process_fd_cpg no fence domain for ci %d", ci);
			return;
		}

		result = cpg_dispatch(domain->cpg_handle, CPG_DISPATCH_ALL);
		if (result == CPG_OK) {
			apply_changes(domain);
			return;
		}

		log_error("cpg_dispatch error %d", result);
	}

	int fd_join(struct fd *fd)
	{
	cpg_error_t error;
	cpg_handle_t h;
	struct cpg_name name;
	int i = 0, f, ci;

	error = cpg_initialize(&h, &cpg_callbacks);
	if (error != CPG_OK) {
	log_error("cpg_initialize error %d", error);
	goto fail_free;
	}

	cpg_fd_get(h, &f);

	ci = client_add(f, process_fd_cpg, NULL);

	list_add(&fd->list, &domains);
	fd->cpg_handle = h;
	fd->cpg_client = ci;
	fd->cpg_fd = f;
	fd->joining_group = 1;

	memset(&name, 0, sizeof(name));
	sprintf(name.value, "fenced:%s", fd->name);
	name.length = strlen(name.value) + 1;

	retry:
	error = cpg_join(h, &name);
	if (error == CPG_ERR_TRY_AGAIN) {
	sleep(1);
	if (!(++i % 10))
	log_error("cpg_join error retrying");
	goto retry;
	}
	if (error != CPG_OK) {
	log_error("cpg_join error %d", error);
	goto fail;
	}

	return 0;

	fail:
	list_del(&fd->list);
	client_dead(ci);
	cpg_finalize(h);
	fail_free:
	free(fd);
	return error;
	}

	int fd_leave(struct fd *fd)
	{
	cpg_error_t error;
	struct cpg_name name;
	int i = 0;

	fd->leaving_group = 1;

	memset(&name, 0, sizeof(name));
	sprintf(name.value, "fenced:%s", fd->name);
	name.length = strlen(name.value) + 1;

	retry:
	error = cpg_leave(fd->cpg_handle, &name);
	if (error == CPG_ERR_TRY_AGAIN) {
	sleep(1);
	if (!(++i % 10))
	log_error("cpg_leave error retrying");
	goto retry;
	}
	if (error != CPG_OK)
	log_error("cpg_leave error %d", error);

	return 0;
	}

	/* process_cpg(), setup_cpg(), close_cpg() are for the "daemon" cpg which
	tracks the presence of other daemons; it's not the fenced domain cpg.
	Joining this cpg tells others that we don't have uncontrolled dlm/gfs
	kernel state and they can skip fencing us if we're a victim. (We have
	to check for that uncontrolled state before calling setup_cpg, obviously.) */

	/* delivery callback for the "daemon" cpg; messages delivered on this cpg
	are ignored */
	static void deliver_cb_daemon(cpg_handle_t handle, struct cpg_name *group_name,
	uint32_t nodeid, uint32_t pid, void *data, int len)
	{
	}

	/* Configuration-change callback for the "daemon" cpg: replace the cached
	   daemon_member_list with the new member list.  The cache holds at most
	   MAX_NODES entries, so a longer list is truncated (and logged) rather
	   than being copied past the end of the array.  The left and joined
	   lists are not used. */
	static void confchg_cb_daemon(cpg_handle_t handle, struct cpg_name *group_name,
			struct cpg_address *member_list, int member_list_entries,
			struct cpg_address *left_list, int left_list_entries,
			struct cpg_address *joined_list, int joined_list_entries)
	{
		int entries = member_list_entries;

		if (entries < 0)
			entries = 0;
		if (entries > MAX_NODES) {
			log_error("daemon confchg member count %d exceeds %d",
				  member_list_entries, MAX_NODES);
			entries = MAX_NODES;
		}

		memset(&daemon_member_list, 0, sizeof(daemon_member_list));
		if (entries)
			memcpy(&daemon_member_list, member_list,
			       entries * sizeof(struct cpg_address));
		daemon_member_list_entries = entries;
	}

	/* callback table passed to cpg_initialize() by setup_cpg() for the
	"daemon" cpg */
	static cpg_callbacks_t cpg_callbacks_daemon = {
	.cpg_deliver_fn = deliver_cb_daemon,
	.cpg_confchg_fn = confchg_cb_daemon,
	};

	void process_cpg(int ci)
	{
	cpg_error_t error;

	error = cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL);
	if (error != CPG_OK)
	log_error("daemon cpg_dispatch error %d", error);
	}

	/* Return 1 if nodeid is in the cached "daemon" cpg member list, else 0.
	   Pending daemon cpg callbacks are dispatched first, so the cache is
	   refreshed before it is searched. */
	int in_daemon_member_list(int nodeid)
	{
		int idx = 0;

		cpg_dispatch(cpg_handle_daemon, CPG_DISPATCH_ALL);

		while (idx < daemon_member_list_entries) {
			if (daemon_member_list[idx].nodeid == nodeid)
				return 1;
			idx++;
		}
		return 0;
	}

	int setup_cpg(void)
	{
	cpg_error_t error;
	cpg_handle_t h;
	struct cpg_name name;
	int i = 0, f;

	error = cpg_initialize(&h, &cpg_callbacks_daemon);
	if (error != CPG_OK) {
	log_error("daemon cpg_initialize error %d", error);
	goto fail;
	}

	cpg_fd_get(h, &f);

	cpg_handle_daemon = h;

	memset(&name, 0, sizeof(name));
	sprintf(name.value, "fenced:daemon");
	name.length = strlen(name.value) + 1;

	retry:
	error = cpg_join(h, &name);
	if (error == CPG_ERR_TRY_AGAIN) {
	sleep(1);
	if (!(++i % 10))
	log_error("daemon cpg_join error retrying");
	goto retry;
	}
	if (error != CPG_OK) {
	log_error("daemon cpg_join error %d", error);
	goto fail;
	}

	log_debug("setup_cpg %d", f);
	return f;

	fail:
	cpg_finalize(h);
	return -1;
	}

	void close_cpg(void)
	{
	cpg_error_t error;
	struct cpg_name name;
	int i = 0;

	if (!cpg_handle_daemon)
	return;

	memset(&name, 0, sizeof(name));
	sprintf(name.value, "fenced:daemon");
	name.length = strlen(name.value) + 1;

	retry:
	error = cpg_leave(cpg_handle_daemon, &name);
	if (error == CPG_ERR_TRY_AGAIN) {
	sleep(1);
	if (!(++i % 10))
	log_error("daemon cpg_leave error retrying");
	goto retry;
	}
	if (error != CPG_OK)
	log_error("daemon cpg_leave error %d", error);
	}

	int set_node_info(struct fd fd, int nodeid, struct fenced_node nodeinfo)
	{
	struct node_history *node;
	struct member *memb;

	nodeinfo->nodeid = nodeid;
	nodeinfo->victim = is_victim(fd, nodeid);

	if (!fd->started_change)
	goto history;

	memb = find_memb(fd->started_change, nodeid);
	if (memb)
	nodeinfo->member = memb->disallowed ? 0 : 1;

	history:
	node = get_node_history(fd, nodeid);
	if (!node)
	return 0;

	nodeinfo->last_fenced_master = node->fence_master;
	nodeinfo->last_fenced_how = node->fence_how;
	nodeinfo->last_fenced_time = node->fence_time;

	return 0;
	}

	int set_domain_info(struct fd fd, struct fenced_domain domain)
	{
	struct change *cg = fd->started_change;

	if (cg) {
	domain->member_count = cg->member_count;
	domain->state = cg->state;
	}
	domain->master_nodeid = fd->master;
	domain->victim_count = list_count(&fd->victims);
	domain->current_victim = fd->current_victim;

	return 0;
	}

	int set_domain_nodes(struct fd fd, int option, int node_count,
	struct fenced_node **nodes_out)
	{
	struct change *cg = fd->started_change;
	struct fenced_node nodes = NULL, n;
	struct node_history *nh;
	struct member *memb;
	int count = 0;

	if (option == FENCED_NODES_MEMBERS) {
	if (!cg)
	goto out;
	count = cg->member_count;

	nodes = malloc(count * sizeof(struct fenced_node));
	if (!nodes)
	return -ENOMEM;
	memset(nodes, 0, sizeof(*nodes));

	n = nodes;
	list_for_each_entry(memb, &cg->members, list)
	set_node_info(fd, memb->nodeid, n++);
	}

	else if (option == FENCED_NODES_ALL) {
	list_for_each_entry(nh, &fd->node_history, list)
	count++;

	nodes = malloc(count * sizeof(struct fenced_node));
	if (!nodes)
	return -ENOMEM;
	memset(nodes, 0, sizeof(*nodes));

	n = nodes;
	list_for_each_entry(nh, &fd->node_history, list)
	set_node_info(fd, nh->nodeid, n++);
	}
	out:
	*node_count = count;
	*nodes_out = nodes;
	return 0;
	}

	diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
	index a9dacbd63..a649366af 100644
	--- a/fence/fenced/fd.h
	+++ b/fence/fenced/fd.h
	@@ -1,276 +1,277 @@
	#ifndef __FD_DOT_H__
	#define __FD_DOT_H__

	#include <stdio.h>
	#include <stdlib.h>
	#include <stddef.h>
	#include <signal.h>
	#include <unistd.h>
	#include <fcntl.h>
	#include <errno.h>
	#include <string.h>
	#include <stdint.h>
	#include <time.h>
	#include <sched.h>
	#include <limits.h>
	#include <dirent.h>
	#include <sys/ioctl.h>
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <sys/socket.h>
	#include <sys/un.h>
	#include <sys/poll.h>
	#include <sys/select.h>
	#include <sys/time.h>

	#include <openais/saAis.h>
	#include <corosync/cpg.h>
	#include <corosync/engine/logsys.h>

	#include "list.h"
	#include "linux_endian.h"
	#include "libfence.h"
	#include "libfenced.h"
	#include "fenced.h"

	/* Max name length for a group, pointless since we only ever create the
	"default" group. Regardless, set arbitrary max to match dlm's
	DLM_LOCKSPACE_LEN 64. The libcpg limit is larger at 128; we prefix
	the fence domain name with "fenced:" to create the cpg name. */

	#define MAX_GROUPNAME_LEN 64

	/* Max name length for a node. This should match libcman's
	CMAN_MAX_NODENAME_LEN which is 255. */

	#define MAX_NODENAME_LEN 255

	/* Maximum members of the fence domain, or cluster. Should match
	CPG_MEMBERS_MAX in openais/cpg.h. */

	#define MAX_NODES 128

	/* Max string length printed on a line, for debugging/dump output. */

	#define MAXLINE 256

	/* group_mode */

	#define GROUP_LIBGROUP 2
	#define GROUP_LIBCPG 3

	extern int daemon_debug_opt;
	extern int daemon_quit;
	extern struct list_head domains;
	extern int cman_quorate;
	extern int our_nodeid;
	extern char our_name[MAX_NODENAME_LEN+1];
	extern char daemon_debug_buf[256];
	extern char dump_buf[FENCED_DUMP_SIZE];
	extern int dump_point;
	extern int dump_wrap;
	extern int group_mode;

	extern void daemon_dump_save(void);

	#define log_debug(fmt, args...) \
	do { \
	snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
	daemon_dump_save(); \
	if (daemon_debug_opt) \
	fprintf(stderr, "%s", daemon_debug_buf); \
	if (cfgd_debug_logsys) \
	log_printf(LOG_DEBUG, "%s", daemon_debug_buf); \
	} while (0)

	#define log_error(fmt, args...) \
	do { \
	log_debug(fmt, ##args); \
	log_printf(LOG_ERR, fmt, ##args); \
	} while (0)

	#define log_level(lvl, fmt, args...) \
	do { \
	log_debug(fmt, ##args); \
	log_printf(lvl, fmt, ##args); \
	} while (0)

	#define FD_MSG_START 1
	#define FD_MSG_VICTIM_DONE 2
	#define FD_MSG_COMPLETE 3
	#define FD_MSG_EXTERNAL 4

	#define FD_MFLG_JOINING 1 /* accompanies start, we are joining */
	#define FD_MFLG_COMPLETE 2 /* accompanies start, we have complete info */

	struct fd_header {
	uint16_t version[3];
	uint16_t type; /* FD_MSG_ */
	uint32_t nodeid; /* sender */
	uint32_t to_nodeid; /* recipient, 0 for all */
	uint32_t global_id; /* global unique id for this domain */
	uint32_t flags; /* FD_MFLG_ */
	uint32_t msgdata; /* in-header payload depends on MSG type */
	uint32_t pad1;
	uint64_t pad2;
	};

	#define CGST_WAIT_CONDITIONS 1
	#define CGST_WAIT_MESSAGES 2

	struct change {
	struct list_head list;
	struct list_head members;
	struct list_head removed; /* nodes removed by this change */
	int member_count;
	int joined_count;
	int remove_count;
	int failed_count;
	int state; /* CGST_ */
	int we_joined;
	uint32_t seq; /* just used as a reference when debugging */
	};

	#define VIC_DONE_AGENT 1
	#define VIC_DONE_MEMBER 2
	#define VIC_DONE_OVERRIDE 3
	#define VIC_DONE_EXTERNAL 4

	struct node_history {
	struct list_head list;
	int nodeid;
	int check_quorum;
	uint64_t add_time;
	uint64_t left_time;
	uint64_t fail_time;
	uint64_t fence_time;
	uint64_t fence_external_time;
	int fence_external_node;
	int fence_master;
	int fence_how; /* VIC_DONE_ */
	};

	struct node {
	struct list_head list;
	int nodeid;
	int init_victim;
	char name[MAX_NODENAME_LEN+1];
	};

	struct fd {
	struct list_head list;
	char name[MAX_GROUPNAME_LEN+1];

	/* libcpg domain membership */

	cpg_handle_t cpg_handle;
	int cpg_client;
	int cpg_fd;
	uint32_t change_seq;
	uint32_t started_count;
	struct change *started_change;
	struct list_head changes;
	struct list_head node_history;
	int init_complete;

	/* general domain membership */

	int master;
	int joining_group;
	int leaving_group;
	int current_victim; /* for queries */
	struct list_head victims;
	struct list_head complete;

	/* libgroup domain membership */

	int last_stop;
	int last_start;
	int last_finish;
	int first_recovery;
	int prev_count;
	struct list_head prev;
	struct list_head leaving;
	};

	/* config.c */

	int setup_ccs(void);
	void close_ccs(void);
	/* pointer declarators restored (they were lost in extraction); the
	   results are returned through name, yes/no and config_val */
	void read_ccs_name(char *path, char *name);
	void read_ccs_yesno(char *path, int *yes, int *no);
	void read_ccs_int(char *path, int *config_val);
	int read_ccs(struct fd *fd);

	/* cpg.c */

	void process_cpg(int ci);
	int setup_cpg(void);
	void close_cpg(void);
	void free_cg(struct change *cg);
	void node_history_fence(struct fd *fd, int victim, int master, int how,
	uint64_t mastertime);
	void send_external(struct fd *fd, int victim);
	int is_fenced_external(struct fd *fd, int nodeid);
	void send_victim_done(struct fd *fd, int victim);
	void process_fd_changes(void);
	int fd_join(struct fd *fd);
	int fd_leave(struct fd *fd);
	/* query helpers: fill caller-provided structs; set_domain_nodes returns
	   a malloc'd array through nodes and its length through node_count */
	int set_node_info(struct fd *fd, int nodeid, struct fenced_node *node);
	int set_domain_info(struct fd *fd, struct fenced_domain *domain);
	int set_domain_nodes(struct fd *fd, int option, int *node_count,
			     struct fenced_node **nodes);
	int in_daemon_member_list(int nodeid);

	/* group.c */

	void process_groupd(int ci);
	int setup_groupd(void);
	void close_groupd(void);
	int fd_join_group(struct fd *fd);
	int fd_leave_group(struct fd *fd);
	/* libgroup-mode counterparts of the cpg.c query helpers, taking the
	   same arguments */
	int set_node_info_group(struct fd *fd, int nodeid, struct fenced_node *node);
	int set_domain_info_group(struct fd *fd, struct fenced_domain *domain);
	int set_domain_nodes_group(struct fd *fd, int option, int *node_count,
				   struct fenced_node **nodes);
	void set_group_mode(void);

	/* main.c */

	void client_dead(int ci);
	/* client_add registers descriptor fd with a work callback and an
	   optional dead callback and returns the client index; find_fd returns
	   the domain with the given name or NULL */
	int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
	void free_fd(struct fd *fd);
	struct fd *find_fd(char *name);
	void query_lock(void);
	void query_unlock(void);
	void cluster_dead(int ci);

	/* member_cman.c */

	void process_cman(int ci);
	int setup_cman(void);
	void close_cman(void);
	int is_cman_member(int nodeid);
	char *nodeid_to_name(int nodeid);
	int name_to_nodeid(char *name);
	/* pointer declarators restored (they were lost in extraction) */
	struct node *get_new_node(struct fd *fd, int nodeid);
	void kick_node_from_cluster(int nodeid);
	+void set_cman_dirty(void);

	/* recover.c */

	void free_node_list(struct list_head *head);
	void add_complete_node(struct fd *fd, int nodeid);
	int list_count(struct list_head *head);
	int is_victim(struct fd *fd, int nodeid);
	void delay_fencing(struct fd *fd, int node_join);
	void defer_fencing(struct fd *fd);
	void fence_victims(struct fd *fd);

	/* logging.c */

	void init_logging(void);
	void setup_logging();
	void close_logging(void);

	#endif /* __FD_DOT_H__ */

	diff --git a/fence/fenced/main.c b/fence/fenced/main.c
	index 01937f1ab..74835c742 100644
	--- a/fence/fenced/main.c
	+++ b/fence/fenced/main.c
	@@ -1,1044 +1,1056 @@
	#include "fd.h"
	#include "config.h"
	#include <pthread.h>
	#include "copyright.cf"

	#define LOCKFILE_NAME "/var/run/fenced.pid"
	#define CLIENT_NALLOC 32

	static int client_maxi;
	static int client_size = 0;
	static struct client *client = NULL;
	static struct pollfd *pollfd = NULL;
	static pthread_t query_thread;
	static pthread_mutex_t query_mutex;
	static struct list_head controlled_entries;

	/* One slot of the client table; client[i] is paired with pollfd[i].
	   fd is -1 while the slot is free.  The callbacks are typed function
	   pointers because ISO C does not define conversion between function
	   pointers and void *. */
	struct client {
		int fd;
		void (*workfn)(int ci);	/* called when fd has POLLIN */
		void (*deadfn)(int ci);	/* called on POLLERR/POLLHUP/POLLNVAL */
	};

	/* Read exactly count bytes from fd into buf, restarting after EINTR
	   and continuing after short reads.
	   Returns 0 once count bytes have been read, -1 on end-of-file or on
	   any other read error. */
	static int do_read(int fd, void *buf, size_t count)
	{
		/* arithmetic on void * is a compiler extension; use char * */
		char *dst = buf;
		size_t off = 0;
		ssize_t rv;

		while (off < count) {
			rv = read(fd, dst + off, count - off);
			if (rv == 0)
				return -1;
			if (rv == -1) {
				if (errno == EINTR)
					continue;
				return -1;
			}
			off += (size_t)rv;
		}
		return 0;
	}

	/* Write exactly count bytes from buf to fd, restarting after EINTR and
	   continuing after short writes.
	   Returns 0 once everything has been written, or the negative value
	   returned by write() on any other error. */
	static int do_write(int fd, void *buf, size_t count)
	{
		/* arithmetic on void * is a compiler extension; use char * */
		const char *src = buf;
		size_t off = 0;
		ssize_t rv;

		for (;;) {
			rv = write(fd, src + off, count);
			if (rv == -1 && errno == EINTR)
				continue;
			if (rv < 0)
				return (int)rv;
			if ((size_t)rv == count)
				return 0;

			/* short write: advance and send the remainder */
			count -= (size_t)rv;
			off += (size_t)rv;
		}
	}

	/* Grow the parallel client[] and pollfd[] arrays by CLIENT_NALLOC slots
	   and mark the new slots free (fd == -1).
	   Returns 0 on success.  On allocation failure returns -1 and leaves
	   client_size and the previously usable slots intact. */
	static int client_alloc(void)
	{
		size_t new_size = (size_t)client_size + CLIENT_NALLOC;
		struct client *new_client;
		struct pollfd *new_pollfd;
		int i;

		/* realloc(NULL, n) behaves like malloc(n), so the first call
		   needs no special case; the result goes to a temporary so the
		   old array is not lost on failure */
		new_client = realloc(client, new_size * sizeof(struct client));
		if (!new_client) {
			log_error("can't alloc for client array");
			return -1;
		}
		client = new_client;

		new_pollfd = realloc(pollfd, new_size * sizeof(struct pollfd));
		if (!new_pollfd) {
			log_error("can't alloc for pollfd");
			return -1;
		}
		pollfd = new_pollfd;

		for (i = client_size; i < client_size + CLIENT_NALLOC; i++) {
			client[i].workfn = NULL;
			client[i].deadfn = NULL;
			client[i].fd = -1;
			pollfd[i].fd = -1;
			pollfd[i].revents = 0;
		}
		client_size += CLIENT_NALLOC;
		return 0;
	}

	/* Close client ci's descriptor and mark its slot free.  An index
	   outside the table (e.g. -1 from a failed client_add) is ignored. */
	void client_dead(int ci)
	{
		if (ci < 0 || ci >= client_size)
			return;

		close(client[ci].fd);
		client[ci].workfn = NULL;
		client[ci].fd = -1;
		pollfd[ci].fd = -1;
	}

	/* Register descriptor fd in the first free client slot, growing the
	   table when it is full.
	   workfn is stored as the slot's work callback; deadfn as its dead
	   callback, with client_dead used when deadfn is NULL.
	   Returns the client index, or -1 if the table cannot be grown. */
	int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci))
	{
		int i;

		for (;;) {
			for (i = 0; i < client_size; i++) {
				if (client[i].fd != -1)
					continue;

				client[i].workfn = workfn;
				if (deadfn)
					client[i].deadfn = deadfn;
				else
					client[i].deadfn = client_dead;
				client[i].fd = fd;
				pollfd[i].fd = fd;
				pollfd[i].events = POLLIN;
				if (i > client_maxi)
					client_maxi = i;
				return i;
			}

			/* no free slot (or no table yet): grow and rescan */
			if (client_alloc() < 0)
				return -1;
		}
	}

	/* signal handler: only sets the daemon_quit flag; the signal number is
	not used */
	static void sigterm_handler(int sig)
	{
	daemon_quit = 1;
	}

	/* Allocate and initialize a fence domain named name.
	   All fields are zeroed and every list head is initialized empty.
	   Returns NULL if name is longer than MAX_GROUPNAME_LEN or if memory
	   cannot be allocated. */
	static struct fd *create_fd(char *name)
	{
		struct fd *fd;

		/* fd->name holds MAX_GROUPNAME_LEN+1 bytes, so the strcpy below
		   cannot overflow once this check has passed */
		if (strlen(name) > MAX_GROUPNAME_LEN)
			return NULL;

		fd = calloc(1, sizeof(struct fd));
		if (!fd)
			return NULL;

		strcpy(fd->name, name);

		INIT_LIST_HEAD(&fd->changes);
		INIT_LIST_HEAD(&fd->node_history);
		INIT_LIST_HEAD(&fd->victims);
		INIT_LIST_HEAD(&fd->complete);
		INIT_LIST_HEAD(&fd->prev);
		INIT_LIST_HEAD(&fd->leaving);

		return fd;
	}

	/* Release a fence domain and everything it owns: all queued changes,
	   the started change, the node history, and the victims, complete, prev
	   and leaving node lists.  The caller must already have unlinked fd
	   from the domains list. */
	void free_fd(struct fd *fd)
	{
		struct change *cg, *cg_safe;
		struct node_history *nodeh, *nodeh_safe;

		list_for_each_entry_safe(cg, cg_safe, &fd->changes, list) {
			list_del(&cg->list);
			free_cg(cg);
		}
		if (fd->started_change)
			free_cg(fd->started_change);

		list_for_each_entry_safe(nodeh, nodeh_safe, &fd->node_history, list) {
			list_del(&nodeh->list);
			free(nodeh);
		}

		free_node_list(&fd->victims);
		free_node_list(&fd->complete);
		free_node_list(&fd->prev);
		free_node_list(&fd->leaving);

		free(fd);
	}

	/* Look up a fence domain by exact name in the domains list.
	   Returns the domain, or NULL when none has that name. */
	struct fd *find_fd(char *name)
	{
		struct fd *fd;

		list_for_each_entry(fd, &domains, list) {
			/* equal length plus equal prefix is plain string equality */
			if (!strcmp(fd->name, name))
				return fd;
		}
		return NULL;
	}

	+/* We don't require cman dirty/disallowed to detect and handle cpg merges after
	+ a partition, because we already do that with started_count checks and our
	+ own disallowed flag. But, we do need cman dirty/disallowed to deal with
	+ correctly skipping victims that rejoin the cluster. Without cman
	+ dirty/disallowed, we'd skip fencing a node after a merge of a partition
	+ since the merged node would be a cman member and a fenced:daemon cpg member.
	+ By setting the dirty flag, cman won't report a dirty merged node as a
	+ member, so we'll continue fencing it. */
	+
	static int do_join(char *name)
	{
	struct fd *fd;
	int rv;

	/* a domain with this name must not exist yet */
	fd = find_fd(name);
	if (fd) {
	log_debug("join error: domain %s exists", name);
	rv = -EEXIST;
	goto out;
	}

	fd = create_fd(name);
	if (!fd) {
	rv = -ENOMEM;
	goto out;
	}

	rv = read_ccs(fd);
	if (rv) {
	free(fd);
	goto out;
	}

	/* join through libgroup or libcpg depending on group_mode; fd_join()
	frees fd itself when it fails */
	if (group_mode == GROUP_LIBGROUP)
	rv = fd_join_group(fd);
	else
	rv = fd_join(fd);
	+
	+ if (!rv)
	+ set_cman_dirty();
	out:
	return rv;
	}

	/* Leave the fence domain called name, through libgroup or libcpg
	   depending on group_mode.
	   Returns -EINVAL when no such domain exists, otherwise the result of
	   the leave function. */
	static int do_leave(char *name)
	{
		struct fd *domain = find_fd(name);

		if (!domain)
			return -EINVAL;

		return (group_mode == GROUP_LIBGROUP) ? fd_leave_group(domain)
						      : fd_leave(domain);
	}

	/* Handle FENCED_CMD_EXTERNAL for the domain called name: translate the
	   node name carried in extra to a nodeid and pass it to
	   send_external().

	   extra / extra_len: payload that followed the request header;
	   presumably the victim's node name including its terminating NUL
	   (TODO confirm against the client library).

	   Returns 0 on success, -EINVAL when the domain does not exist or the
	   payload is missing or unterminated, -ENOSYS in libgroup mode. */
	static int do_external(char *name, char *extra, int extra_len)
	{
		struct fd *fd;

		fd = find_fd(name);
		if (!fd)
			return -EINVAL;

		if (group_mode == GROUP_LIBGROUP)
			return -ENOSYS;

		/* name_to_nodeid() takes a C string; refuse a missing or
		   unterminated payload instead of reading past the buffer */
		if (!extra || extra_len <= 0 || !memchr(extra, '\0', extra_len))
			return -EINVAL;

		send_external(fd, name_to_nodeid(extra));
		return 0;
	}

	/* Zero *h and fill it in as a fenced header for command cmd.
	   result is stored in h->data, and h->len is the header size plus
	   extra_len bytes of payload that follow the header. */
	static void init_header(struct fenced_header *h, int cmd, int result,
				int extra_len)
	{
		memset(h, 0, sizeof(*h));

		h->command = cmd;
		h->data = result;
		h->len = sizeof(struct fenced_header) + extra_len;
		h->magic = FENCED_MAGIC;
		h->version = FENCED_VERSION;
	}

	/* combines a header and the data and sends it back to the client in
	a single do_write() call */

	/* Send a reply to descriptor f: a fenced header for cmd carrying
	   result, followed by buflen bytes copied from buf.

	   buf may be NULL, in which case the payload bytes are sent as zeros.
	   A negative buflen is treated as 0; it would otherwise shrink the
	   allocation below the header size that init_header() then clears.
	   Nothing is sent if the reply buffer cannot be allocated, and the
	   result of the write is not checked (best effort). */
	static void do_reply(int f, int cmd, int result, char *buf, int buflen)
	{
		char *reply;
		int reply_len;

		if (buflen < 0)
			buflen = 0;

		reply_len = sizeof(struct fenced_header) + buflen;
		reply = malloc(reply_len);
		if (!reply)
			return;
		memset(reply, 0, reply_len);

		init_header((struct fenced_header *)reply, cmd, result, buflen);

		if (buf && buflen)
			memcpy(reply + sizeof(struct fenced_header), buf, buflen);

		do_write(f, reply, reply_len);

		free(reply);
	}

	/* Reply to a FENCED_CMD_DUMP_DEBUG query on descriptor f with the
	   contents of dump_buf.
	   When dump_wrap is set, the header announces FENCED_DUMP_SIZE bytes,
	   sent as two writes: first the bytes from dump_point to the end of the
	   buffer, then the bytes before dump_point.  Otherwise only the first
	   dump_point bytes are sent. */
	static void query_dump_debug(int f)
	{
		struct fenced_header h;
		int announced = dump_wrap ? FENCED_DUMP_SIZE : dump_point;

		init_header(&h, FENCED_CMD_DUMP_DEBUG, 0, announced);
		do_write(f, &h, sizeof(h));

		if (dump_wrap)
			do_write(f, dump_buf + dump_point,
				 FENCED_DUMP_SIZE - dump_point);

		/* NUL terminate the debug string */
		dump_buf[dump_point] = '\0';

		do_write(f, dump_buf, dump_point);
	}

	/* Reply to a FENCED_CMD_NODE_INFO query on descriptor f.
	   data_nodeid is the node asked about; FENCED_NODEID_US means the local
	   node.  The reply always carries a struct fenced_node; when the
	   "default" domain does not exist the struct is all zeros and the
	   result is -ENOENT. */
	static void query_node_info(int f, int data_nodeid)
	{
		struct fd *fd;
		struct fenced_node node;
		int nodeid, rv;

		/* the struct is copied to the client as-is and the set_node_info
		   helpers fill only some fields, so it must start out zeroed */
		memset(&node, 0, sizeof(node));

		fd = find_fd("default");
		if (!fd) {
			rv = -ENOENT;
			goto out;
		}

		if (data_nodeid == FENCED_NODEID_US)
			nodeid = our_nodeid;
		else
			nodeid = data_nodeid;

		if (group_mode == GROUP_LIBGROUP)
			rv = set_node_info_group(fd, nodeid, &node);
		else
			rv = set_node_info(fd, nodeid, &node);
	 out:
		do_reply(f, FENCED_CMD_NODE_INFO, rv, (char *)&node, sizeof(node));
	}

	/* Reply to a FENCED_CMD_DOMAIN_INFO query on descriptor f.
	   The reply always carries a struct fenced_domain; when the "default"
	   domain does not exist it is all zeros apart from group_mode and the
	   result is -ENOENT. */
	static void query_domain_info(int f)
	{
		struct fd *fd;
		struct fenced_domain domain;
		int rv;

		/* zero before any early exit: the struct is sent to the client
		   on the error path as well */
		memset(&domain, 0, sizeof(domain));
		domain.group_mode = group_mode;

		fd = find_fd("default");
		if (!fd) {
			rv = -ENOENT;
			goto out;
		}

		if (group_mode == GROUP_LIBGROUP)
			rv = set_domain_info_group(fd, &domain);
		else
			rv = set_domain_info(fd, &domain);
	 out:
		do_reply(f, FENCED_CMD_DOMAIN_INFO, rv, (char *)&domain, sizeof(domain));
	}

	static void query_domain_nodes(int f, int option, int max)
	{
	struct fd *fd;
	int node_count = 0;
	struct fenced_node *nodes = NULL;
	int rv, result;

	fd = find_fd("default");
	if (!fd) {
	result = -ENOENT;
	node_count = 0;
	goto out;
	}

	if (group_mode == GROUP_LIBGROUP)
	rv = set_domain_nodes_group(fd, option, &node_count, &nodes);
	else
	rv = set_domain_nodes(fd, option, &node_count, &nodes);

	if (rv < 0) {
	result = rv;
	node_count = 0;
	goto out;
	}

	/* node_count is the number of structs copied/returned; the caller's
	max may be less than that, in which case we copy as many as they
	asked for and return -E2BIG */

	if (node_count > max) {
	result = -E2BIG;
	node_count = max;
	} else {
	result = node_count;
	}
	out:
	do_reply(f, FENCED_CMD_DOMAIN_NODES, result,
	(char )nodes, node_count sizeof(struct fenced_node));

	if (nodes)
	free(nodes);
	}

	/* Work function for a client connection on the command socket.
	   Reads one fenced header (plus any payload announced by h.len), checks
	   magic and major version, runs the join/leave/external command, and
	   then closes the connection in every case.  Query commands are
	   rejected here; they belong on the query socket. */
	static void process_connection(int ci)
	{
		struct fenced_header h;
		char *extra = NULL;
		/* stays 0 when the request has no payload; it is passed on to
		   do_external() and must not be read uninitialized */
		int rv, extra_len = 0;

		rv = do_read(client[ci].fd, &h, sizeof(h));
		if (rv < 0) {
			log_debug("connection %d read error %d", ci, rv);
			goto out;
		}

		if (h.magic != FENCED_MAGIC) {
			log_debug("connection %d magic error %x", ci, h.magic);
			goto out;
		}

		/* only the upper 16 bits (major version) must match */
		if ((h.version & 0xFFFF0000) != (FENCED_VERSION & 0xFFFF0000)) {
			log_debug("connection %d version error %x", ci, h.version);
			goto out;
		}

		if (h.len > sizeof(h)) {
			extra_len = h.len - sizeof(h);
			extra = malloc(extra_len);
			if (!extra) {
				log_error("process_connection no mem %d", extra_len);
				goto out;
			}
			memset(extra, 0, extra_len);

			rv = do_read(client[ci].fd, extra, extra_len);
			if (rv < 0) {
				log_debug("connection %d extra read error %d", ci, rv);
				goto out;
			}
		}

		switch (h.command) {
		case FENCED_CMD_JOIN:
			do_join("default");
			break;
		case FENCED_CMD_LEAVE:
			do_leave("default");
			break;
		case FENCED_CMD_EXTERNAL:
			do_external("default", extra, extra_len);
			break;
		case FENCED_CMD_DUMP_DEBUG:
		case FENCED_CMD_NODE_INFO:
		case FENCED_CMD_DOMAIN_INFO:
		case FENCED_CMD_DOMAIN_NODES:
			log_error("process_connection query on wrong socket");
			break;
		default:
			log_error("process_connection %d unknown command %d",
				  ci, h.command);
		}
	 out:
		free(extra);
		client_dead(ci);
	}

	static void process_listener(int ci)
	{
	int fd, i;

	fd = accept(client[ci].fd, NULL, NULL);
	if (fd < 0) {
	log_error("process_listener: accept error %d %d", fd, errno);
	return;
	}

	i = client_add(fd, process_connection, NULL);

	log_debug("client connection %d fd %d", i, fd);
	}

	/* Create a listening AF_LOCAL stream socket for sock_path.
	   The name is copied to sun_path starting at offset 1, leaving
	   sun_path[0] as a zero byte.
	   Returns the listening descriptor, or a negative value on failure. */
	static int setup_listener(char *sock_path)
	{
		struct sockaddr_un addr;
		socklen_t addrlen;
		int rv, s;

		/* the name needs room for the leading zero byte and a
		   terminating NUL inside sun_path */
		if (strlen(sock_path) > sizeof(addr.sun_path) - 2) {
			log_error("socket path too long");
			return -1;
		}

		/* we listen for new client connections on socket s */

		s = socket(AF_LOCAL, SOCK_STREAM, 0);
		if (s < 0) {
			log_error("socket error %d %d", s, errno);
			return s;
		}

		memset(&addr, 0, sizeof(addr));
		addr.sun_family = AF_LOCAL;
		strcpy(&addr.sun_path[1], sock_path);
		addrlen = sizeof(sa_family_t) + strlen(addr.sun_path+1) + 1;

		rv = bind(s, (struct sockaddr *) &addr, addrlen);
		if (rv < 0) {
			log_error("bind error %d %d", rv, errno);
			close(s);
			return rv;
		}

		rv = listen(s, 5);
		if (rv < 0) {
			log_error("listen error %d %d", rv, errno);
			close(s);
			return rv;
		}
		return s;
	}

	/* take query_mutex, the mutex that process_queries() holds while it
	answers a query */
	void query_lock(void)
	{
	pthread_mutex_lock(&query_mutex);
	}

	/* release query_mutex taken by query_lock() */
	void query_unlock(void)
	{
	pthread_mutex_unlock(&query_mutex);
	}

	/* This is a thread, so we have to be careful, don't call log_ functions.
	We need a thread to process queries because the main thread will block
	for long periods when running fence agents. */

	/* Query thread body.  arg points to the listening query socket.
	   Loops forever: accept a connection, read one fenced header, check
	   magic and major version, answer the query while holding query_mutex,
	   and close the connection.  Unknown commands are ignored without a
	   reply.  Never returns. */
	static void *process_queries(void *arg)
	{
		struct fenced_header h;
		int s = *((int *)arg);
		int f, rv;

		for (;;) {
			f = accept(s, NULL, NULL);
			if (f < 0)
				continue;

			rv = do_read(f, &h, sizeof(h));
			if (rv < 0) {
				goto out;
			}

			if (h.magic != FENCED_MAGIC) {
				goto out;
			}

			/* only the upper 16 bits (major version) must match */
			if ((h.version & 0xFFFF0000) != (FENCED_VERSION & 0xFFFF0000)) {
				goto out;
			}

			pthread_mutex_lock(&query_mutex);

			switch (h.command) {
			case FENCED_CMD_DUMP_DEBUG:
				query_dump_debug(f);
				break;
			case FENCED_CMD_NODE_INFO:
				query_node_info(f, h.data);
				break;
			case FENCED_CMD_DOMAIN_INFO:
				query_domain_info(f);
				break;
			case FENCED_CMD_DOMAIN_NODES:
				query_domain_nodes(f, h.option, h.data);
				break;
			default:
				break;
			}
			pthread_mutex_unlock(&query_mutex);

	 out:
			close(f);
		}
	}

	/* Create the query listening socket and start the query thread on it.
	   Returns 0 on success, a negative value on failure. */
	static int setup_queries(void)
	{
		/* the thread reads the socket through this pointer after we
		   have returned, so it must not live on our stack */
		static int query_sock;
		int rv;

		rv = setup_listener(FENCED_QUERY_SOCK_PATH);
		if (rv < 0)
			return rv;
		query_sock = rv;

		pthread_mutex_init(&query_mutex, NULL);

		/* pthread_create returns 0 or a positive error number, never a
		   negative value */
		rv = pthread_create(&query_thread, NULL, process_queries, &query_sock);
		if (rv) {
			log_error("can't create query thread");
			close(query_sock);
			return -rv;
		}
		return 0;
	}

	/* one directory registered through register_controlled_dir(); entries
	are linked on the controlled_entries list */
	struct controlled_entry {
	struct list_head list;
	char path[PATH_MAX+1];
	};

	/* Add path to the controlled_entries list.  At most PATH_MAX bytes of
	   path are kept; the entry is zero-filled first, so the stored string
	   is always terminated.  An allocation failure is silently ignored. */
	static void register_controlled_dir(char *path)
	{
		struct controlled_entry *entry = calloc(1, sizeof(*entry));

		if (!entry)
			return;

		strncpy(entry->path, path, PATH_MAX);
		list_add(&entry->list, &controlled_entries);
	}

	/* Decide whether the gfs entry table under sysfs_dir can be ignored.
	   Returns 1 when <sysfs_dir>/<table>/lock_module/proto_name cannot be
	   opened, 0 when it can. */
	static int ignore_nolock(char *sysfs_dir, char *table)
	{
		char path[PATH_MAX];
		int fd;

		memset(path, 0, PATH_MAX);

		snprintf(path, PATH_MAX, "%s/%s/lock_module/proto_name",
			 sysfs_dir, table);

		/* lock_nolock doesn't create the "lock_module" dir at all,
		   so we'll fail to open this */

		fd = open(path, O_RDONLY);
		if (fd < 0)
			return 1;

		close(fd);
		return 0;
	}

	/* Count and log the entries found in directory path.
	   Names starting with '.' are skipped, and so are entries for which
	   ignore_nolock() returns nonzero when path contains "fs/gfs".
	   Returns the number of entries logged; 0 if the directory cannot be
	   opened. */
	static int check_controlled_dir(char *path)
	{
		struct dirent *entry;
		int found = 0;
		DIR *dir = opendir(path);

		if (!dir)
			return 0;

		for (entry = readdir(dir); entry; entry = readdir(dir)) {
			if (entry->d_name[0] == '.')
				continue;

			if (strstr(path, "fs/gfs") && ignore_nolock(path, entry->d_name))
				continue;

			log_error("found uncontrolled entry %s/%s", path, entry->d_name);
			found++;
		}
		closedir(dir);

		return found;
	}

	/* Joining the "fenced:daemon" cpg (in setup_cpg()) tells fenced on other
	nodes that we are in a "clean state", and don't need fencing. So, if
	we're a pending fence victim on another node, they'll skip fencing us
	once we start fenced and join the "daemon" cpg (it's not the fence domain
	cpg which we join when fence_tool join is run). This "daemon" cpg is just
	to notify others that we have no uncontrolled gfs/dlm objects.
	(Conceptually, we could use the fence domain cpg for this purpose instead,
	but that would require processing domain membership changes during
	fence_victims(), which would be a major change in the way the daemon works.)

	So, if we (the local node) are not in a clean state, we don't join the
	daemon cpg and we exit; we still need to be fenced. If we are starting
	up and find that instances of gfs/dlm in the kernel have been previously
	abandoned, that's an unclean, unreset state, and we still need fencing. */

	/* returns -1 if any registered directory contains an uncontrolled entry,
	0 otherwise */
	static int check_uncontrolled_entries(void)
	{
	struct controlled_entry *ce;
	int count = 0;

	/* the default dirs below are skipped as soon as one registered path
	does not begin with "-".
	NOTE(review): print_usage() says "-" is what skips the default
	directories, but here a lone "-" entry falls through and registers
	them - confirm which behavior is intended */
	list_for_each_entry(ce, &controlled_entries, list) {
	if (strncmp(ce->path, "-", 1))
	goto skip_default;
	}

	/* the default dirs to check */
	register_controlled_dir("/sys/kernel/dlm");
	register_controlled_dir("/sys/fs/gfs2");
	register_controlled_dir("/sys/fs/gfs");

	skip_default:
	/* check_controlled_dir() returns 0 for a path that cannot be opened */
	list_for_each_entry(ce, &controlled_entries, list)
	count += check_controlled_dir(ce->path);

	if (count)
	return -1;
	return 0;
	}

	/* dead callback registered by loop() for the cman, daemon cpg and groupd
	clients: logs and sets daemon_quit; ci is not used */
	void cluster_dead(int ci)
	{
	log_error("cluster is down, exiting");
	daemon_quit = 1;
	}

	/* Daemon main loop.
	   Sets up the query thread, the command listener, cman, ccs, logging,
	   the uncontrolled-entry check, the daemon cpg and (in compat mode)
	   groupd, then polls all registered clients and runs their callbacks
	   until daemon_quit is set.  Any setup failure jumps straight to the
	   shutdown path at the end. */
	static void loop(void)
	{
		int rv, i;
		void (*workfn) (int ci);
		void (*deadfn) (int ci);

		rv = setup_queries();
		if (rv < 0)
			goto out;

		rv = setup_listener(FENCED_SOCK_PATH);
		if (rv < 0)
			goto out;
		client_add(rv, process_listener, NULL);

		rv = setup_cman();
		if (rv < 0)
			goto out;
		client_add(rv, process_cman, cluster_dead);

		rv = setup_ccs();
		if (rv < 0)
			goto out;

		setup_logging();

		rv = check_uncontrolled_entries();
		if (rv < 0)
			goto out;

		rv = setup_cpg();
		if (rv < 0)
			goto out;
		client_add(rv, process_cpg, cluster_dead);

		group_mode = GROUP_LIBCPG;

		if (cfgd_groupd_compat) {
			rv = setup_groupd();
			if (rv < 0)
				goto out;
			client_add(rv, process_groupd, cluster_dead);

			group_mode = GROUP_LIBGROUP;
			if (cfgd_groupd_compat == 2)
				set_group_mode();
		}
		log_debug("group_mode %d compat %d", group_mode, cfgd_groupd_compat);

		for (;;) {
			rv = poll(pollfd, client_maxi + 1, -1);
			if (rv == -1 && errno == EINTR) {
				/* a quit request is honored only once no domain
				   is left; otherwise it is dropped */
				if (daemon_quit && list_empty(&domains))
					goto out;
				daemon_quit = 0;
				continue;
			}
			if (rv < 0) {
				log_error("poll errno %d", errno);
				goto out;
			}

			/* keep the query thread out while callbacks run */
			pthread_mutex_lock(&query_mutex);

			for (i = 0; i <= client_maxi; i++) {
				if (client[i].fd < 0)
					continue;
				if (pollfd[i].revents & POLLIN) {
					workfn = client[i].workfn;
					workfn(i);
				}
				if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
					deadfn = client[i].deadfn;
					deadfn(i);
				}
			}
			pthread_mutex_unlock(&query_mutex);

			if (daemon_quit)
				break;
		}
	 out:
		if (cfgd_groupd_compat)
			close_groupd();
		close_cpg();
		close_logging();
		close_ccs();
		close_cman();

		if (!list_empty(&domains))
			log_error("domain abandoned");
	}

	/* Create and write-lock LOCKFILE_NAME, then store our pid in it.
	   Exits the process if the file cannot be opened, is already locked,
	   cannot be truncated or cannot be written.  The descriptor is never
	   closed here. */
	static void lockfile(void)
	{
		int fd, error;
		struct flock lock;
		char buf[33];

		memset(buf, 0, 33);

		fd = open(LOCKFILE_NAME, O_CREAT|O_WRONLY,
			  S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
		if (fd < 0) {
			fprintf(stderr, "cannot open/create lock file %s\n",
				LOCKFILE_NAME);
			exit(EXIT_FAILURE);
		}

		/* l_start 0 with l_len 0 locks the whole file */
		lock.l_type = F_WRLCK;
		lock.l_start = 0;
		lock.l_whence = SEEK_SET;
		lock.l_len = 0;

		error = fcntl(fd, F_SETLK, &lock);
		if (error) {
			fprintf(stderr, "is already running\n");
			exit(EXIT_FAILURE);
		}

		error = ftruncate(fd, 0);
		if (error) {
			fprintf(stderr, "cannot clear lock file %s\n", LOCKFILE_NAME);
			exit(EXIT_FAILURE);
		}

		snprintf(buf, sizeof(buf), "%d\n", getpid());

		error = write(fd, buf, strlen(buf));
		if (error <= 0) {
			fprintf(stderr, "cannot write lock file %s\n", LOCKFILE_NAME);
			exit(EXIT_FAILURE);
		}
	}

	/*
	 * Print the command-line help text to stdout.  Fixed text is emitted in
	 * runs with fputs(); lines that interpolate a compiled-in default keep
	 * their own printf().
	 */
	static void print_usage(void)
	{
		fputs("Usage:\n"
		      "\n"
		      "fenced [options]\n"
		      "\n"
		      "Options:\n"
		      "\n"
		      " -D Enable debugging code and don't fork\n",
		      stdout);

		printf(" -L <num> Enable (1) or disable (0) debugging to logsys (default %d)\n",
		       DEFAULT_DEBUG_LOGSYS);
		printf(" -g <num> groupd compatibility mode, 0 off, 1 on, 2 detect (default %d)\n",
		       DEFAULT_GROUPD_COMPAT);

		fputs(" 0: use libcpg, no backward compat, best performance\n"
		      " 1: use libgroup for compat with cluster2/rhel5\n"
		      " 2: use groupd to detect old, or mode 1, nodes that\n"
		      " require compat, use libcpg if none found\n"
		      " -r <path> Register a directory that needs to be empty for\n"
		      " the daemon to start. \"-\" to skip default directories\n"
		      " /sys/fs/gfs, /sys/fs/gfs2, /sys/kernel/dlm\n"
		      " -c All nodes are in a clean state to start; do no startup fencing\n"
		      " -s Skip startup fencing of nodes with no defined fence methods\n",
		      stdout);

		printf(" -j <secs> Post-join fencing delay (default %d)\n",
		       DEFAULT_POST_JOIN_DELAY);
		printf(" -f <secs> Post-fail fencing delay (default %d)\n",
		       DEFAULT_POST_FAIL_DELAY);
		printf(" -R <secs> Override time (default %d)\n",
		       DEFAULT_OVERRIDE_TIME);
		printf(" -O <path> Override path (default %s)\n",
		       DEFAULT_OVERRIDE_PATH);

		fputs(" -h Print this help, then exit\n"
		      " -V Print program version information, then exit\n"
		      "\n",
		      stdout);

		printf("Command line values override those in " DEFAULT_CONFIG_DIR "/" DEFAULT_CONFIG_FILE ".\n");

		fputs("For an unbounded delay use <secs> value of -1.\n"
		      "\n",
		      stdout);
	}

	/*
	 * getopt() option letters.  "R:" is listed so that the -R <secs> override
	 * time option, which the usage text documents and the switch below
	 * handles, is accepted rather than rejected by getopt() as unknown.
	 */
	#define OPTION_STRING "L:g:cj:f:Dn:O:hVSse:r:R:"

	/*
	 * Parse the command line into the optd_ and cfgd_ settings.
	 *
	 * For each recognised option the optd_<name> flag is set to 1 and the
	 * value is stored in the matching cfgd_<name>.  -h and -V print and exit
	 * successfully; a getopt() error or an unhandled letter exits with
	 * EXIT_FAILURE.  When -L was not given, the FENCED_DEBUG environment
	 * variable supplies the logsys debug setting instead.
	 */
	static void read_arguments(int argc, char **argv)
	{
		int cont = 1;
		int optchar;

		while (cont) {
			optchar = getopt(argc, argv, OPTION_STRING);

			switch (optchar) {

			case 'D':
				daemon_debug_opt = 1;
				break;

			case 'L':
				optd_debug_logsys = 1;
				cfgd_debug_logsys = atoi(optarg);
				break;

			case 'g':
				optd_groupd_compat = 1;
				cfgd_groupd_compat = atoi(optarg);
				break;

			case 'c':
				optd_clean_start = 1;
				cfgd_clean_start = 1;
				break;

			case 's':
				optd_skip_undefined = 1;
				cfgd_skip_undefined = 1;
				break;

			case 'j':
				optd_post_join_delay = 1;
				cfgd_post_join_delay = atoi(optarg);
				break;

			case 'f':
				optd_post_fail_delay = 1;
				cfgd_post_fail_delay = atoi(optarg);
				break;

			case 'R':
				optd_override_time = 1;
				cfgd_override_time = atoi(optarg);
				/* values below 3 are raised to 3 */
				if (cfgd_override_time < 3)
					cfgd_override_time = 3;
				break;

			case 'O':
				optd_override_path = 1;
				cfgd_override_path = strdup(optarg);
				break;

			case 'r':
				register_controlled_dir(optarg);
				break;

			case 'h':
				print_usage();
				exit(EXIT_SUCCESS);
				break;

			case 'V':
				printf("fenced %s (built %s %s)\n", RELEASE_VERSION,
				       __DATE__, __TIME__);
				printf("%s\n", REDHAT_COPYRIGHT);
				exit(EXIT_SUCCESS);
				break;

			case ':':
			case '?':
				fprintf(stderr, "Please use '-h' for usage.\n");
				exit(EXIT_FAILURE);
				break;

			case EOF:
				cont = 0;
				break;

			default:
				/*
				 * Reached for letters that OPTION_STRING accepts
				 * but that have no case above (n, S, e).
				 */
				fprintf(stderr, "unknown option: %c\n", optchar);
				exit(EXIT_FAILURE);
			};
		}

		if (!optd_debug_logsys && getenv("FENCED_DEBUG")) {
			optd_debug_logsys = 1;
			cfgd_debug_logsys = atoi(getenv("FENCED_DEBUG"));
		}
	}

	/*
	 * Write val to /proc/self/oom_adj.  Best effort: if the file cannot be
	 * opened for writing the request is silently dropped.
	 */
	static void set_oom_adj(int val)
	{
		FILE *adj = fopen("/proc/self/oom_adj", "w");

		if (adj) {
			fprintf(adj, "%i", val);
			fclose(adj);
		}
	}

	/*
	 * Program entry point: initialise the global lists and logging, parse
	 * the command line, take the instance lock, detach from the terminal
	 * unless -D was given, install the SIGTERM handler, adjust the OOM
	 * setting and run the main loop.  Always returns 0 once loop() returns.
	 */
	int main(int argc, char **argv)
	{
		INIT_LIST_HEAD(&domains);
		INIT_LIST_HEAD(&controlled_entries);

		init_logging();
		read_arguments(argc, argv);
		lockfile();

		/* -D keeps the daemon in the foreground */
		if (!daemon_debug_opt) {
			int detached = daemon(0, 0);

			if (detached < 0) {
				perror("main: cannot fork");
				exit(EXIT_FAILURE);
			}
			umask(0);
		}

		signal(SIGTERM, sigterm_handler);
		set_oom_adj(-16);

		loop();

		return 0;
	}

	/*
	 * Append the current contents of daemon_debug_buf to the circular dump
	 * buffer.  dump_point is the next write position; when it reaches
	 * FENCED_DUMP_SIZE it restarts at 0 and dump_wrap is set to record that
	 * older data has been overwritten.
	 */
	void daemon_dump_save(void)
	{
		int remaining = strlen(daemon_debug_buf);
		const char *src = daemon_debug_buf;

		while (remaining-- > 0) {
			dump_buf[dump_point++] = *src++;

			if (dump_point == FENCED_DUMP_SIZE) {
				dump_point = 0;
				dump_wrap = 1;
			}
		}
	}

	/* Daemon-wide state defined here and shared with the other fenced files. */

	/* set by -D: debugging enabled, main() does not call daemon() */
	int daemon_debug_opt;
	/* set by cluster_dead(); loop() leaves its poll loop when non-zero */
	int daemon_quit;
	/* list head initialised in main(); loop() checks it for emptiness on exit */
	struct list_head domains;
	/* result of the latest cman_is_quorate() call (see member_cman.c) */
	int cman_quorate;
	/* local node id and name, filled in by setup_cman() */
	int our_nodeid;
	char our_name[MAX_NODENAME_LEN+1];
	/* most recent formatted debug line, consumed by daemon_dump_save() */
	char daemon_debug_buf[256];
	/* circular dump buffer with its write position and wrapped flag */
	char dump_buf[FENCED_DUMP_SIZE];
	int dump_point;
	int dump_wrap;
	/* GROUP_LIBCPG or GROUP_LIBGROUP, chosen in loop() */
	int group_mode;

	diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
	index 16e08a82b..a35e746a6 100644
	--- a/fence/fenced/member_cman.c
	+++ b/fence/fenced/member_cman.c
	@@ -1,262 +1,271 @@
	#include "fd.h"
	#include "config.h"
	#include <libcman.h>

	/* Size of a node-name scratch buffer, including the terminating NUL.
	   Parenthesized so the macro is safe inside larger expressions. */
	#define BUFLEN (MAX_NODENAME_LEN + 1)

	/* cman connection used for queries, notifications and dispatch */
	static cman_handle_t ch;
	/* cman admin connection used for kill/shutdown/set-dirty requests */
	static cman_handle_t ch_admin;
	/* node table most recently fetched by statechange(), and its length */
	static cman_node_t cman_nodes[MAX_NODES];
	static int cman_node_count;

	+/*
	+ * Call cman_set_dirty() on the admin handle and log an error if it
	+ * returns non-zero.  The result is not reported to the caller.
	+ */
	+void set_cman_dirty(void)
	+{
	+ int rv;
	+
	+ rv = cman_set_dirty(ch_admin);
	+ if (rv)
	+ log_error("cman_set_dirty error %d", rv);
	+}
	+
	/*
	 * Ask cman, through the admin handle, to remove a node from the cluster.
	 * A nodeid of 0 means "this node": cman is told to shut the cluster down
	 * locally.  Any other value is passed to cman_kill_node().
	 */
	void kick_node_from_cluster(int nodeid)
	{
		if (nodeid) {
			log_error("telling cman to remove nodeid %d from cluster",
				  nodeid);
			cman_kill_node(ch_admin, nodeid);
			return;
		}

		log_error("telling cman to shut down cluster locally");
		cman_shutdown(ch_admin, CMAN_SHUTDOWN_ANYWAY);
	}

	/*
	 * Compare two node names.
	 *
	 * Returns 1 when the names are identical, or when the parts before the
	 * first '.' (the short host names) are identical; returns 0 otherwise.
	 *
	 * The short names are compared in place by length, so no fixed-size
	 * scratch buffer is needed and arbitrarily long names are handled.
	 */
	static int name_equal(char *name1, char *name2)
	{
		size_t short1, short2;

		if (!strcmp(name1, name2))
			return 1;

		/* length of each name up to, not including, the first '.' */
		short1 = strcspn(name1, ".");
		short2 = strcspn(name2, ".");

		if (short1 == short2 && !strncmp(name1, name2, short1))
			return 1;

		return 0;
	}

	/*
	 * Look up a node in the cached cman node table by name, using
	 * name_equal() for the comparison.  Returns a pointer into cman_nodes,
	 * or NULL when no entry matches.
	 */
	static cman_node_t *find_cman_node_name(char *name)
	{
		int i;

		for (i = 0; i < cman_node_count; i++) {
			if (name_equal(cman_nodes[i].cn_name, name))
				return &cman_nodes[i];
		}
		return NULL;
	}

	static cman_node_t *find_cman_node(int nodeid)
	{
	int i;

	for (i = 0; i < cman_node_count; i++) {
	if (cman_nodes[i].cn_nodeid == nodeid)
	return &cman_nodes[i];
	}
	return NULL;
	}

	/*
	 * Translate a node id to its name from the cached cman node table.
	 * Returns the literal "unknown" when the id is not in the table.  The
	 * returned pointer refers to the table entry or to that literal.
	 */
	char *nodeid_to_name(int nodeid)
	{
		cman_node_t *match = find_cman_node(nodeid);

		return match ? match->cn_name : "unknown";
	}

	/*
	 * Translate a node name to its node id using the cached cman node
	 * table.  Returns -1 when no entry matches the name.
	 */
	int name_to_nodeid(char *name)
	{
		cman_node_t *match = find_cman_node_name(name);

		if (!match)
			return -1;

		return match->cn_nodeid;
	}

	static void statechange(void)
	{
	int rv;

	cman_quorate = cman_is_quorate(ch);
	cman_node_count = 0;
	memset(&cman_nodes, 0, sizeof(cman_nodes));

	rv = cman_get_nodes(ch, MAX_NODES, &cman_node_count, cman_nodes);
	if (rv < 0)
	log_error("cman_get_nodes error %d %d", rv, errno);
	}

	/*
	 * Notification callback passed to cman_start_notification().
	 *
	 *  - TRY_SHUTDOWN:  agree (1) when no fence domain exists, refuse (0)
	 *                   otherwise.
	 *  - STATECHANGE:   refresh the cached view; if quorum was just gained
	 *                   while in GROUP_LIBCPG mode, run process_fd_changes().
	 *  - CONFIG_UPDATE: re-run setup_logging().
	 *
	 * Other reasons are ignored.  h, private and arg are not used.
	 */
	static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
	{
		/* quorum state as it was before any refresh below */
		const int was_quorate = cman_quorate;

		if (reason == CMAN_REASON_TRY_SHUTDOWN) {
			if (!list_empty(&domains)) {
				log_debug("no to cman shutdown");
				cman_replyto_shutdown(ch, 0);
			} else {
				cman_replyto_shutdown(ch, 1);
			}
		} else if (reason == CMAN_REASON_STATECHANGE) {
			statechange();

			/* domain may have been waiting for quorum */
			if (!was_quorate && cman_quorate &&
			    (group_mode == GROUP_LIBCPG))
				process_fd_changes();
		} else if (reason == CMAN_REASON_CONFIG_UPDATE) {
			setup_logging();
		}
	}

	/*
	 * Work callback for the cman descriptor: dispatch all pending cman
	 * events.  If the dispatch fails with EHOSTDOWN the cluster is treated
	 * as dead.  The client index ci is not used.
	 */
	void process_cman(int ci)
	{
		int status = cman_dispatch(ch, CMAN_DISPATCH_ALL);

		if (status != -1)
			return;

		if (errno == EHOSTDOWN)
			cluster_dead(0);
	}

	/*
	 * Connect to cman.
	 *
	 * Opens the admin handle (retrying twice, one second apart) and the
	 * regular handle, waits for cman to become active (same retry policy),
	 * starts notifications, loads the initial cluster view and records our
	 * own node id and name in our_nodeid / our_name.
	 *
	 * Returns the cman file descriptor to poll on success, or a negative
	 * value on failure.  On failure every handle opened so far is finished
	 * and reset to NULL, so nothing is leaked and no stale handle is left
	 * behind for later use.
	 */
	int setup_cman(void)
	{
		cman_node_t node;
		int rv, fd;
		int init = 0, active = 0;

	retry_init:
		ch_admin = cman_admin_init(NULL);
		if (!ch_admin) {
			if (init++ < 2) {
				sleep(1);
				goto retry_init;
			}
			log_error("cman_admin_init error %d", errno);
			return -ENOTCONN;
		}

		ch = cman_init(NULL);
		if (!ch) {
			log_error("cman_init error %d", errno);
			rv = -ENOTCONN;
			goto fail_admin;
		}

	retry_active:
		rv = cman_is_active(ch);
		if (!rv) {
			if (active++ < 2) {
				sleep(1);
				goto retry_active;
			}
			log_error("cman_is_active error %d", errno);
			rv = -ENOTCONN;
			goto fail;
		}

		rv = cman_start_notification(ch, cman_callback);
		if (rv < 0) {
			log_error("cman_start_notification error %d %d", rv, errno);
			goto fail;
		}

		statechange();

		fd = cman_get_fd(ch);

		/* FIXME: wait here for us to be a member of the cluster */
		memset(&node, 0, sizeof(node));
		rv = cman_get_node(ch, CMAN_NODEID_US, &node);
		if (rv < 0) {
			log_error("cman_get_node us error %d %d", rv, errno);
			goto fail;
		}

		memset(our_name, 0, sizeof(our_name));
		strncpy(our_name, node.cn_name, CMAN_MAX_NODENAME_LEN);
		our_nodeid = node.cn_nodeid;

		log_debug("our_nodeid %d our_name %s", our_nodeid, our_name);
		return fd;

	fail:
		cman_finish(ch);
		ch = NULL;
	fail_admin:
		/* the admin handle was previously leaked on these paths */
		cman_finish(ch_admin);
		ch_admin = NULL;
		return rv;
	}

	void close_cman(void)
	{
	cman_finish(ch);
	}

	/*
	 * Report whether nodeid is currently a cman member.  Returns 1 when the
	 * node is in the table with cn_member set, otherwise logs a debug line
	 * and returns 0.
	 */
	int is_cman_member(int nodeid)
	{
		cman_node_t *entry;

		/* Note: in fence delay loop we aren't processing callbacks so won't
		   have done a statechange() in response to a cman callback */
		statechange();

		entry = find_cman_node(nodeid);
		if (!entry || !entry->cn_member) {
			log_debug("node %d not a cman member, cn %d", nodeid,
				  entry ? 1 : 0);
			return 0;
		}

		return 1;
	}

	/*
	 * Allocate a zeroed struct node for nodeid and, when cman knows the
	 * node, copy its name in.  A failed cman lookup is only logged; the
	 * node is still returned with an empty name.
	 *
	 * Returns the new node, which the caller owns, or NULL when the
	 * allocation fails.  The fd argument is not used here.
	 */
	struct node *get_new_node(struct fd *fd, int nodeid)
	{
		cman_node_t cn;
		struct node *node;
		int rv;

		node = malloc(sizeof(*node));
		if (!node)
			return NULL;
		memset(node, 0, sizeof(*node));

		node->nodeid = nodeid;

		memset(&cn, 0, sizeof(cn));
		rv = cman_get_node(ch, nodeid, &cn);
		if (rv < 0)
			log_debug("get_new_node %d no cman node %d", nodeid, rv);
		else
			strncpy(node->name, cn.cn_name, MAX_NODENAME_LEN);

		return node;
	}

	diff --git a/group/daemon/cman.c b/group/daemon/cman.c
	index a5ae333ca..f45d23f0f 100644
	--- a/group/daemon/cman.c
	+++ b/group/daemon/cman.c
	@@ -1,213 +1,203 @@

	/* Interface with corosync's cman API */

	#include <libcman.h>
	#include "gd_internal.h"

	/* cman connection used for queries, notifications and dispatch */
	static cman_handle_t ch;
	/* cman admin connection, used by kill_cman() */
	static cman_handle_t ch_admin;
	/* previous quorum state and node table, saved by statechange() */
	static int old_quorate;
	static cman_node_t old_nodes[MAX_NODES];
	static int old_node_count;
	/* current node table and its length, as last read from cman */
	static cman_node_t cman_nodes[MAX_NODES];
	static int cman_node_count;
	/* storage that our_name points at once setup_cman() has run */
	static char name_buf[CMAN_MAX_NODENAME_LEN+1];


	/* Ask cman, via the admin handle, to kill nodeid; returns the
	   cman_kill_node() result unchanged. */
	int kill_cman(int nodeid)
	{
	return cman_kill_node(ch_admin, nodeid);
	}

	-int set_cman_dirty(void)
	-{
	- int rv;
	-
	- rv = cman_set_dirty(ch_admin);
	- if (rv)
	- log_print("cman_set_dirty error %d", rv);
	- return rv;
	-}
	-
	/*
	 * Search the first count entries of node_list for nodeid.  Returns that
	 * entry's cn_member value, or 0 when the node id is not present.
	 */
	static int is_member(cman_node_t *node_list, int count, int nodeid)
	{
		int idx = 0;

		while (idx < count) {
			if (node_list[idx].cn_nodeid == nodeid)
				return node_list[idx].cn_member;
			idx++;
		}
		return 0;
	}

	/* Membership of nodeid in the previous node table (old_nodes). */
	static int is_old_member(int nodeid)
	{
	return is_member(old_nodes, old_node_count, nodeid);
	}

	/* Membership of nodeid in the current node table (cman_nodes). */
	static int is_cman_member(int nodeid)
	{
	return is_member(cman_nodes, cman_node_count, nodeid);
	}

	static void statechange(void)
	{
	int i, rv;

	old_quorate = cman_quorate;
	old_node_count = cman_node_count;
	memcpy(&old_nodes, &cman_nodes, sizeof(old_nodes));

	cman_quorate = cman_is_quorate(ch);

	cman_node_count = 0;
	memset(&cman_nodes, 0, sizeof(cman_nodes));
	rv = cman_get_nodes(ch, MAX_NODES, &cman_node_count, cman_nodes);
	if (rv < 0) {
	log_print("cman_get_nodes error %d %d", rv, errno);
	return;
	}

	/*
	printf("cman: %d old nodes:\n", old_node_count);
	for (i = 0; i < old_node_count; i++)
	printf("%d:%d ", old_nodes[i].cn_nodeid,
	old_nodes[i].cn_member);
	printf("\n");

	printf("cman: %d new nodes:\n", cman_node_count);
	for (i = 0; i < cman_node_count; i++)
	printf("%d:%d ", cman_nodes[i].cn_nodeid,
	cman_nodes[i].cn_member);
	printf("\n");
	*/

	if (old_quorate && !cman_quorate)
	log_debug("cman: lost quorum");
	if (!old_quorate && cman_quorate)
	log_debug("cman: have quorum");

	for (i = 0; i < old_node_count; i++) {
	if (old_nodes[i].cn_member &&
	!is_cman_member(old_nodes[i].cn_nodeid)) {

	log_debug("cman: node %d removed",
	old_nodes[i].cn_nodeid);
	add_recovery_set_cman(old_nodes[i].cn_nodeid);
	}
	}

	for (i = 0; i < cman_node_count; i++) {
	if (cman_nodes[i].cn_member &&
	!is_old_member(cman_nodes[i].cn_nodeid))
	log_debug("cman: node %d added",
	cman_nodes[i].cn_nodeid);
	}
	}

	/*
	 * Notification callback passed to cman_start_notification(): always
	 * agrees to a shutdown request and re-reads the cluster view on a state
	 * change.  Other reasons are ignored; h, private and arg are not used.
	 */
	static void cman_callback(cman_handle_t h, void *private, int reason, int arg)
	{
		if (reason == CMAN_REASON_TRY_SHUTDOWN)
			cman_replyto_shutdown(ch, 1);
		else if (reason == CMAN_REASON_STATECHANGE)
			statechange();
	}

	/*
	 * Work callback for the cman descriptor: dispatch all pending cman
	 * events and treat an EHOSTDOWN failure as the cluster having died.
	 * The client index ci is not used.
	 */
	void process_cman(int ci)
	{
		int status = cman_dispatch(ch, CMAN_DISPATCH_ALL);

		if (status != -1)
			return;

		if (errno == EHOSTDOWN)
			cluster_dead(0);
	}

	/*
	 * Connect to cman.
	 *
	 * Opens the regular handle (retrying twice, one second apart), waits
	 * for cman to become active (same retry policy), opens the admin
	 * handle, starts notifications and reads our own node entry, the node
	 * table and the quorum state.  our_name is pointed at name_buf.
	 *
	 * Returns the cman file descriptor on success or a negative value on
	 * failure, after undoing whatever had been set up.
	 */
	int setup_cman(void)
	{
		cman_node_t self;
		int rv;
		int init_tries = 0, active_tries = 0;

	retry_init:
		ch = cman_init(NULL);
		if (!ch) {
			if (init_tries++ < 2) {
				sleep(1);
				goto retry_init;
			}
			log_print("cman_init error %d", errno);
			return -ENOTCONN;
		}

	retry_active:
		rv = cman_is_active(ch);
		if (!rv) {
			if (active_tries++ < 2) {
				sleep(1);
				goto retry_active;
			}
			log_print("cman_is_active error %d", errno);
			cman_finish(ch);
			return -ENOTCONN;
		}

		ch_admin = cman_admin_init(NULL);
		if (!ch_admin) {
			log_print("cman_admin_init error %d", errno);
			cman_finish(ch);
			return -ENOTCONN;
		}

		rv = cman_start_notification(ch, cman_callback);
		if (rv < 0) {
			log_print("cman_start_notification error %d %d", rv, errno);
			goto fail_handles;
		}

		memset(&self, 0, sizeof(self));
		rv = cman_get_node(ch, CMAN_NODEID_US, &self);
		if (rv < 0) {
			log_print("cman_get_node us error %d %d", rv, errno);
			goto fail_notify;
		}

		memset(cman_nodes, 0, sizeof(cman_nodes));
		cman_node_count = 0;
		rv = cman_get_nodes(ch, MAX_NODES, &cman_node_count, cman_nodes);
		if (rv < 0) {
			log_print("cman_get_nodes error %d %d", rv, errno);
			goto fail_notify;
		}

		cman_quorate = cman_is_quorate(ch);

		memset(name_buf, 0, sizeof(name_buf));
		strncpy(name_buf, self.cn_name, CMAN_MAX_NODENAME_LEN);
		our_name = name_buf;
		our_nodeid = self.cn_nodeid;
		log_debug("cman: our nodeid %d name %s quorum %d",
			  our_nodeid, our_name, cman_quorate);

		return cman_get_fd(ch);

	fail_notify:
		cman_stop_notification(ch);
	fail_handles:
		cman_finish(ch);
		cman_finish(ch_admin);
		return rv;
	}

	/* Finish both cman connections: the regular handle, then the admin one. */
	void close_cman(void)
	{
	cman_finish(ch);
	cman_finish(ch_admin);
	}

	diff --git a/group/daemon/gd_internal.h b/group/daemon/gd_internal.h
	index 3537f825c..3741f654b 100644
	--- a/group/daemon/gd_internal.h
	+++ b/group/daemon/gd_internal.h
	@@ -1,325 +1,324 @@
	#ifndef __GD_INTERNAL_DOT_H__
	#define __GD_INTERNAL_DOT_H__

	#include <unistd.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <stddef.h>
	#include <fcntl.h>
	#include <string.h>
	#include <strings.h>
	#include <ctype.h>
	#include <dirent.h>
	#include <syslog.h>
	#include <time.h>
	#include <sched.h>
	#include <sys/socket.h>
	#include <sys/un.h>
	#include <sys/types.h>
	#include <sys/errno.h>
	#include <sys/poll.h>
	#include <sys/stat.h>
	#include <sys/wait.h>
	#include <corosync/cpg.h>
	#include <corosync/engine/logsys.h>

	#include "list.h"
	#include "linux_endian.h"
	#include "groupd.h"
	#include "libgroup.h"

	#define MAX_NAMELEN 32 /* should match libgroup.h */
	#define MAX_LEVELS 4
	#define MAX_NODES 128

	/* Daemon-wide state; the definitions live outside this header. */
	extern int daemon_debug_opt;
	extern int daemon_debug_verbose;
	extern int daemon_quit;
	extern int cman_quorate;
	extern int our_nodeid;
	extern char *our_name;
	/* last formatted debug line, written by the log_* macros below */
	extern char daemon_debug_buf[256];
	extern char dump_buf[GROUPD_DUMP_SIZE];
	extern int dump_point;
	extern int dump_wrap;
	/* all groups, and the same groups bucketed by level */
	extern struct list_head gd_groups;
	extern struct list_head gd_levels[MAX_LEVELS];
	extern uint32_t gd_event_nr;

	/* values of group_mode */
	#define GROUP_PENDING 1
	#define GROUP_LIBGROUP 2
	#define GROUP_LIBCPG 3

	extern int group_mode;

	/* compiled-in defaults for the settings declared below */
	#define DEFAULT_GROUPD_COMPAT 2
	#define DEFAULT_GROUPD_WAIT 5
	#define DEFAULT_GROUPD_MODE_DELAY 2
	#define DEFAULT_DEBUG_LOGSYS 0

	/* NOTE(review): presumably optd_* flags record that a setting came from
	   the command line and cfgd_* hold the effective value, as in fenced's
	   read_arguments() - confirm against groupd's main.c */
	extern int optd_groupd_compat;
	extern int optd_groupd_wait;
	extern int optd_groupd_mode_delay;
	extern int optd_debug_logsys;

	extern int cfgd_groupd_compat;
	extern int cfgd_groupd_wait;
	extern int cfgd_groupd_mode_delay;
	extern int cfgd_debug_logsys;

	void daemon_dump_save(void);

	/*
	 * Format a timestamped debug line into daemon_debug_buf, save it with
	 * daemon_dump_save(), echo it to stderr when daemon_debug_opt is set and
	 * pass it to log_printf(LOG_DEBUG) when cfgd_debug_logsys is set.
	 * The line is truncated to fit the 255-byte limit given to snprintf().
	 */
	#define log_debug(fmt, args...) \
	do { \
	snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
	daemon_dump_save(); \
	if (daemon_debug_opt) \
	fprintf(stderr, "%s", daemon_debug_buf); \
	if (cfgd_debug_logsys) \
	log_printf(LOG_DEBUG, "%s", daemon_debug_buf); \
	} while (0)

	/*
	 * Same as log_debug(), with the line prefixed by the group's
	 * "level:name".  g is evaluated twice, so it must have no side effects.
	 */
	#define log_group(g, fmt, args...) \
	do { \
	snprintf(daemon_debug_buf, 255, "%ld %d:%s " fmt "\n", time(NULL), \
	(g)->level, (g)->name, ##args); \
	daemon_dump_save(); \
	if (daemon_debug_opt) \
	fprintf(stderr, "%s", daemon_debug_buf); \
	if (cfgd_debug_logsys) \
	log_printf(LOG_DEBUG, "%s", daemon_debug_buf); \
	} while (0)

	/*
	 * Error-level message without a group: recorded through log_debug() and
	 * also sent to log_printf(LOG_ERR).  The arguments are expanded twice.
	 */
	#define log_print(fmt, args...) \
	do { \
	log_debug(fmt, ##args); \
	log_printf(LOG_ERR, fmt, ##args); \
	} while (0)

	/*
	 * Error-level message for a group: recorded through log_group() with the
	 * group prefix and also sent, without the prefix, to log_printf(LOG_ERR).
	 * The arguments are expanded twice.
	 */
	#define log_error(g, fmt, args...) \
	do { \
	log_group(g, fmt, ##args); \
	log_printf(LOG_ERR, fmt, ##args); \
	} while (0)

	/*
	 * Soft assertion: when x is false, log the failing expression with its
	 * file and line through log_print().  Execution continues; nothing is
	 * aborted.
	 */
	#define ASSERT(x) \
	do { \
	if (!(x)) { \
	log_print("Assertion failed on line %d of file %s\n" \
	"Assertion: \"%s\"\n", __LINE__, __FILE__, #x); \
	} \
	} while (0)

	#ifndef TRUE
	#define TRUE (1)
	#endif
	#ifndef FALSE
	#define FALSE (0)
	#endif

	struct group;
	struct app;
	struct event;
	struct node;
	struct msg;
	typedef struct group group_t;
	typedef struct app app_t;
	typedef struct event event_t;
	typedef struct node node_t;
	typedef struct msg msg_t;


	/*
	* Event - manages nodes joining/leaving/failing
	*/

	#define EST_JOIN_BEGIN 1
	#define EST_JOIN_STOP_WAIT 2
	#define EST_JOIN_ALL_STOPPED 3
	#define EST_JOIN_START_WAIT 4
	#define EST_JOIN_ALL_STARTED 5
	#define EST_LEAVE_BEGIN 6
	#define EST_LEAVE_STOP_WAIT 7
	#define EST_LEAVE_ALL_STOPPED 8
	#define EST_LEAVE_START_WAIT 9
	#define EST_LEAVE_ALL_STARTED 10
	#define EST_FAIL_BEGIN 11
	#define EST_FAIL_STOP_WAIT 12
	#define EST_FAIL_ALL_STOPPED 13
	#define EST_FAIL_START_WAIT 14
	#define EST_FAIL_ALL_STARTED 15

	/*
	 * One membership event for an app.  state presumably holds one of the
	 * EST_* values defined above - confirm in app.c.
	 */
	struct event {
	struct list_head list; /* linkage on app->events (walked in do_leave) */
	struct list_head memb;
	int event_nr;
	int state;
	int nodeid; /* node the event is about; compared with our_nodeid */
	uint64_t id; /* logged as the event id */
	struct list_head extended;
	int start_app_before_pending_rev;
	int fail_all_stopped;
	};

	/*
	* Group
	*/

	/*
	 * A group, identified by (level, name).  create_group() allocates it
	 * zeroed and links it on both lists; remove_group() unlinks and frees it.
	 */
	struct group {
	struct list_head list; /* list of groups */
	struct list_head level_list; /* linkage on gd_levels[level] */
	uint16_t level;
	uint32_t global_id;
	struct list_head memb; /* node_t entries, freed by free_group_memb() */
	int memb_count;
	int namelen; /* strlen(name) */
	char name[MAX_NAMELEN+1];
	app_t *app; /* set by create_app() */
	struct list_head messages;
	cpg_handle_t cpg_handle;
	int cpg_fd;
	int cpg_client;
	int have_set_id;
	int joining; /* set to 1 by do_join() before do_cpg_join() */
	int leaving; /* set to 1 by do_leave() before do_cpg_leave() */
	};

	/*
	 * Per-group application state, allocated by create_app() and linked
	 * both ways with its group (a->g and g->app).
	 */
	struct app {
	int client; /* client index passed to do_join() */
	int node_count;
	struct list_head nodes;
	struct list_head events; /* queued event_t entries */
	event_t *current_event;
	group_t *g;
	int need_first_event; /* for debugging */
	};

	#define MSG_APP_STOPPED 1
	#define MSG_APP_STARTED 2
	#define MSG_APP_RECOVER 3
	#define MSG_APP_INTERNAL 4
	#define MSG_GLOBAL_ID 5
	#define MSG_GROUP_VERSION 6

	#define MSG_VER_MAJOR 1
	#define MSG_VER_MINOR 1
	#define MSG_VER_PATCH 0

	/*
	 * Message header.  NOTE(review): ms_type presumably carries one of the
	 * MSG_* values above and ms_version the MSG_VER_* triple; byte order is
	 * presumably handled by msg_bswap_in()/msg_bswap_out() - confirm.
	 */
	struct msg {
	uint32_t ms_version[3];
	uint16_t ms_type;
	uint16_t ms_level;
	uint32_t ms_length;
	uint32_t ms_global_id;
	uint64_t ms_event_id;
	char ms_name[MAX_NAMELEN];
	};

	/*
	 * A stored message: list linkage, a node id, the message length, an
	 * embedded msg_t header and a separate msg_long pointer.
	 * NOTE(review): nodeid is presumably the sender and msg_long the full
	 * payload when it does not fit msg - confirm in cpg.c.
	 */
	struct save_msg {
	struct list_head list;
	int nodeid;
	int print_ignore;
	int msg_len;
	msg_t msg;
	char *msg_long;
	};

	/*
	 * A node entry kept on a list; new_node() allocates it zeroed and sets
	 * nodeid.  The stopped/started flags start at 0.
	 */
	struct node {
	struct list_head list;
	int nodeid;
	int stopped;
	int started;
	};

	/*
	 * Recovery bookkeeping for one node id.  NOTE(review): entries
	 * presumably holds struct recovery_entry items, and the two update
	 * flags presumably record that cman and cpg have each reported the
	 * node - confirm in app.c.
	 */
	struct recovery_set {
	struct list_head list;
	struct list_head entries;
	int nodeid;
	int cman_update;
	int cpg_update;
	};

	/* One group tracked for recovery, with a flag marking it recovered. */
	struct recovery_entry {
	struct list_head list;
	group_t *group;
	int recovered;
	};


	/* app.c */
	void add_recovery_set_cman(int nodeid);
	struct recovery_set *add_recovery_set_cpg(int nodeid, int procdown);
	int queue_app_recover(group_t *g, int nodeid);
	int queue_app_join(group_t *g, int nodeid);
	int queue_app_leave(group_t *g, int nodeid);
	int queue_app_message(group_t *g, struct save_msg *save);
	int do_stopdone(char *name, int level);
	int do_startdone(char *name, int level, int event_nr);
	/* returns a printable name for the event's state (used with %s) */
	char *ev_state_str(event_t *ev);
	event_t *find_queued_recover_event(group_t *g);
	void extend_recover_event(group_t *g, event_t *ev, int nodeid);
	int process_apps(void);
	void del_event_nodes(event_t *ev);
	void dump_group(group_t *g);
	void dump_all_groups(void);
	node_t *find_app_node(app_t *a, int nodeid);
	int event_state_stopping(app_t *a);
	int event_state_starting(app_t *a);
	void msg_bswap_out(msg_t *msg);
	void msg_bswap_in(msg_t *msg);
	struct recovery_set *get_recovery_set(int nodeid);
	void groupd_down(int nodeid);
	char *msg_type(int type);
	int process_app(group_t *g);
	int is_our_join(event_t *ev);
	void purge_node_messages(group_t *g, int nodeid);

	/* main.c */
	void read_ccs_name(char *path, char *name);
	void read_ccs_yesno(char *path, int *yes, int *no);
	void read_ccs_int(char *path, int *config_val);
	void app_stop(app_t *a);
	void app_setid(app_t *a);
	void app_start(app_t *a);
	void app_finish(app_t *a);
	void app_terminate(app_t *a);
	void app_deliver(app_t *a, struct save_msg *save);
	/* workfn and deadfn are callbacks that receive a client index */
	int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
	void client_dead(int ci);
	void cluster_dead(int ci);

	/* cman.c */
	int setup_cman(void);
	void close_cman(void);
	void process_cman(int ci);
	int kill_cman(int nodeid);
	-int set_cman_dirty(void);

	/* cpg.c */
	int setup_cpg(void);
	int do_cpg_join(group_t *g);
	int do_cpg_leave(group_t *g);
	int send_message(group_t *g, void *buf, int len);
	int send_message_groupd(group_t *g, void *buf, int len, int type);
	void copy_groupd_data(group_data_t *data);
	int in_groupd_cpg(int nodeid);
	void group_mode_check_timeout(void);

	/* joinleave.c */
	void remove_group(group_t *g);
	int do_join(char *name, int level, int ci);
	int do_leave(char *name, int level);
	node_t *new_node(int nodeid);
	group_t *find_group_level(char *name, int level);
	/* on success *g_out receives the newly created group */
	int create_group(char *name, int level, group_t **g_out);
	app_t *create_app(group_t *g);

	/* logging.c */

	void init_logging(void);
	/* (void) makes this a real prototype, matching its neighbours */
	void setup_logging(void);
	void close_logging(void);

	#endif /* __GD_INTERNAL_DOT_H__ */

	diff --git a/group/daemon/joinleave.c b/group/daemon/joinleave.c
	index 7148f167a..416939223 100644
	--- a/group/daemon/joinleave.c
	+++ b/group/daemon/joinleave.c
	@@ -1,168 +1,164 @@

	/* Initiate join/leave requests from apps */

	#include "gd_internal.h"


	group_t find_group_level(char name, int level)
	{
	group_t *g;

	list_for_each_entry(g, &gd_levels[level], level_list) {
	if (!strcmp(g->name, name))
	return g;
	}
	return NULL;
	}

	/*
	 * Allocate a zeroed group for (level, name), link it on gd_groups and on
	 * gd_levels[level], and hand it back through *g_out.
	 *
	 * Returns 0 on success, -EINVAL when level is outside 0..MAX_LEVELS-1,
	 * -ENAMETOOLONG when name does not fit in g->name (MAX_NAMELEN chars),
	 * or -ENOMEM when the allocation fails.  *g_out is written only on
	 * success.
	 */
	int create_group(char *name, int level, group_t **g_out)
	{
		group_t *g;
		size_t len;

		if (level < 0 || level >= MAX_LEVELS)
			return -EINVAL;

		/* g->name holds MAX_NAMELEN characters plus the NUL */
		len = strlen(name);
		if (len > MAX_NAMELEN)
			return -ENAMETOOLONG;

		g = malloc(sizeof(*g));
		if (!g)
			return -ENOMEM;

		memset(g, 0, sizeof(*g));

		strcpy(g->name, name);
		g->namelen = len;
		g->level = level;
		INIT_LIST_HEAD(&g->memb);
		INIT_LIST_HEAD(&g->messages);

		list_add_tail(&g->list, &gd_groups);
		list_add_tail(&g->level_list, &gd_levels[level]);

		*g_out = g;
		return 0;
	}

	/* Unlink and free every node on the group's memb list. */
	void free_group_memb(group_t *g)
	{
		node_t *node, *n;

		/* _safe variant: the current entry is freed inside the loop */
		list_for_each_entry_safe(node, n, &g->memb, list) {
			list_del(&node->list);
			free(node);
		}
	}

	/*
	 * Unlink a group from gd_groups and from its level list, free its
	 * member nodes and free the group itself.  g must not be used afterwards.
	 * NOTE(review): g->app and g->messages are not released here -
	 * presumably handled by the caller; confirm.
	 */
	void remove_group(group_t *g)
	{
	list_del(&g->list);
	list_del(&g->level_list);
	free_group_memb(g);
	free(g);
	}

	/*
	 * Allocate a zeroed app for group g, initialise its node and event
	 * lists and link the two together (a->g and g->app).  Returns the app,
	 * or NULL when the allocation fails, in which case g is left untouched.
	 */
	app_t *create_app(group_t *g)
	{
		app_t *a;

		a = malloc(sizeof(app_t));
		if (!a)
			return NULL;
		memset(a, 0, sizeof(app_t));

		a->need_first_event = 1;
		INIT_LIST_HEAD(&a->nodes);
		INIT_LIST_HEAD(&a->events);
		a->g = g;
		g->app = a;

		return a;
	}

	/*
	 * Handle a join request from client ci for group (level, name).
	 *
	 * Creates the group and its app, marks the group as joining and starts
	 * the cpg join.  Returns -EEXIST when the group already exists, the
	 * create_group() error, -ENOMEM when the app cannot be allocated, or
	 * otherwise the do_cpg_join() result.
	 */
	int do_join(char *name, int level, int ci)
	{
		group_t *g;
		app_t *a;
		int rv;

		g = find_group_level(name, level);
		if (g) {
			log_group(g, "%d:%s can't join existing group", level, name);
			rv = -EEXIST;
			goto out;
		}

	- rv = set_cman_dirty();
	- if (rv)
	- goto out;
	-
		rv = create_group(name, level, &g);
		if (rv)
			goto out;

		a = create_app(g);
		if (!a) {
			/*
			 * Take the half-built group back off the lists;
			 * otherwise it would linger without an app and make
			 * every later join of this name fail with -EEXIST.
			 */
			remove_group(g);
			rv = -ENOMEM;
			goto out;
		}

		a->client = ci;

		log_debug("%d:%s got join", level, name);
		g->joining = 1;
		rv = do_cpg_join(g);
	out:
		return rv;
	}

	/*
	 * Handle a leave request for group (level, name).
	 *
	 * Returns -ENOENT when the group does not exist, -EINVAL when it has no
	 * app, -EAGAIN while our own join is still in progress or our own event
	 * is the current one, and -EBUSY when a leave is already under way.
	 * Otherwise the group is marked as leaving and the do_cpg_leave()
	 * result is returned.
	 */
	int do_leave(char *name, int level)
	{
		group_t *g = find_group_level(name, level);
		event_t *ev;

		if (!g)
			return -ENOENT;

		if (!g->app) {
			log_group(g, "leave: no app");
			return -EINVAL;
		}

		if (g->joining) {
			log_error(g, "leave: still joining");
			return -EAGAIN;
		}

		if (g->leaving) {
			log_error(g, "leave: already leaving");
			return -EBUSY;
		}

		/* refuse while an event for our own node is being processed */
		ev = g->app->current_event;
		if (ev && ev->nodeid == our_nodeid) {
			log_error(g, "leave: busy event %llx state %s",
				  (unsigned long long)ev->id,
				  ev_state_str(ev));
			return -EAGAIN;
		}

		/* queued events are only reported; none should be ours */
		list_for_each_entry(ev, &g->app->events, list) {
			ASSERT(ev->nodeid != our_nodeid);
			log_group(g, "do_leave: found queued event id %llx",
				  (unsigned long long)ev->id);
		}

		log_debug("%d:%s got leave", level, name);
		g->leaving = 1;

		return do_cpg_leave(g);
	}

	/*
	 * Allocate a zeroed node_t for nodeid.  Returns the new node, which the
	 * caller owns, or NULL when the allocation fails (the result was
	 * previously passed to memset() unchecked).
	 */
	node_t *new_node(int nodeid)
	{
		node_t *node;

		node = malloc(sizeof(*node));
		if (!node)
			return NULL;

		memset(node, 0, sizeof(*node));
		node->nodeid = nodeid;
		return node;
	}

File Metadata

Mime Type: text/x-diff
Expires: Sat, Nov 23, 11:07 AM (1 d, 18 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1018639
Default Alt Text: (92 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions