No OneTemporary
Actions

Size

193 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/gfs2/mount/util.c b/gfs2/mount/util.c
	index 48b0b39e7..404cbf167 100644
	--- a/gfs2/mount/util.c
	+++ b/gfs2/mount/util.c
	@@ -1,558 +1,565 @@
	#include "util.h"
	#include "libgfscontrol.h"

	extern char *prog_name;
	extern char *fsname;
	extern int verbose;

	static int gfs_controld_fd;

	/* opt_map stuff from util-linux */

	/*
	 * One row of the mount-option table: maps an option name to the MS_* bit
	 * it controls.  The name must be a pointer because the table rows are
	 * initialised with string literals and compared with strcmp().
	 */
	struct opt_map {
		char *opt;	/* option name */
		int skip;	/* skip in mtab option string (gfs: not used) */
		int inv;	/* true if flag value should be inverted */
		int mask;	/* flag mask value */
	};

	/*
	 * Table of recognised mount options.  Columns: name, skip, inv, mask.
	 * A row with inv set clears its mask bits instead of setting them.
	 * The row with a NULL name terminates the table.
	 */
	static struct opt_map opt_map[] = {
		/* options that correspond to an MS_* flag */
		{ "defaults",   0, 0, 0 },		/* no flag change */
		{ "ro",         1, 0, MS_RDONLY },	/* mount read-only */
		{ "rw",         1, 1, MS_RDONLY },	/* mount read-write */
		{ "exec",       0, 1, MS_NOEXEC },	/* binaries may be executed */
		{ "noexec",     0, 0, MS_NOEXEC },	/* binaries may not be executed */
		{ "suid",       0, 1, MS_NOSUID },	/* suid bits are honored */
		{ "nosuid",     0, 0, MS_NOSUID },	/* suid bits are ignored */
		{ "dev",        0, 1, MS_NODEV },	/* device files are interpreted */
		{ "nodev",      0, 0, MS_NODEV },	/* device files are not interpreted */
		{ "sync",       0, 0, MS_SYNCHRONOUS },	/* synchronous I/O */
		{ "async",      0, 1, MS_SYNCHRONOUS },	/* asynchronous I/O */
		{ "remount",    0, 0, MS_REMOUNT },	/* change flags of a mounted fs */
		{ "bind",       0, 0, MS_BIND },	/* remount part of the tree elsewhere */
		{ "mand",       0, 0, MS_MANDLOCK },	/* mandatory locks allowed */
		{ "nomand",     0, 1, MS_MANDLOCK },	/* mandatory locks forbidden */
		{ "atime",      0, 1, MS_NOATIME },	/* access times are updated */
		{ "noatime",    0, 0, MS_NOATIME },	/* access times are not updated */
		{ "diratime",   0, 1, MS_NODIRATIME },	/* dir access times are updated */
		{ "nodiratime", 0, 0, MS_NODIRATIME },	/* dir access times are not updated */

		/* options handled by the mount command itself (no MS_* bit) */
		{ "dirsync",    0, 0, 0 },		/* synchronous directory changes */
		{ "loop",       1, 0, 0 },		/* use a loop device */
		{ "auto",       0, 1, 0 },		/* may be mounted with -a */
		{ "noauto",     0, 0, 0 },		/* must be mounted explicitly */
		{ "users",      0, 0, 0 },		/* ordinary users may mount */
		{ "nousers",    0, 1, 0 },		/* ordinary users may not mount */
		{ "user",       0, 0, 0 },		/* ordinary user may mount */
		{ "nouser",     0, 1, 0 },		/* ordinary user may not mount */
		{ "owner",      0, 0, 0 },		/* device owner may mount */
		{ "noowner",    0, 1, 0 },		/* device owner has no special privs */
		{ "_netdev",    0, 0, 0 },		/* needs a network device (netfs) */
		{ NULL,         0, 0, 0 }
	};

	/*
	 * If option name o appears in opt_map, apply its mask to *flags (clear
	 * the bits for inverted entries, set them otherwise) and return 1.
	 * Return 0 when the name is not in the table.
	 */

	static int set_flag(char *o, int *flags)
	{
		struct opt_map *om;

		for (om = opt_map; om->opt; om++) {
			if (strcmp(om->opt, o))
				continue;

			if (om->inv)
				*flags &= ~om->mask;
			else
				*flags |= om->mask;

			log_debug(" %s flag %x for \"%s\", flags = %x",
				  om->inv ? "clear" : "set", om->mask, om->opt, *flags);

			return 1;
		}

		return 0;
	}

	/*
	 * Split mo->opts on commas.  Options known to opt_map update mo->flags;
	 * an option starting with "hostdata" is stored in mo->hostdata; every
	 * other option is appended to the comma-separated mo->extra string.
	 * The values of lockproto= and locktable= are additionally copied into
	 * mo->lockproto and mo->locktable for later use.
	 */

	void parse_opts(struct mount_options *mo)
	{
		char data[PATH_MAX+1];
		char *options, *o, *v;
		int extra_len = 0;

		log_debug("parse_opts: opts = \"%s\"", mo->opts);

		/* work on a private copy: strsep() writes NULs into the string */
		memset(data, 0, sizeof(data));
		strncpy(data, mo->opts, PATH_MAX);

		for (options = data; (o = strsep(&options, ",")); ) {
			if (!*o)
				continue;

			if (set_flag(o, &mo->flags))
				continue;

			if (!strncmp("hostdata", o, 8)) {
				if (mo->hostdata[0])
					warn("duplicate hostdata strings");
				else
					strcat(mo->hostdata, o);
				continue;
			}

			if (extra_len + 1 + strlen(o) > PATH_MAX)
				die("extra options string is too long\n");

			if (mo->extra[0]) {
				strcat(mo->extra, ",");
				extra_len += 1;
			}

			log_debug(" add extra %s", o);

			strcat(mo->extra, o);
			extra_len += strlen(o);

			/* split "name=value" in place; v is NULL without '=' */
			v = strchr(o, '=');
			if (v)
				*v++ = 0;

			/* we grab these now so we don't have to parse them out
			   again later when doing proto-specific stuff */

			if (!strcmp(o, "lockproto")) {
				if (!v)
					die("option lockproto needs value\n");
				strncpy(mo->lockproto, v, 255);
			}

			if (!strcmp(o, "locktable")) {
				if (!v)
					die("option locktable needs value\n");
				strncpy(mo->locktable, v, 255);
			}
		}

		log_debug("parse_opts: flags = %x", mo->flags);
		log_debug("parse_opts: extra = \"%s\"", mo->extra);
		log_debug("parse_opts: hostdata = \"%s\"", mo->hostdata);
		log_debug("parse_opts: lockproto = \"%s\"", mo->lockproto);
		log_debug("parse_opts: locktable = \"%s\"", mo->locktable);
	}

	/*
	 * Scan /proc/mounts for the entry mounted on mo->dir and copy its device,
	 * option string and full line into mo.
	 * - umount: mo->dev is empty, so every line for the directory matches and
	 *   the last one wins (the top-most fs, which umount(2) will remove).
	 * - mount: mo->dev is set and is part of the comparison, which separates
	 *   several filesystems stacked on the same mountpoint.
	 * Dies if no entry is found or a matching entry is not of type fsname.
	 */

	void read_proc_mounts(struct mount_options *mo)
	{
		char cur_line[PATH_MAX], cur_dev[PATH_MAX], cur_dir[PATH_MAX];
		char cur_type[PATH_MAX], cur_opts[PATH_MAX];
		char best_line[PATH_MAX], best_dev[PATH_MAX], best_opts[PATH_MAX];
		int have_match = 0;
		FILE *mounts;

		mounts = fopen("/proc/mounts", "r");
		if (!mounts)
			die("can't open /proc/mounts: %s\n", strerror(errno));

		while (fgets(cur_line, PATH_MAX, mounts)) {
			/* skip malformed lines and lines for another dir/device */
			if (sscanf(cur_line, "%s %s %s %s",
				   cur_dev, cur_dir, cur_type, cur_opts) != 4 ||
			    strcmp(cur_dir, mo->dir) ||
			    (mo->dev[0] && strcmp(cur_dev, mo->dev)))
				continue;

			if (strcmp(cur_type, fsname))
				die("%s is not a %s filesystem\n", mo->dir, fsname);

			/* remember this line; a later match overwrites it */
			strncpy(best_dev, cur_dev, PATH_MAX);
			strncpy(best_opts, cur_opts, PATH_MAX);
			strncpy(best_line, cur_line, PATH_MAX);
			have_match = 1;
		}

		fclose(mounts);

		if (have_match) {
			strncpy(mo->dev, best_dev, PATH_MAX);
			strncpy(mo->opts, best_opts, PATH_MAX);
			strncpy(mo->proc_entry, best_line, PATH_MAX);
		} else {
			die("can't find /proc/mounts entry for directory %s\n", mo->dir);
		}

		log_debug("read_proc_mounts: device = \"%s\"", mo->dev);
		log_debug("read_proc_mounts: opts = \"%s\"", mo->opts);
	}

	void gfs2_inum_in(struct gfs2_inum no, char buf)
	{
	struct gfs2_inum str = (struct gfs2_inum )buf;

	no->no_formal_ino = be64_to_cpu(str->no_formal_ino);
	no->no_addr = be64_to_cpu(str->no_addr);
	}

	void gfs2_meta_header_in(struct gfs2_meta_header mh, char buf)
	{
	struct gfs2_meta_header str = (struct gfs2_meta_header )buf;

	mh->mh_magic = be32_to_cpu(str->mh_magic);
	mh->mh_type = be32_to_cpu(str->mh_type);
	mh->mh_format = be32_to_cpu(str->mh_format);
	}

	void gfs2_sb_in(struct gfs2_sb sb, char buf)
	{
	struct gfs2_sb str = (struct gfs2_sb )buf;

	gfs2_meta_header_in(&sb->sb_header, buf);

	sb->sb_fs_format = be32_to_cpu(str->sb_fs_format);
	sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format);
	sb->sb_bsize = be32_to_cpu(str->sb_bsize);
	sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift);

	gfs2_inum_in(&sb->sb_master_dir, (char *)&str->sb_master_dir);
	gfs2_inum_in(&sb->sb_root_dir, (char *)&str->sb_root_dir);

	memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN);
	memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN);
	}

	int get_sb(char device, struct gen_sb sb_out)
	{
	int fd;

	fd = open(device, O_RDONLY);
	if (fd < 0)
	die("can't open %s: %s\n", device, strerror(errno));

	if (!strcmp(fsname, "gfs2")) {
	char buf[GFS2_BASIC_BLOCK];
	struct gfs2_sb sb;

	do_lseek(fd, GFS2_SB_ADDR * GFS2_BASIC_BLOCK);
	do_read(fd, buf, GFS2_BASIC_BLOCK);
	gfs2_sb_in(&sb, buf);

	if (sb.sb_header.mh_magic != GFS2_MAGIC \|\|
	sb.sb_header.mh_type != GFS2_METATYPE_SB) {
	die("there isn't a GFS2 filesystem on %s, "
	"magic=%x type=%x\n", device,
	sb.sb_header.mh_magic, sb.sb_header.mh_type);
	}

	if (sb.sb_fs_format != GFS2_FORMAT_FS \|\|
	sb.sb_multihost_format != GFS2_FORMAT_MULTI) {
	die("there appears to be a GFS, not GFS2, filesystem "
	"on %s\n", device);
	}

	strncpy(sb_out->lockproto, sb.sb_lockproto, 256);
	strncpy(sb_out->locktable, sb.sb_locktable, 256);

	} else if (!strcmp(fsname, "gfs")) {
	char buf[GFS_BASIC_BLOCK];
	struct gfs_sb sb;

	do_lseek(fd, GFS_SB_ADDR * GFS_BASIC_BLOCK);
	do_read(fd, buf, GFS2_BASIC_BLOCK);
	gfs_sb_in(&sb, buf);

	if (sb.sb_header.mh_magic != GFS_MAGIC \|\|
	sb.sb_header.mh_type != GFS_METATYPE_SB) {
	die("there isn't a GFS filesystem on %s\n", device);
	}

	if (sb.sb_fs_format != GFS_FORMAT_FS \|\|
	sb.sb_multihost_format != GFS_FORMAT_MULTI) {
	die("there appears to be a GFS2, not GFS, filesystem "
	"on %s\n", device);
	}

	strncpy(sb_out->lockproto, sb.sb_lockproto, 256);
	strncpy(sb_out->locktable, sb.sb_locktable, 256);
	}

	close(fd);
	return 0;
	}

	char select_lockproto(struct mount_options mo, struct gen_sb *sb)
	{
	/* find the effective lockproto, proto specified in mount options
	overrides the sb lockproto */

	if (mo->lockproto[0])
	return mo->lockproto;
	else
	return sb->lockproto;
	}

	int lock_dlm_join(struct mount_options mo, struct gen_sb sb)
	{
	struct gfsc_mount_args ma;
	int fd, rv, result;

	memset(&ma, 0, sizeof(ma));

	strncpy(ma.dir, mo->dir, PATH_MAX);
	strncpy(ma.type, fsname, PATH_MAX);
	strncpy(ma.proto, "lock_dlm", PATH_MAX);
	strncpy(ma.options, mo->opts, PATH_MAX);
	strncpy(ma.dev, mo->dev, PATH_MAX);
	if (mo->locktable[0])
	strncpy(ma.table, mo->locktable, PATH_MAX);
	else
	strncpy(ma.table, sb->locktable, PATH_MAX);

	fd = gfsc_fs_connect();
	if (fd < 0) {
	warn("gfs_controld join connect error: %s", strerror(errno));
	return fd;
	}

	/* tell gfs_controld to join the mountgroup */

	rv = gfsc_fs_join(fd, &ma);
	if (rv < 0) {
	warn("gfs_controld join write error: %s", strerror(errno));
	goto fail;
	}

	/* read the result of the join from gfs_controld */

	rv = gfsc_fs_result(fd, &result, &ma);
	if (rv < 0) {
	warn("gfs_controld result read error: %s", strerror(errno));
	goto fail;
	}

	rv = result;

	switch (rv) {
	case 0:
	case -EALREADY:
	break;

	case -EPROTONOSUPPORT:
	warn("lockproto not supported");
	goto fail;

	case -EOPNOTSUPP:
	warn("jid, first and id are reserved options");
	goto fail;

	case -EBADFD:
	warn("no colon found in table name");
	goto fail;

	case -ENAMETOOLONG:
	warn("fs name too long");
	goto fail;

	case -ESTALE:
	warn("fs is being unmounted");
	goto fail;

	case -EADDRINUSE:
	warn("different fs appears to exist with the same name");
	goto fail;

	case -EBUSY:
	warn("mount point already used or other mount in progress");
	goto fail;

	case -ENOMEM:
	warn("out of memory");
	goto fail;

	case -EBADR:
	warn("fs is for a different cluster");
	goto fail;

	case -ENOANO:
	warn("node not a member of the default fence domain");
	goto fail;

	case -EROFS:
	warn("read-only mount invalid with spectator option");
	goto fail;

	case -EMLINK:
	warn("option string too long");
	goto fail;

	default:
	warn("gfs_controld join error: %d", rv);
	goto fail;
	}

	/*
	* In addition to the result, gfs_controld also returns
	* "hostdata=jid=X:id=Y:first=Z" in ma.hostdata.
	* This is first combined with any hostdata the user gave on
	* the command line and then the full hostdata is combined
	* with the "extra" mount otions into the "extra_plus" string.
	*/

	if (strlen(mo->hostdata) + strlen(ma.hostdata) + 1 > PATH_MAX) {
	warn("hostdata too long");
	rv = -1;
	goto fail;
	}

	if (!mo->hostdata[0])
	snprintf(mo->hostdata, PATH_MAX, "%s", ma.hostdata);
	else {
	char *p = strstr(ma.hostdata, "=") + 1;
	strcat(mo->hostdata, ":");
	strcat(mo->hostdata, p);
	}

	log_debug("lock_dlm_join: hostdata: \"%s\"", mo->hostdata);

	if (strlen(mo->extra) == 0)
	snprintf(mo->extra_plus, PATH_MAX, "%s", mo->hostdata);
	else
	snprintf(mo->extra_plus, PATH_MAX, "%s,%s",
	mo->extra, mo->hostdata);

	/* keep gfs_controld connection open and reuse it below to
	send the result of mount(2) to gfs_controld, except in
	the case of another mount (EALREADY) */

	if (rv == -EALREADY)
	gfsc_fs_disconnect(fd);
	else
	gfs_controld_fd = fd;

	return 0;

	fail:
	gfsc_fs_disconnect(fd);
	return rv;
	}

	void lock_dlm_mount_done(struct mount_options mo, struct gen_sb sb,
	int result)
	{
	struct gfsc_mount_args ma;
	int rv;

	if (!gfs_controld_fd)
	return;

	memset(&ma, 0, sizeof(ma));

	strncpy(ma.dir, mo->dir, PATH_MAX);
	strncpy(ma.type, fsname, PATH_MAX);
	strncpy(ma.proto, "lock_dlm", PATH_MAX);
	strncpy(ma.options, mo->opts, PATH_MAX);
	strncpy(ma.dev, mo->dev, PATH_MAX);
	if (mo->locktable[0])
	strncpy(ma.table, mo->locktable, PATH_MAX);
	else
	strncpy(ma.table, sb->locktable, PATH_MAX);

	/* tell gfs_controld the result of mount(2) */

	rv = gfsc_fs_mount_done(gfs_controld_fd, &ma, result);
	if (rv)
	warn("gfs_controld mount_done write error: %s", strerror(errno));

	gfsc_fs_disconnect(gfs_controld_fd);
	}

	int lock_dlm_leave(struct mount_options mo, struct gen_sb sb, int mnterr)
	{
	struct gfsc_mount_args ma;
	int rv;

	memset(&ma, 0, sizeof(ma));

	strncpy(ma.dir, mo->dir, PATH_MAX);
	strncpy(ma.type, fsname, PATH_MAX);
	if (mo->locktable[0])
	strncpy(ma.table, mo->locktable, PATH_MAX);
	else
	strncpy(ma.table, sb->locktable, PATH_MAX);

	rv = gfsc_fs_leave(&ma, mnterr);
	if (rv)
	warn("leave: gfs_controld leave error: %s", strerror(errno));

	return rv;
	}

	int lock_dlm_remount(struct mount_options mo, struct gen_sb sb)
	{
	struct gfsc_mount_args ma;
	char *mode;
	int fd, rv, result;

	memset(&ma, 0, sizeof(ma));

	- /* FIXME: how to check for spectator remounts, we want
	- to disallow remount to/from spectator */
	+ if (strstr(mo->extra, "spectator")) {
	+ warn("spectator remounts not allowed");
	+ return -1;
	+ }

	if (mo->flags & MS_RDONLY)
	mode = "ro";
	else
	mode = "rw";

	strncpy(ma.dir, mo->dir, PATH_MAX);
	strncpy(ma.type, fsname, PATH_MAX);
	strncpy(ma.options, mode, PATH_MAX);
	+ if (mo->locktable[0])
	+ strncpy(ma.table, mo->locktable, PATH_MAX);
	+ else
	+ strncpy(ma.table, sb->locktable, PATH_MAX);
	+

	fd = gfsc_fs_connect();
	if (fd < 0) {
	warn("gfs_controld remount connect error: %s", strerror(errno));
	return fd;
	}

	/* tell gfs_controld about the new mount options */

	rv = gfsc_fs_remount(fd, &ma);
	if (rv) {
	warn("gfs_controld remount write error: %s", strerror(errno));
	goto out;
	}

	/* read the result of the remount from gfs_controld */

	rv = gfsc_fs_result(fd, &result, &ma);
	if (rv < 0) {
	warn("gfs_controld result read error: %s", strerror(errno));
	goto out;
	}

	rv = result;
	if (rv)
	warn("remount not allowed from gfs_controld");
	out:
	gfsc_fs_disconnect(fd);
	return rv;
	}

	diff --git a/group/gfs_controld/cpg-new.c b/group/gfs_controld/cpg-new.c
	index cfe0cf66f..fdc58bada 100644
	--- a/group/gfs_controld/cpg-new.c
	+++ b/group/gfs_controld/cpg-new.c
	@@ -1,2644 +1,2681 @@
	/******************************************************************************
	*******************************************************************************
	**
	** Copyright (C) 2008 Red Hat, Inc. All rights reserved.
	**
	** This copyrighted material is made available to anyone wishing to use,
	** modify, copy, or redistribute it subject to the terms and conditions
	** of the GNU General Public License v.2.
	**
	*******************************************************************************
	******************************************************************************/

	#include "gfs_daemon.h"
	#include "config.h"
	#include "libdlmcontrol.h"

	#define MAX_JOURNALS 256

	uint32_t cpgname_to_crc(const char *data, int len);

	/* copied into gfs_header.version[] of each message by gfs_send_message() */
	static unsigned int protocol_active[3] = {1, 0, 0};
	/* descriptor passed to dlmc_fs_result() and dlmc_fs_notified() */
	static int dlmcontrol_fd;

	/* gfs_header types */
	/* carried in gfs_header.type; msg_name() maps each value to a log string */
	enum {
	GFS_MSG_START = 1,
	GFS_MSG_MOUNT_DONE = 2,
	GFS_MSG_FIRST_RECOVERY_DONE = 3,
	GFS_MSG_RECOVERY_RESULT = 4,
	+ GFS_MSG_REMOUNT = 5,
	};

	/* gfs_header flags */
	#define GFS_MFLG_JOINING 1 /* accompanies start, we are joining */

	/*
	 * Header at the start of every message buffer handed to
	 * gfs_send_message(), which fills in version, nodeid and global_id and
	 * converts the multi-byte fields below (except the pads) to little-endian.
	 */
	struct gfs_header {
	uint16_t version[3];
	uint16_t type; /* GFS_MSG_ */
	uint32_t nodeid; /* sender */
	uint32_t to_nodeid; /* recipient, 0 for all */
	uint32_t global_id; /* global unique id for this lockspace */
	uint32_t flags; /* GFS_MFLG_ */
	uint32_t msgdata; /* in-header payload depends on MSG type */
	uint32_t pad1;
	uint64_t pad2;
	};

	/* mg_info and id_info: for syncing state in start message */

	/*
	 * In a start message the mg_info follows the gfs_header, and the id_info
	 * array begins mg_info_size bytes after the header (see get_id_list()).
	 */
	struct mg_info {
	uint32_t mg_info_size; /* offset from end of header to the id_info array */
	uint32_t id_info_size; /* stride between id_info entries */
	uint32_t id_info_count; /* number of id_info entries */

	uint32_t started_count; /* non-zero for "old" nodes; get_id_list() skips zero */

	int member_count;
	int joined_count;
	int remove_count;
	int failed_count;

	int first_recovery_needed;
	int first_recovery_master;
	};

	/* bits for id_info.flags */
	#define IDI_NODEID_IS_MEMBER 0x00000001
	#define IDI_JID_NEEDS_RECOVERY 0x00000002
	#define IDI_MOUNT_DONE 0x00000008
	#define IDI_MOUNT_ERROR 0x00000010
	#define IDI_MOUNT_RO 0x00000020
	#define IDI_MOUNT_SPECTATOR 0x00000040

	/* per-node entry of a start message; looked up by nodeid in get_id_struct() */
	struct id_info {
	int nodeid;
	int jid;
	uint32_t flags; /* IDI_ */
	};

	/* NOTE(review): presumably the jid value meaning "no journal" - not used in the visible code, confirm */
	#define JID_NONE -1

	/* entry on mg->journals; found by jid (find_journal) or nodeid (find_journal_by_nodeid) */
	struct journal {
	struct list_head list;
	int jid;
	int nodeid;
	int failed_nodeid;
	int needs_recovery;

	int local_recovery_busy;
	int local_recovery_done;
	int local_recovery_result;
	int failed_recovery_count;
	};

	/* entry on mg->node_history, created by node_history_init() and looked up by nodeid */
	struct node {
	struct list_head list;
	int nodeid;
	int jid;
	int ro;
	int spectator;
	int kernel_mount_done;
	int kernel_mount_error;

	int check_dlm; /* set by node_history_fail(), cleared in check_dlm_notify_done() */
	int dlm_notify_callback; /* set in process_dlmcontrol() when a notify result arrives */
	int dlm_notify_result; /* result stored with that callback; 0 clears check_dlm */

	int failed_reason; /* for queries */
	uint32_t added_seq; /* seq of the change that added the node, for queries */
	uint32_t removed_seq; /* seq of the change that removed the node, for queries */
	uint64_t add_time; /* time(NULL) from node_history_start(); 0 after init/left */
	};

	/*
	 * One member of a change (on cg->members or cg->removed).  start_msg is
	 * heap storage released by free_cg(); mg_info is dereferenced with ->
	 * elsewhere in the file, so both must be pointers.
	 */
	struct member {
		struct list_head list;
		int nodeid;
		int start;	/* 1 if we received a start message for this change */
		int added;	/* 1 if added by this change */
		int failed;	/* 1 if failed in this change */
		int disallowed;
		char *start_msg;	/* full copy of the start message from this node */
		struct mg_info *mg_info;	/* shortcut into start_msg */
	};

	/* One of these change structs is created for every confchg a cpg gets. */

	/* NOTE(review): presumably the values of change.state - confirm, not used in the visible code */
	#define CGST_WAIT_CONDITIONS 1
	#define CGST_WAIT_MESSAGES 2

	struct change {
	struct list_head list; /* links the change into mg->changes */
	struct list_head members; /* struct member entries, searched by find_memb() */
	struct list_head removed; /* nodes removed by this change */
	struct list_head saved_messages; /* saved messages */
	int member_count;
	int joined_count;
	int remove_count;
	int failed_count; /* non-zero makes nodes_failed() return 1 */
	int state;
	int we_joined;
	uint32_t seq; /* used as a reference for debugging, and for queries */
	uint32_t combined_seq; /* for queries */
	};

	/*
	 * A saved message kept on a change's saved_messages list.  The message
	 * bytes trail the struct; a C99 flexible array member replaces the GNU
	 * zero-length array so the declaration is standard C.
	 */
	struct save_msg {
		struct list_head list;
		int len;
		char buf[];
	};

	/*
	cpg confchg's arrive telling us that mountgroup members have
	joined/left/failed. A "change" struct is created for each confchg,
	and added to the mg->changes list.

	apply_changes()
	---------------

	<a new node won't know whether first_recovery_needed or not, but it also
	won't have any conditions to wait for, so a new node will go directly to
	sending out start message regardless>

	if first_recovery_needed,
	(or new, where new is not having completed a start barrier yet)
	all nodes: skip wait conditions
	all nodes: send start message

	else !first_recovery_needed,
	all nodes: if failures in changes, wait for conditions:
	local mount to complete if in progress, stop_kernel, dlm_notified
	all nodes: send start message

	<new changes that arrive result in going back to beginning; start messages
	from this aborted start cycle will be ignored>

	all nodes: wait for all start messages

	<once all start messages are received, new changes will be handled in a
	new batch after all current changes are cleared at end of sync_state>

	if start cycle / start barrier completes (start messages received from
	all nodes without being interrupted by a change), go on to sync_state
	which puts all members (as defined by the most recent change) in sync.

	"old nodes" are nodes that have completed a start cycle before (have
	a non-zero started_count), and "new nodes" are nodes that have not
	completed a start cycle before (they are being added by one of the
	changes in this start cycle)

	sync_state()
	------------

	if old nodes have first_recovery_needed, or all nodes are new
	all nodes: mg->first_recovery_needed = 1
	all nodes: mg->first_recovery_master = prev or new low nodeid
	new nodes: instantiate existing state to match old nodes
	old nodes: update state per the changes in the completed start cycle
	all nodes: assign jids to new members
	all nodes: clear all change structs

	else !first_recovery_needed,
	new nodes: instantiate existing state to match old nodes
	old nodes: update state per the changes in the completed start cycle
	all nodes: assign jids to new members
	all nodes: clear all change structs

	<new changes that arrive from here on result in going back to the top>

	recover_and_start()
	-------------------

	if first_recovery_needed,
	master: tells mount to run with first=1 (if not already)
	all nodes: wait for first_recovery_done message
	master: sends first_recovery_done message when mount is done
	all nodes: mg->first_recovery_needed = 0
	all nodes: start kernel / tell mount.gfs to mount(2) (master already did)
	all nodes: send message with result of kernel mount

	else !first_recovery_needed,
	all nodes: if there are no journals to recover, goto start kernel
	old nodes: tell kernel to recover jids, send message with each result
	all nodes: wait for all recoveries to be done
	all nodes: start kernel
	new nodes: tell mount.gfs to mount(2)
	new nodes: send message with result of kernel mount

	[If no one can recover some journal(s), all will be left waiting, unstarted.
	A new change from a new mount will result in things going back to the top,
	and hopefully the new node will be successful at doing the journal
	recoveries when it comes through the recover_and_start() section, which
	would let everyone start again.]
	*/

	static void process_mountgroup(struct mountgroup *mg);

	/* Return a constant string naming a GFS_MSG_ type for log output; "unknown" for any other value. */
	static char *msg_name(int type)
	{
	switch (type) {
	case GFS_MSG_START:
	return "start";
	case GFS_MSG_MOUNT_DONE:
	return "mount_done";
	case GFS_MSG_FIRST_RECOVERY_DONE:
	return "first_recovery_done";
	case GFS_MSG_RECOVERY_RESULT:
	return "recovery_result";
	+ case GFS_MSG_REMOUNT:
	+ return "remount";
	default:
	return "unknown";
	}
	}

	/*
	 * Multicast len bytes at buf on cpg handle h.  While cpg reports
	 * CPG_ERR_TRY_AGAIN the call is repeated after a 1 ms sleep, with an
	 * error logged on every 100th repeat.  type is used only to name the
	 * message in log output.  Returns 0 on CPG_OK, -1 on any other error.
	 */
	static int _send_message(cpg_handle_t h, void *buf, int len, int type)
	{
		struct iovec vec;
		cpg_error_t status;
		int attempts = 0;

		vec.iov_base = buf;
		vec.iov_len = len;

		for (;;) {
			status = cpg_mcast_joined(h, CPG_TYPE_AGREED, &vec, 1);
			if (status != CPG_ERR_TRY_AGAIN)
				break;

			attempts++;
			usleep(1000);
			if (!(attempts % 100))
				log_error("cpg_mcast_joined retry %d %s",
					  attempts, msg_name(type));
		}

		if (status != CPG_OK) {
			log_error("cpg_mcast_joined error %d handle %llx %s",
				  status, (unsigned long long)h, msg_name(type));
			return -1;
		}

		if (attempts)
			log_debug("cpg_mcast_joined retried %d %s",
				  attempts, msg_name(type));

		return 0;
	}

	/* header fields caller needs to set: type, to_nodeid, flags, msgdata */

	static void gfs_send_message(struct mountgroup mg, char buf, int len)
	{
	struct gfs_header hd = (struct gfs_header ) buf;
	int type = hd->type;

	hd->version[0] = cpu_to_le16(protocol_active[0]);
	hd->version[1] = cpu_to_le16(protocol_active[1]);
	hd->version[2] = cpu_to_le16(protocol_active[2]);
	hd->type = cpu_to_le16(hd->type);
	hd->nodeid = cpu_to_le32(our_nodeid);
	hd->to_nodeid = cpu_to_le32(hd->to_nodeid);
	hd->global_id = cpu_to_le32(mg->id);
	hd->flags = cpu_to_le32(hd->flags);
	hd->msgdata = cpu_to_le32(hd->msgdata);

	_send_message(mg->cpg_handle, buf, len, type);
	}

	static struct member find_memb(struct change cg, int nodeid)
	{
	struct member *memb;

	list_for_each_entry(memb, &cg->members, list) {
	if (memb->nodeid == nodeid)
	return memb;
	}
	return NULL;
	}

	/* Return the mountgroup whose cpg_handle equals h, or NULL when none matches. */
	static struct mountgroup *find_mg_handle(cpg_handle_t h)
	{
		struct mountgroup *cur;
		struct mountgroup *match = NULL;

		list_for_each_entry(cur, &mountgroups, list) {
			if (cur->cpg_handle != h)
				continue;
			match = cur;
			break;
		}
		return match;
	}

	/* Return the mountgroup whose cpg_client equals ci, or NULL when none matches. */
	static struct mountgroup *find_mg_ci(int ci)
	{
		struct mountgroup *cur;
		struct mountgroup *match = NULL;

		list_for_each_entry(cur, &mountgroups, list) {
			if (cur->cpg_client != ci)
				continue;
			match = cur;
			break;
		}
		return match;
	}

	static struct journal find_journal(struct mountgroup mg, int jid)
	{
	struct journal *j;

	list_for_each_entry(j, &mg->journals, list) {
	if (j->jid == jid)
	return j;
	}
	return NULL;
	}

	static struct journal find_journal_by_nodeid(struct mountgroup mg, int nodeid)
	{
	struct journal *j;

	list_for_each_entry(j, &mg->journals, list) {
	if (j->nodeid == nodeid)
	return j;
	}
	return NULL;
	}

	/*
	 * Free a change struct together with everything it owns: the members
	 * and removed lists (including each member's start_msg copy) and the
	 * saved messages.  cg must not be NULL.
	 */
	static void free_cg(struct change *cg)
	{
		struct member *memb, *safe;
		struct save_msg *sm, *sm2;

		list_for_each_entry_safe(memb, safe, &cg->members, list) {
			list_del(&memb->list);
			free(memb->start_msg);	/* free(NULL) is a no-op */
			free(memb);
		}
		list_for_each_entry_safe(memb, safe, &cg->removed, list) {
			list_del(&memb->list);
			free(memb->start_msg);
			free(memb);
		}
		list_for_each_entry_safe(sm, sm2, &cg->saved_messages, list) {
			list_del(&sm->list);
			free(sm);
		}
		free(cg);
	}

	/*
	 * Free a mountgroup: all pending changes, the started change if there
	 * is one, the node history entries, and finally mg itself.
	 */
	static void free_mg(struct mountgroup *mg)
	{
		struct change *cg, *cg_safe;
		struct node *node, *node_safe;

		list_for_each_entry_safe(cg, cg_safe, &mg->changes, list) {
			list_del(&cg->list);
			free_cg(cg);
		}

		/* free_cg() dereferences its argument, so keep the NULL check */
		if (mg->started_change)
			free_cg(mg->started_change);

		list_for_each_entry_safe(node, node_safe, &mg->node_history, list) {
			list_del(&node->list);
			free(node);
		}

		free(mg);
	}

	static struct node get_node_history(struct mountgroup mg, int nodeid)
	{
	struct node *node;

	list_for_each_entry(node, &mg->node_history, list) {
	if (node->nodeid == nodeid)
	return node;
	}
	return NULL;
	}

	/*
	 * Make sure mg has a history entry for nodeid (allocating a zeroed one
	 * if needed) and stamp it with the seq of the change that added the
	 * node.  Returns silently if the allocation fails.
	 */
	static void node_history_init(struct mountgroup *mg, int nodeid,
				      struct change *cg)
	{
		struct node *entry = get_node_history(mg, nodeid);

		if (!entry) {
			entry = malloc(sizeof(struct node));
			if (!entry)
				return;
			memset(entry, 0, sizeof(struct node));

			entry->nodeid = nodeid;
			entry->add_time = 0;
			list_add_tail(&entry->list, &mg->node_history);
		}

		entry->added_seq = cg->seq; /* for queries */
	}

	/* Record the current time as add_time in the history entry for nodeid; logs an error if there is no entry. */
	static void node_history_start(struct mountgroup *mg, int nodeid)
	{
		struct node *entry = get_node_history(mg, nodeid);

		if (entry) {
			entry->add_time = time(NULL);
			return;
		}

		log_error("node_history_start no nodeid %d", nodeid);
	}

	/*
	 * Note in the history entry for nodeid that the node left in change cg:
	 * add_time is reset and removed_seq takes the change's seq.  Logs an
	 * error if there is no entry.
	 */
	static void node_history_left(struct mountgroup *mg, int nodeid,
				      struct change *cg)
	{
		struct node *entry = get_node_history(mg, nodeid);

		if (entry) {
			entry->add_time = 0;
			entry->removed_seq = cg->seq; /* for queries */
			return;
		}

		log_error("node_history_left no nodeid %d", nodeid);
	}

	/*
	 * Note in the history entry for nodeid that the node failed in change
	 * cg: check_dlm is raised and removed_seq / failed_reason are stored.
	 * Logs an error if there is no entry.
	 */
	static void node_history_fail(struct mountgroup *mg, int nodeid,
				      struct change *cg, int reason)
	{
		struct node *entry = get_node_history(mg, nodeid);

		if (entry) {
			entry->check_dlm = 1;

			entry->removed_seq = cg->seq; /* for queries */
			entry->failed_reason = reason; /* for queries */
			return;
		}

		log_error("node_history_fail no nodeid %d", nodeid);
	}

	/* Return 1 if any pending change of mg lists nodeid as an added member, else 0. */
	static int is_added(struct mountgroup *mg, int nodeid)
	{
		struct change *pending;
		struct member *hit;

		list_for_each_entry(pending, &mg->changes, list) {
			hit = find_memb(pending, nodeid);
			if (!hit)
				continue;
			if (hit->added)
				return 1;
		}
		return 0;
	}

	/* Return 1 if any pending change of mg has a non-zero failed_count, else 0. */
	static int nodes_failed(struct mountgroup *mg)
	{
		struct change *pending;
		int any_failed = 0;

		list_for_each_entry(pending, &mg->changes, list) {
			if (!pending->failed_count)
				continue;
			any_failed = 1;
			break;
		}
		return any_failed;
	}

	/* find a start message from an old node to use; it doesn't matter which old
	node we take the start message from, they should all be the same */

	static int get_id_list(struct mountgroup mg, struct id_info *ids,
	int count, int size)
	{
	struct change *cg;
	struct member *memb;

	cg = list_first_entry(&mg->changes, struct change, list);

	list_for_each_entry(memb, &cg->members, list) {
	if (!memb->mg_info->started_count)
	continue;

	*count = memb->mg_info->id_info_count;
	*size = memb->mg_info->id_info_size;
	ids = (struct id_info )(memb->start_msg +
	sizeof(struct gfs_header) +
	memb->mg_info->mg_info_size);
	return 0;
	}
	return -1;
	}

	/*
	 * Search an array of count id_info entries, spaced size bytes apart,
	 * for the entry with the given nodeid.  Returns a pointer into the
	 * array, or NULL if no entry matches.
	 */
	static struct id_info *get_id_struct(struct id_info *ids, int count, int size,
					     int nodeid)
	{
		struct id_info *id = ids;
		int i;

		for (i = 0; i < count; i++) {
			if (id->nodeid == nodeid)
				return id;
			/* step by the sender's entry size, not sizeof(*id) */
			id = (struct id_info *)((char *)id + size);
		}
		return NULL;
	}

	/*
	 * Unblock the kernel side of mg by writing 0 to its "block" sysfs entry.
	 * If a join is in progress the mount client is also sent a successful
	 * join reply.  Logs an error and does nothing when the kernel is not
	 * currently stopped.
	 */
	static void start_kernel(struct mountgroup *mg)
	{
		struct change *started = mg->started_change;

		if (!mg->kernel_stopped) {
			log_error("start_kernel cg %u not stopped", started->seq);
			return;
		}

		log_group(mg, "start_kernel cg %u member_count %d",
			  started->seq, started->member_count);

		set_sysfs(mg, "block", 0);
		mg->kernel_stopped = 0;

		if (!mg->joining)
			return;

		client_reply_join_full(mg, 0);
		mg->joining = 0;
		mg->mount_client_notified = 1;
	}

	/* Block the kernel side of mg by writing 1 to its "block" sysfs entry, unless it is already stopped. */
	static void stop_kernel(struct mountgroup *mg)
	{
		if (mg->kernel_stopped)
			return;

		log_group(mg, "stop_kernel");
		set_sysfs(mg, "block", 1);
		mg->kernel_stopped = 1;
	}

	/*
	 * Read one result from the dlm control connection and apply it to the
	 * mountgroup it names.  A "notified" result for the nodeid we are
	 * waiting on is stored in that node's history entry; "register" and
	 * unknown results are only logged.  Unless an error caused an early
	 * return, poll_dlm is then cleared and the mountgroup is processed.
	 * The ci argument is not used.
	 */
	void process_dlmcontrol(int ci)
	{
		char mg_name[GFS_MOUNTGROUP_LEN+1];
		struct mountgroup *mg;
		struct node *hist;
		int rv, type, nodeid, result;

		memset(mg_name, 0, sizeof(mg_name));

		rv = dlmc_fs_result(dlmcontrol_fd, mg_name, &type, &nodeid, &result);
		if (rv) {
			log_error("process_dlmcontrol dlmc_fs_result %d", rv);
			return;
		}

		mg = find_mg(mg_name);
		if (!mg) {
			log_error("process_dlmcontrol no mg %s", mg_name);
			return;
		}

		switch (type) {
		case DLMC_RESULT_NOTIFIED:
			log_group(mg, "process_dlmcontrol notified nodeid %d result %d",
				  nodeid, result);

			hist = get_node_history(mg, nodeid);
			if (!hist) {
				/* shouldn't happen */
				log_error("process_dlmcontrol no nodeid %d", nodeid);
				return;
			}

			if (mg->dlm_notify_nodeid != nodeid) {
				/* shouldn't happen */
				log_error("process_dlmcontrol node %d expected %d",
					  nodeid, mg->dlm_notify_nodeid);
				return;
			}

			mg->dlm_notify_nodeid = 0;
			hist->dlm_notify_callback = 1;
			hist->dlm_notify_result = result;
			break;

		case DLMC_RESULT_REGISTER:
			log_group(mg, "process_dlmcontrol register nodeid %d result %d",
				  nodeid, result);
			break;

		default:
			log_group(mg, "process_dlmcontrol unknown type %d", type);
			break;
		}

		poll_dlm = 0;

		process_mountgroup(mg);
	}

	static int check_dlm_notify_done(struct mountgroup *mg)
	{
	struct node *node;
	int rv;

	/* we're waiting for a notify result from the dlm (could we fire off
	all dlmc_fs_notified() calls at once instead of serially?) */

	if (mg->dlm_notify_nodeid)
	return 0;

	list_for_each_entry(node, &mg->node_history, list) {

	/* check_dlm is set when we see a node fail, and is cleared
	below when we find that the dlm has also seen it fail */

	if (!node->check_dlm)
	continue;

	/* we're in sync with the dlm for this nodeid, i.e. we've
	both seen this node fail */

	if (node->dlm_notify_callback && !node->dlm_notify_result) {
	node->dlm_notify_callback = 0;
	node->check_dlm = 0;
	continue;
	}

	/* we're not in sync with the dlm for this nodeid, i.e.
	the dlm hasn't seen this node fail yet; try calling
	dlmc_fs_notified() again in a bit */

	if (node->dlm_notify_callback && node->dlm_notify_result) {
	log_group(mg, "check_dlm_notify result %d will retry nodeid %d",
	node->dlm_notify_result, node->nodeid);
	node->dlm_notify_callback = 0;
	poll_dlm = 1;
	return 0;
	}

	/* check if the dlm has seen this nodeid fail, we get the
	answer asynchronously in process_dlmcontrol */

	log_group(mg, "check_dlm_notify nodeid %d begin", node->nodeid);

	rv = dlmc_fs_notified(dlmcontrol_fd, mg->name, node->nodeid);
	if (rv) {
	log_error("dlmc_fs_notified error %d", rv);
	return 0;
	}

	mg->dlm_notify_nodeid = node->nodeid;
	return 0;
	}

	log_group(mg, "check_dlm_notify done");
	return 1;
	}

	/*
	 * Decide whether mg may go on to send its start message.  Returns 1
	 * when the wait conditions are skipped or satisfied, and 0 when the
	 * local mount has not finished or the dlm notify check is not yet done.
	 * On the non-skipped path the kernel is stopped before the dlm check.
	 */
	static int wait_conditions_done(struct mountgroup *mg)
	{
		if (mg->first_recovery_needed) {
			log_group(mg, "wait_conditions skip for first_recovery_needed");
			return 1;
		}

		if (!mg->started_count) {
			log_group(mg, "wait_conditions skip for zero started_count");
			return 1;
		}

		if (!nodes_failed(mg)) {
			log_group(mg, "wait_conditions skip for zero nodes_failed");
			return 1;
		}

		if (!mg->mount_client_notified) {
			log_group(mg, "wait_conditions skip mount client not notified");
			return 1;
		}

		/* local mount still in progress: wait for its result */
		if (!mg->kernel_mount_done) {
			log_group(mg, "wait_conditions need mount_done");
			return 0;
		}

		/* local mount finished with an error: nothing to wait for */
		if (mg->kernel_mount_error) {
			log_group(mg, "wait_conditions skip for kernel_mount_error");
			return 1;
		}

		stop_kernel(mg);

		return check_dlm_notify_done(mg) ? 1 : 0;
	}

	/* Returns 1 once every member of the first queued change has its start
	   flag set, otherwise logs how many are still missing and returns 0. */
	static int wait_messages_done(struct mountgroup *mg)
	{
		struct change *cg = list_first_entry(&mg->changes, struct change, list);
		struct member *m;
		int missing = 0;
		int count = 0;

		list_for_each_entry(m, &cg->members, list) {
			count++;
			if (m->start)
				continue;
			missing++;
		}

		if (!missing) {
			log_group(mg, "wait_messages got all %d", count);
			return 1;
		}

		log_group(mg, "wait_messages need %d of %d", missing, count);
		return 0;
	}

	/* Finish a start cycle: the first queued change becomes
	   mg->started_change (the previous one is freed), started_count is
	   bumped, and every remaining queued change is freed. */
	static void cleanup_changes(struct mountgroup *mg)
	{
		struct change *first = list_first_entry(&mg->changes, struct change, list);
		struct change *cg, *next;

		list_del(&first->list);

		if (mg->started_change)
			free_cg(mg->started_change);
		mg->started_change = first;

		/* zero started_count means "never started", so step over zero
		   if the counter wraps */

		mg->started_count++;
		if (mg->started_count == 0)
			mg->started_count++;

		first->combined_seq = first->seq; /* for queries */

		list_for_each_entry_safe(cg, next, &mg->changes, list) {
			mg->started_change->combined_seq = cg->seq; /* for queries */
			list_del(&cg->list);
			free_cg(cg);
		}
	}

	/* do the change details in the message match the details of the given change */

	static int match_change(struct mountgroup mg, struct change cg,
	struct gfs_header hd, struct mg_info mi,
	struct id_info *ids)
	{
	struct id_info *id;
	struct member *memb;
	uint32_t seq = hd->msgdata;
	int i, members_mismatch;

	/* We can ignore messages if we're not in the list of members.
	The one known time this will happen is after we've joined
	the cpg, we can get messages for changes prior to the change
	in which we're added. */

	id = get_id_struct(ids, mi->id_info_count, mi->id_info_size,our_nodeid);

	if (!id \|\| !(id->flags & IDI_NODEID_IS_MEMBER)) {
	log_group(mg, "match_change %d:%u skip cg %u we are not in members",
	hd->nodeid, seq, cg->seq);
	return 0;
	}

	memb = find_memb(cg, hd->nodeid);
	if (!memb) {
	log_group(mg, "match_change %d:%u skip cg %u sender not member",
	hd->nodeid, seq, cg->seq);
	return 0;
	}

	/* verify this is the right change by matching the counts
	and the nodeids of the current members */

	if (mi->member_count != cg->member_count \|\|
	mi->joined_count != cg->joined_count \|\|
	mi->remove_count != cg->remove_count \|\|
	mi->failed_count != cg->failed_count) {
	log_group(mg, "match_change %d:%u skip cg %u expect counts "
	"%d %d %d %d", hd->nodeid, seq, cg->seq,
	cg->member_count, cg->joined_count,
	cg->remove_count, cg->failed_count);
	return 0;
	}

	members_mismatch = 0;
	id = ids;

	for (i = 0; i < mi->id_info_count; i++) {
	if (id->flags & IDI_NODEID_IS_MEMBER) {
	memb = find_memb(cg, id->nodeid);
	if (!memb) {
	log_group(mg, "match_change %d:%u skip cg %u "
	"no memb %d", hd->nodeid, seq,
	cg->seq, id->nodeid);
	members_mismatch = 1;
	break;
	}
	}
	id = (struct id_info )((char )id + mi->id_info_size);
	}

	if (members_mismatch)
	return 0;

	log_group(mg, "match_change %d:%u matches cg %u", hd->nodeid, seq,
	cg->seq);
	return 1;
	}

	/* Unfortunately, there's no really simple way to match a message with the
	specific change that it was sent for. We hope that by passing all the
	details of the change in the message, we will be able to uniquely match
	it to the correct change. */

	/* A start message will usually be for the first (current) change on our list.
	In some cases it will be for a non-current change, and we can ignore it:

	1. A,B,C get confchg1 adding C
	2. C sends start for confchg1
	3. A,B,C get confchg2 adding D
	4. A,B,C,D recv start from C for confchg1 - ignored
	5. C,D send start for confchg2
	6. A,B send start for confchg2
	7. A,B,C,D recv all start messages for confchg2; start barrier/cycle done

	In step 4, how do the nodes know whether the start message from C is
	for confchg1 or confchg2? Hopefully by comparing the counts and members. */

	static struct change find_change(struct mountgroup mg, struct gfs_header *hd,
	struct mg_info mi, struct id_info ids)
	{
	struct change *cg;

	list_for_each_entry_reverse(cg, &mg->changes, list) {
	if (!match_change(mg, cg, hd, mi, ids))
	continue;
	return cg;
	}

	log_group(mg, "find_change %d:%u no match", hd->nodeid, hd->msgdata);
	return NULL;
	}

	/* Convert every field of a received mg_info in place with le32_to_cpu()
	(little-endian message order to host order).  send_start() fills these
	same fields with cpu_to_le32(). */
	static void mg_info_in(struct mg_info *mi)
	{
	mi->mg_info_size = le32_to_cpu(mi->mg_info_size);
	mi->id_info_size = le32_to_cpu(mi->id_info_size);
	mi->id_info_count = le32_to_cpu(mi->id_info_count);
	mi->started_count = le32_to_cpu(mi->started_count);
	mi->member_count = le32_to_cpu(mi->member_count);
	mi->joined_count = le32_to_cpu(mi->joined_count);
	mi->remove_count = le32_to_cpu(mi->remove_count);
	mi->failed_count = le32_to_cpu(mi->failed_count);
	mi->first_recovery_needed = le32_to_cpu(mi->first_recovery_needed);
	mi->first_recovery_master = le32_to_cpu(mi->first_recovery_master);
	}

	/* Convert the nodeid, jid and flags of one received id_info entry in
	place with le32_to_cpu(). */
	static void id_info_in(struct id_info *id)
	{
	id->nodeid = le32_to_cpu(id->nodeid);
	id->jid = le32_to_cpu(id->jid);
	id->flags = le32_to_cpu(id->flags);
	}

	static void ids_in(struct mg_info mi, struct id_info ids)
	{
	struct id_info *id;
	int i;

	id = ids;
	for (i = 0; i < mi->id_info_count; i++) {
	id_info_in(id);
	id = (struct id_info )((char )id + mi->id_info_size);
	}
	}

	static void receive_start(struct mountgroup mg, struct gfs_header hd, int len)
	{
	struct change *cg;
	struct member *memb;
	struct mg_info *mi;
	struct id_info *ids;
	uint32_t seq = hd->msgdata;
	int added;

	log_group(mg, "receive_start %d:%u len %d", hd->nodeid, seq, len);

	mi = (struct mg_info )((char )hd + sizeof(struct gfs_header));
	ids = (struct id_info )((char )mi + sizeof(struct mg_info));

	mg_info_in(mi);
	ids_in(mi, ids);

	cg = find_change(mg, hd, mi, ids);
	if (!cg)
	return;

	memb = find_memb(cg, hd->nodeid);
	if (!memb) {
	/* this should never happen since match_change checks it */
	log_error("receive_start no member %d", hd->nodeid);
	return;
	}

	added = is_added(mg, hd->nodeid);

	if (added && mi->started_count) {
	log_error("receive_start %d:%u add node with started_count %u",
	hd->nodeid, seq, mi->started_count);

	/* see comment in fence/fenced/cpg.c */
	memb->disallowed = 1;
	return;
	}

	node_history_start(mg, hd->nodeid);
	memb->start = 1;

	if (memb->start_msg) {
	/* shouldn't happen */
	log_error("receive_start %d:%u dup start msg", hd->nodeid, seq);
	return;
	}

	/* save a copy of each start message */
	memb->start_msg = malloc(len);
	if (!memb->start_msg) {
	log_error("receive_start len %d no mem", len);
	return;
	}
	memcpy(memb->start_msg, hd, len);

	/* a shortcut to the saved mg_info */
	memb->mg_info = (struct mg_info *)(memb->start_msg +
	sizeof(struct gfs_header));
	}

	/* start messages are associated with a specific change and use the
	find_change/match_change routines to make sure all start messages
	are matched with the same change on all nodes. The current set of
	changes are cleared after a completed start cycle. Other messages
	happen outside the context of changes. An "incomplete" start cycle
	is when a confchg arrives (adding a new change struct) before all
	start messages have been received for the current change. In this
	case, all members send a new start message for the latest change,
	and any start messages received for the previous change(s) are ignored.

	To sync state with start messages, we need to include:
	- the state before applying any of the current set of queued changes
	(new nodes will initialize with this)
	- the essential info from changes in the set that's being started,
	so nodes added by one of the queued changes can apply the same changes
	to the init state that the existing nodes do. */

	/* recovery_result and mount_done messages may arrive between the time
	that an old node sends start and the time a new node receives it.
	two old nodes may also send start before/after a recovery_result or
	mount_done message, creating inconsistent data in their start messages.

	Soln: a new node saves recovery_result/mount_done messages between
	last confchg and final start. the new node knows that a start message
	from an old node may or may not include the effects from rr/md messages
	since the last confchg, but will include all effects from prior to
	the last confchg. The saved rr/md messages can be applied on top of
	the state from an old node's start message; applying them a second time
	should not change anything, producing the same result. */

	/* Number of id_info slots a start message needs: one per member of the
	   first queued change, one per journal flagged needs_recovery, and one
	   per removed member (in any queued change) that has a journal. */
	static int count_ids(struct mountgroup *mg)
	{
		struct change *first, *cg;
		struct member *m;
		struct journal *jnl;
		int n = 0;

		first = list_first_entry(&mg->changes, struct change, list);

		list_for_each_entry(m, &first->members, list) {
			n++;
		}

		list_for_each_entry(jnl, &mg->journals, list) {
			if (jnl->needs_recovery)
				n++;
		}

		list_for_each_entry(cg, &mg->changes, list) {
			list_for_each_entry(m, &cg->removed, list) {
				if (find_journal_by_nodeid(mg, m->nodeid))
					n++;
			}
		}

		return n;
	}

	/* old member: current member that has completed a start cycle
	new member: current member that has not yet completed a start cycle */

	static void send_start(struct mountgroup *mg)
	{
	struct change cg, c;
	struct gfs_header *hd;
	struct mg_info *mi;
	struct id_info *id;
	struct member *memb;
	struct node *node;
	struct journal *j;
	char *buf;
	uint32_t flags;
	int len, id_count, jid;
	int old_memb = 0, new_memb = 0, old_journal = 0, new_journal = 0;

	cg = list_first_entry(&mg->changes, struct change, list);

	id_count = count_ids(mg);

	/* sanity check */

	if (!mg->started_count && id_count != cg->member_count) {
	log_error("send_start bad counts id_count %d member_count %d",
	cg->member_count, id_count);
	return;
	}

	len = sizeof(struct gfs_header) + sizeof(struct mg_info) +
	id_count * sizeof(struct id_info);

	buf = malloc(len);
	if (!buf) {
	log_error("send_start len %d no mem", len);
	return;
	}
	memset(buf, 0, len);

	hd = (struct gfs_header *)buf;
	mi = (struct mg_info )(buf + sizeof(hd));
	id = (struct id_info )(buf + sizeof(hd) + sizeof(*mi));

	/* fill in header (gfs_send_message handles part of header) */

	hd->type = GFS_MSG_START;
	hd->msgdata = cg->seq;
	hd->flags \|= mg->joining ? GFS_MFLG_JOINING : 0;

	/* fill in mg_info */

	mi->mg_info_size = cpu_to_le32(sizeof(struct mg_info));
	mi->id_info_size = cpu_to_le32(sizeof(struct id_info));
	mi->id_info_count = cpu_to_le32(id_count);
	mi->started_count = cpu_to_le32(mg->started_count);
	mi->member_count = cpu_to_le32(cg->member_count);
	mi->joined_count = cpu_to_le32(cg->joined_count);
	mi->remove_count = cpu_to_le32(cg->remove_count);
	mi->failed_count = cpu_to_le32(cg->failed_count);
	mi->first_recovery_needed = cpu_to_le32(mg->first_recovery_needed);
	mi->first_recovery_master = cpu_to_le32(mg->first_recovery_master);

	/* fill in id_info entries */

	/* New members send info about themselves, and empty id_info slots for
	all other members. Old members send full info about all old
	members, and empty id_info slots about new members. The union of
	start messages from a single old node and all new nodes give a
	complete picture of state for all members. In sync_state, all nodes
	(old and new) make this union, and then assign jid's to new nodes. */

	list_for_each_entry(memb, &cg->members, list) {

	if (!mg->started_count \|\| is_added(mg, memb->nodeid)) {
	/* send empty slot for new member */
	jid = JID_NONE;
	flags = IDI_NODEID_IS_MEMBER;

	/* include our own info which no one knows yet */
	if (!mg->started_count && memb->nodeid == our_nodeid) {
	flags \|= mg->ro ? IDI_MOUNT_RO : 0;
	flags \|= mg->spectator ? IDI_MOUNT_SPECTATOR : 0;
	}
	new_memb++;

	} else {
	/* send full info for old member */
	node = get_node_history(mg, memb->nodeid);
	if (!node) {
	log_error("send_start no nodeid %d", memb->nodeid);
	continue;
	}

	jid = node->jid;
	flags = IDI_NODEID_IS_MEMBER;
	flags \|= node->ro ? IDI_MOUNT_RO : 0;
	flags \|= node->spectator ? IDI_MOUNT_SPECTATOR : 0;
	flags \|= node->kernel_mount_done ? IDI_MOUNT_DONE : 0;
	flags \|= node->kernel_mount_error ? IDI_MOUNT_ERROR : 0;
	old_memb++;
	}

	id->nodeid = cpu_to_le32(memb->nodeid);
	id->jid = cpu_to_le32(jid);
	id->flags = cpu_to_le32(flags);
	id++;
	}

	/* journals needing recovery from previous start cycles */

	list_for_each_entry(j, &mg->journals, list) {
	if (j->needs_recovery) {
	id->jid = cpu_to_le32(j->jid);
	id->flags = cpu_to_le32(IDI_JID_NEEDS_RECOVERY);
	id++;
	old_journal++;
	}
	}

	/* journals needing recovery from the current start cycle */

	list_for_each_entry(c, &mg->changes, list) {
	list_for_each_entry(memb, &c->removed, list) {
	j = find_journal_by_nodeid(mg, memb->nodeid);
	if (j) {
	id->jid = cpu_to_le32(j->jid);
	id->flags = cpu_to_le32(IDI_JID_NEEDS_RECOVERY);
	id++;
	new_journal++;
	}
	}
	}

	/* sanity check */

	if (!mg->started_count && (old_memb \|\| old_journal \|\| new_journal)) {
	log_error("send_start cg %u bad counts om %d nm %d oj %d nj %d",
	cg->seq, old_memb, new_memb, old_journal, new_journal);
	return;
	}

	log_group(mg, "send_start cg %u id_count %d om %d nm %d oj %d nj %d",
	cg->seq, id_count, old_memb, new_memb, old_journal,
	new_journal);

	gfs_send_message(mg, buf, len);

	free(buf);
	}

	static void send_mount_done(struct mountgroup *mg, int result)
	{
	struct gfs_header h;

	memset(&h, 0, sizeof(h));

	h.type = GFS_MSG_MOUNT_DONE;
	h.msgdata = result;

	gfs_send_message(mg, (char *)&h, sizeof(h));
	}

	static void send_first_recovery_done(struct mountgroup *mg)
	{
	struct gfs_header h;

	memset(&h, 0, sizeof(h));

	h.type = GFS_MSG_FIRST_RECOVERY_DONE;

	gfs_send_message(mg, (char *)&h, sizeof(h));
	}

	/* Send a GFS_MSG_RECOVERY_RESULT message whose payload is two
	   little-endian 32-bit values following the header: the jid and the
	   recovery result.  An allocation failure is logged and nothing is
	   sent. */
	static void send_recovery_result(struct mountgroup *mg, int jid, int result)
	{
		struct gfs_header *hd;
		char *buf;
		int len, *p;

		len = sizeof(struct gfs_header) + 2 * sizeof(int);

		buf = malloc(len);
		if (!buf) {
			/* report the failure the same way send_start does */
			log_error("send_recovery_result len %d no mem", len);
			return;
		}
		memset(buf, 0, len);

		hd = (struct gfs_header *)buf;
		hd->type = GFS_MSG_RECOVERY_RESULT;

		p = (int *)(buf + sizeof(struct gfs_header));

		p[0] = cpu_to_le32(jid);
		p[1] = cpu_to_le32(result);

		gfs_send_message(mg, buf, len);

		free(buf);
	}

	+/* Send a GFS_MSG_REMOUNT message whose msgdata is 1 when the remount
	+   options request read-only and 0 otherwise.  Options are matched as
	+   whole comma-separated tokens, so an option that merely contains the
	+   letters "ro" no longer counts; when both "ro" and "rw" are present
	+   the last one wins. */
	+void send_remount(struct mountgroup *mg, struct gfsc_mount_args *ma)
	+{
	+	struct gfs_header h;
	+	const char *opt = ma->options;
	+	const char *sep;
	+	size_t optlen;
	+	int ro = 0;
	+
	+	while (opt && *opt) {
	+		sep = strchr(opt, ',');
	+		optlen = sep ? (size_t)(sep - opt) : strlen(opt);
	+
	+		if (optlen == 2 && !strncmp(opt, "ro", 2))
	+			ro = 1;
	+		else if (optlen == 2 && !strncmp(opt, "rw", 2))
	+			ro = 0;
	+
	+		opt = sep ? sep + 1 : NULL;
	+	}
	+
	+	memset(&h, 0, sizeof(h));
	+
	+	h.type = GFS_MSG_REMOUNT;
	+	h.msgdata = ro;
	+
	+	gfs_send_message(mg, (char *)&h, sizeof(h));
	+}
	+
	static void save_message(struct mountgroup mg, struct gfs_header hd, int len)
	{
	struct change *cg;
	struct save_msg *sm;

	cg = list_first_entry(&mg->changes, struct change, list);

	sm = malloc(sizeof(struct save_msg) + len);
	if (!sm) {
	log_error("save_message len %d no mem", len);
	return;
	}

	sm->len = len;
	memcpy(sm->buf, hd, len);

	list_add_tail(&sm->list, &cg->saved_messages);
	}

	/* Send a mount_done message for this mountgroup, carrying
	mg->kernel_mount_error as the result. */
	void gfs_mount_done(struct mountgroup *mg)
	{
	send_mount_done(mg, mg->kernel_mount_error);
	}

	static void receive_mount_done(struct mountgroup mg, struct gfs_header hd,
	int len)
	{
	struct node *node;

	log_group(mg, "receive_mount_done from %d result %d",
	hd->nodeid, hd->msgdata);

	node = get_node_history(mg, hd->nodeid);
	if (!node) {
	log_error("receive_mount_done no nodeid %d", hd->nodeid);
	return;
	}

	node->kernel_mount_done = 1;
	node->kernel_mount_error = hd->msgdata;
	}

	/* Handle a recovery_result message.  The payload behind the header is
	   two little-endian 32-bit values: jid and result.  A successful result
	   clears needs_recovery on that journal; any other result increments
	   its failed_recovery_count.  Journals that no longer need recovery are
	   left untouched.  len is not used. */
	static void receive_recovery_result(struct mountgroup *mg,
					    struct gfs_header *hd, int len)
	{
		struct journal *j;
		int jid, result, *p;

		p = (int *)((char *)hd + sizeof(struct gfs_header));
		jid = le32_to_cpu(p[0]);
		result = le32_to_cpu(p[1]);

		log_group(mg, "receive_recovery_result from %d jid %d result %d",
			  hd->nodeid, jid, result);

		j = find_journal(mg, jid);
		if (!j) {
			log_error("receive_recovery_result from %d no jid %d",
				  hd->nodeid, jid);
			return;
		}

		if (!j->needs_recovery)
			return;

		if (result == LM_RD_SUCCESS)
			j->needs_recovery = 0;
		else {
			j->failed_recovery_count++;
			log_group(mg, "jid %d failed_recovery_count %d", jid,
				  j->failed_recovery_count);
		}
	}

	/* Handle a first_recovery_done message: clear the first-recovery state,
	   remember that the message was seen (first_recovery_msg), and start
	   the kernel when no change is in progress and we are not the master.
	   len is not used. */
	static void receive_first_recovery_done(struct mountgroup *mg,
						struct gfs_header *hd, int len)
	{
		int master = mg->first_recovery_master;
		int idle;

		log_group(mg, "receive_first_recovery_done from %d master %d "
			  "mount_client_notified %d",
			  hd->nodeid, master, mg->mount_client_notified);

		if (master != hd->nodeid)
			log_error("receive_first_recovery_done from %d master %d",
				  hd->nodeid, master);

		/* an empty change list means everything is idle, no changes
		   in progress */
		idle = list_empty(&mg->changes);

		/* The state below is updated the same way whether or not a
		   change is in progress.  When one is in progress, everyone
		   will receive this message in the same sequence wrt other
		   start messages and confchgs:

		   - If a new confchg arrives after this message (and before
		     the final start message in the current start cycle),
		     a new start cycle will begin.  All nodes before the
		     confchg will have frn=0 due to receiving this message,
		     and nodes added by the confchg will see frn=0 in all
		     start messages (in any_nodes_first_recovery() which
		     returns 0).

		   - If the final start message arrives after this message,
		     the start cycle will complete, running sync_state(), on
		     all current nodes with all having seen this message.
		     Old and new nodes in the current start cycle will see
		     this msg and use it (first_recovery_msg) instead of the
		     first_recovery_needed/master data in the start messages
		     (which may be inconsistent due to members sending their
		     start messages either before or after receiving this
		     message). */

		mg->first_recovery_needed = 0;
		mg->first_recovery_master = 0;
		mg->first_recovery_msg = 1;

		if (idle && master != our_nodeid)
			start_kernel(mg);
	}

	+/* Handle a remount message: store the read-only flag from msgdata in
	+   the sender's node history entry, and in mg->ro as well when the
	+   sender is this node.  len is not used. */
	+static void receive_remount(struct mountgroup *mg, struct gfs_header *hd,
	+			    int len)
	+{
	+	struct node *node;
	+
	+	log_group(mg, "receive_remount from %d ro %d", hd->nodeid, hd->msgdata);
	+
	+	node = get_node_history(mg, hd->nodeid);
	+	if (!node) {
	+		log_error("receive_remount no nodeid %d", hd->nodeid);
	+		return;
	+	}
	+
	+	node->ro = hd->msgdata;
	+
	+	if (hd->nodeid == our_nodeid)
	+		mg->ro = node->ro;
	+}
	+
	/* start message from all nodes shows zero started_count */

	/* Returns 1 when the saved start message of every member of the first
	   queued change reports a zero started_count, otherwise 0. */
	static int all_nodes_new(struct mountgroup *mg)
	{
		struct change *first = list_first_entry(&mg->changes, struct change, list);
		struct member *m;
		int all_new = 1;

		list_for_each_entry(m, &first->members, list) {
			if (m->mg_info->started_count) {
				all_new = 0;
				break;
			}
		}

		return all_new;
	}

	/* does start message from any node with non-zero started_count have
	first_recovery_needed set? (verify that all started nodes agree on
	first_recovery_needed) */

	/* Returns 0 only when at least one member with a non-zero started_count
	exists and none of those members has first_recovery_needed set in its
	start message.  Returns 1 in every other case, including a logged
	disagreement between such members. */
	static int any_nodes_first_recovery(struct mountgroup *mg)
	{
	struct change *cg;
	struct member *memb;
	int yes = 0, no = 0, master = 0;

	cg = list_first_entry(&mg->changes, struct change, list);

	/* tally first_recovery_needed over members whose start message shows
	a non-zero started_count; members with a zero count are skipped */

	list_for_each_entry(memb, &cg->members, list) {
	if (!memb->mg_info->started_count)
	continue;
	if (memb->mg_info->first_recovery_needed)
	yes++;
	else
	no++;
	}

	if (no && yes) {
	/* disagreement on first_recovery_needed, shouldn't happen */
	log_error("any_nodes_first_recovery no %d yes %d", no, yes);
	return 1;
	}

	if (no)
	return 0;

	/* sanity check: verify agreement on the master */

	/* the first non-zero first_recovery_master seen becomes the reference;
	a mismatch is only logged and does not change the return value */

	list_for_each_entry(memb, &cg->members, list) {
	if (!memb->mg_info->started_count)
	continue;
	if (!master) {
	master = memb->mg_info->first_recovery_master;
	continue;
	}
	if (master == memb->mg_info->first_recovery_master)
	continue;

	/* disagreement on master, shouldn't happen */
	log_error("any_nodes_first_recovery master %d vs %d",
	master, memb->mg_info->first_recovery_master);
	}

	return 1;
	}

	/* If all nodes new, there's no previous master, pick low nodeid;
	if not all nodes new, there will be a previous master, use that one unless
	it's no longer a member; if master is no longer a member pick low nodeid.
	The current master will already be set in mg->first_recovery_master for old
	nodes, but new nodes will need to look in the start messages to find it. */

	/* Choose the first-recovery master among the members of the first
	   queued change.  Returns the master reported in the start message of a
	   member with a non-zero started_count, provided that nodeid is still a
	   member and all_new is zero; otherwise returns the lowest member
	   nodeid. */
	static int pick_first_recovery_master(struct mountgroup *mg, int all_new)
	{
		struct change *cg;
		struct member *memb;
		int old = 0, low = 0;

		cg = list_first_entry(&mg->changes, struct change, list);

		list_for_each_entry(memb, &cg->members, list) {
			/* if several such members exist, the last one listed wins */
			if (memb->mg_info->started_count)
				old = memb->mg_info->first_recovery_master;

			if (!low || memb->nodeid < low)
				low = memb->nodeid;
		}

		memb = find_memb(cg, old);

		if (!memb || all_new) {
			log_group(mg, "pick_first_recovery_master low %d old %d",
				  low, old);
			return low;
		}

		log_group(mg, "pick_first_recovery_master old %d", old);
		return old;
	}

	/* use a start message from an old node to create node info for each old node */

	/* Using the id list obtained from get_id_list(), fill in the node
	   history entry (jid, mount flags) of every member whose start message
	   reports a non-zero started_count, and add a journal struct for each.
	   Stops at the first inconsistency or allocation failure. */
	static void create_old_nodes(struct mountgroup *mg)
	{
		struct change *cg;
		struct member *memb;
		struct node *node;
		struct journal *j;
		struct id_info *ids, *id;
		int id_count, id_size, rv;

		/* get ids from a start message of an old node */

		rv = get_id_list(mg, &ids, &id_count, &id_size);
		if (rv) {
			/* all new nodes, no old nodes */
			log_group(mg, "create_old_nodes all new");
			return;
		}

		/* use id list to set info for all old nodes */

		cg = list_first_entry(&mg->changes, struct change, list);

		list_for_each_entry(memb, &cg->members, list) {
			if (!memb->mg_info->started_count)
				continue;

			node = get_node_history(mg, memb->nodeid);
			id = get_id_struct(ids, id_count, id_size, memb->nodeid);

			if (!node || !id) {
				/* shouldn't happen */
				log_error("create_old_nodes %d node %d id %d",
					  memb->nodeid, !!node, !!id);
				return;
			}

			if (!(id->flags & IDI_NODEID_IS_MEMBER) ||
			     (id->flags & IDI_JID_NEEDS_RECOVERY)) {
				/* shouldn't happen */
				log_error("create_old_nodes %d bad flags %x",
					  memb->nodeid, id->flags);
				return;
			}

			node->jid = id->jid;
			node->kernel_mount_done = !!(id->flags & IDI_MOUNT_DONE);
			node->kernel_mount_error = !!(id->flags & IDI_MOUNT_ERROR);
			node->ro = !!(id->flags & IDI_MOUNT_RO);
			node->spectator = !!(id->flags & IDI_MOUNT_SPECTATOR);

			j = malloc(sizeof(struct journal));
			if (!j) {
				log_error("create_old_nodes no mem");
				return;
			}
			memset(j, 0, sizeof(struct journal));

			j->nodeid = node->nodeid;
			j->jid = node->jid;
			list_add(&j->list, &mg->journals);

			log_group(mg, "create_old_nodes %d jid %d ro %d spect %d "
				  "kernel_mount_done %d error %d",
				  node->nodeid, node->jid, node->ro, node->spectator,
				  node->kernel_mount_done, node->kernel_mount_error);
		}
	}

	/* use start messages from new nodes to create node info for each new node */

	/* For every member whose start message reports a zero started_count,
	   read that member's own id_info entry from its saved start message and
	   set the ro/spectator state in its node history entry; jid is set to
	   JID_NONE.  Stops at the first inconsistency. */
	static void create_new_nodes(struct mountgroup *mg)
	{
		struct change *cg;
		struct member *memb;
		struct id_info *ids, *id;
		struct node *node;

		cg = list_first_entry(&mg->changes, struct change, list);

		list_for_each_entry(memb, &cg->members, list) {
			if (memb->mg_info->started_count)
				continue;

			node = get_node_history(mg, memb->nodeid);
			if (!node) {
				/* shouldn't happen */
				log_error("create_new_nodes %d no node", memb->nodeid);
				return;
			}

			/* the id_info entries follow the sender's mg_info, whose
			   size is taken from the message itself */
			ids = (struct id_info *)(memb->start_msg +
						 sizeof(struct gfs_header) +
						 memb->mg_info->mg_info_size);

			id = get_id_struct(ids, memb->mg_info->id_info_count,
					   memb->mg_info->id_info_size, memb->nodeid);
			if (!id) {
				/* shouldn't happen; other callers in this file
				   check get_id_struct() for NULL as well */
				log_error("create_new_nodes %d no id", memb->nodeid);
				return;
			}

			if (!(id->flags & IDI_NODEID_IS_MEMBER) ||
			     (id->flags & IDI_JID_NEEDS_RECOVERY)) {
				/* shouldn't happen */
				log_error("create_new_nodes %d bad flags %x",
					  memb->nodeid, id->flags);
				return;
			}

			node->jid = JID_NONE;
			node->ro = !!(id->flags & IDI_MOUNT_RO);
			node->spectator = !!(id->flags & IDI_MOUNT_SPECTATOR);

			log_group(mg, "create_new_nodes %d ro %d spect %d",
				  node->nodeid, node->ro, node->spectator);
		}
	}

	static void create_failed_journals(struct mountgroup *mg)
	{
	struct journal *j;
	struct id_info ids, id;
	int id_count, id_size;
	int rv, i;

	rv = get_id_list(mg, &ids, &id_count, &id_size);
	if (rv) {
	/* all new nodes, no old nodes */
	return;
	}

	id = ids;

	for (i = 0; i < id_count; i++) {
	if (!(id->flags & IDI_JID_NEEDS_RECOVERY))
	continue;

	j = malloc(sizeof(struct journal));
	if (!j) {
	log_error("create_failed_journals no mem");
	return;
	}
	memset(j, 0, sizeof(struct journal));

	j->jid = id->jid;
	j->needs_recovery = 1;
	list_add(&j->list, &mg->journals);

	id = (struct id_info )((char )id + id_size);

	log_group(mg, "create_failed_journals jid %d", j->jid);
	}
	}

	/* For every failed member in the removed list of any queued change,
	   flag that member's journal as needing recovery: the nodeid moves to
	   failed_nodeid and nodeid is reset to 0.  A missing journal is only
	   logged. */
	static void set_failed_journals(struct mountgroup *mg)
	{
		struct change *cg;
		struct member *memb;
		struct journal *j;

		/* cg is only the loop cursor; the original's initial assignment
		   from list_first_entry() was overwritten before use */

		list_for_each_entry(cg, &mg->changes, list) {
			list_for_each_entry(memb, &cg->removed, list) {
				if (!memb->failed)
					continue;

				j = find_journal_by_nodeid(mg, memb->nodeid);
				if (j) {
					j->needs_recovery = 1;
					j->failed_nodeid = j->nodeid;
					j->nodeid = 0;
					log_group(mg, "set_failed_journals jid %d "
						  "nodeid %d", j->jid, memb->nodeid);
				} else {
					log_group(mg, "set_failed_journals no journal "
						  "for nodeid %d ", memb->nodeid);
				}
			}
		}
	}

	/* returns nodeid of new member with the next highest nodeid */

	/* Among members of the first queued change whose start message reports
	   a zero started_count, return the smallest nodeid greater than prev,
	   or 0 when there is none. */
	static int next_new_nodeid(struct mountgroup *mg, int prev)
	{
		struct change *first = list_first_entry(&mg->changes, struct change, list);
		struct member *m;
		int best = 0;

		list_for_each_entry(m, &first->members, list) {
			/* old members and nodeids already handed out are skipped */
			if (m->mg_info->started_count || m->nodeid <= prev)
				continue;

			if (!best || m->nodeid < best)
				best = m->nodeid;
		}

		return best;
	}

	/* returns lowest unused jid */

	/* Return the lowest jid in [0, MAX_JOURNALS) for which find_journal()
	   finds no journal, or -1 when every jid in that range is taken. */
	static int next_free_jid(struct mountgroup *mg)
	{
		int jid = 0;

		while (jid < MAX_JOURNALS) {
			if (!find_journal(mg, jid))
				return jid;
			jid++;
		}

		return -1;
	}

	/* Drop journal structs that are neither awaiting recovery nor owned by
	   a current member, then walk the new members in ascending nodeid order
	   and give each one a jid (JID_NONE for spectators) plus a journal
	   struct. */
	static void create_new_journals(struct mountgroup *mg)
	{
		struct journal *j, *safe;
		struct change *cg;
		struct node *node;
		int nodeid = 0;

		cg = list_first_entry(&mg->changes, struct change, list);

		/* first get rid of journal structs that are no longer used
		   or dirty, i.e. from nodes that have unmounted/left, or
		   journals that have been recovered */

		list_for_each_entry_safe(j, safe, &mg->journals, list) {
			if (j->needs_recovery)
				continue;

			if (find_memb(cg, j->nodeid))
				continue;

			list_del(&j->list);
			free(j);
		}

		while (1) {
			nodeid = next_new_nodeid(mg, nodeid);
			if (!nodeid)
				break;

			node = get_node_history(mg, nodeid);
			if (!node) {
				/* shouldn't happen */
				log_error("create_new_journals no nodeid %d", nodeid);
				continue;
			}

			if (node->spectator)
				node->jid = JID_NONE;
			else
				node->jid = next_free_jid(mg);

			if (node->nodeid == our_nodeid)
				mg->our_jid = node->jid;

			log_group(mg, "create_new_journals %d gets jid %d",
				  node->nodeid, node->jid);

			/* nodes without a jid get no journal struct */
			if (node->jid == JID_NONE)
				continue;

			j = malloc(sizeof(struct journal));
			if (!j) {
				log_error("create_new_journals no mem");
				continue;
			}
			memset(j, 0, sizeof(struct journal));

			j->nodeid = nodeid;
			j->jid = node->jid;
			list_add(&j->list, &mg->journals);
		}
	}

	/* recovery_result and mount_done messages are saved by new members until
	they've completed the start cycle and have member state to apply them to.
	The start messages from old nodes may not reflect the rr/md updates. */

	/* Replay the messages saved on the first queued change through the
	   regular receive handlers, then unlink and free each one.  Only
	   mount_done and recovery_result messages are replayed; any other type
	   is freed without being processed. */
	static void apply_saved_messages(struct mountgroup *mg)
	{
		struct change *cg;
		struct save_msg *sm, *safe;
		struct gfs_header *hd;

		cg = list_first_entry(&mg->changes, struct change, list);

		list_for_each_entry_safe(sm, safe, &cg->saved_messages, list) {
			hd = (struct gfs_header *)sm->buf;

			switch (hd->type) {
			case GFS_MSG_MOUNT_DONE:
				receive_mount_done(mg, hd, sm->len);
				break;
			case GFS_MSG_RECOVERY_RESULT:
				receive_recovery_result(mg, hd, sm->len);
				break;
			}

			list_del(&sm->list);
			free(sm);
		}
	}

	/* this is run immediately after receiving the final start message in a start
	cycle, so all nodes will run this in the same sequence wrt other messages
	and confchgs */

	/* Settle the first-recovery state (needed/master) from the collected
	   start messages, then build node and journal state.  A node that has
	   never completed a start cycle (zero started_count) builds everything
	   from the start messages and replays its saved messages; other nodes
	   only add the new nodes and flag the journals of failed nodes. */
	static void sync_state(struct mountgroup *mg)
	{
		/* This is needed for the case where the first_recovery_done message
		   arrives while a change/start cycle is in progress.  The
		   first_recovery data in the start messages (used by new nodes in this
		   cycle to determine the first_recovery state) may be inconsistent in
		   different start messages (because nodes sent their start messages at
		   different times wrt the first_recovery_done message.)  But, in the
		   case where the new nodes received the first_recovery_done message,
		   they can just use that and don't need the (possibly inconsistent)
		   first recovery data in the start messages. */

		if (mg->first_recovery_msg) {
			if (mg->first_recovery_needed || mg->first_recovery_master) {
				/* shouldn't happen */
				log_error("sync_state first_recovery_msg needed %d "
					  "master %d", mg->first_recovery_needed,
					  mg->first_recovery_master);
			}

			log_group(mg, "sync_state first_recovery_msg");
			goto out;
		}

		/* This is the path the initial start cycle for the group always
		   follows.  It's the case where one or more nodes are all starting up
		   for the first time.  No one has completed a start cycle yet because
		   everyone is joining, and one node needs to do first recovery. */

		if (all_nodes_new(mg)) {
			if (mg->first_recovery_needed || mg->first_recovery_master) {
				/* shouldn't happen */
				log_error("sync_state all_nodes_new first_recovery "
					  "needed %d master %d",
					  mg->first_recovery_needed,
					  mg->first_recovery_master);
			}
			mg->first_recovery_needed = 1;
			mg->first_recovery_master = pick_first_recovery_master(mg, 1);

			log_group(mg, "sync_state all_nodes_new first_recovery_needed "
				  "master %d", mg->first_recovery_master);
			goto out;
		}

		/* This is for the case where new nodes are added to existing members
		   that have first_recovery_needed set. */

		if (any_nodes_first_recovery(mg)) {
			mg->first_recovery_needed = 1;
			mg->first_recovery_master = pick_first_recovery_master(mg, 0);

			log_group(mg, "sync_state first_recovery_needed master %d",
				  mg->first_recovery_master);
			goto out;
		}

		/* Normal case where nodes join an established group that completed
		   first recovery sometime in the past.  Existing nodes that weren't
		   around during first recovery come through here, and new nodes
		   being added in this cycle come through here. */

		if (mg->first_recovery_needed) {
			/* shouldn't happen */
			log_error("sync_state frn should not be set");
		}

	 out:
		if (!mg->started_count) {
			create_old_nodes(mg);
			create_new_nodes(mg);
			create_failed_journals(mg);
			apply_saved_messages(mg);
			create_new_journals(mg);
		} else {
			create_new_nodes(mg);
			set_failed_journals(mg);
			create_new_journals(mg);
		}
	}

	/* Advance the state machine of the first queued change by at most one
	step: from waiting on conditions to waiting on start messages, or from
	waiting on start messages to a completed start cycle. */
	static void apply_changes(struct mountgroup *mg)
	{
	struct change *cg;

	cg = list_first_entry(&mg->changes, struct change, list);

	switch (cg->state) {

	case CGST_WAIT_CONDITIONS:
	/* once the wait conditions are met, send our start message and
	begin collecting everyone's start messages */
	if (wait_conditions_done(mg)) {
	send_start(mg);
	cg->state = CGST_WAIT_MESSAGES;
	}
	break;

	case CGST_WAIT_MESSAGES:
	/* all start messages are in: sync state, then retire the queued
	changes (cleanup_changes takes cg off mg->changes) */
	if (wait_messages_done(mg)) {
	sync_state(mg);
	cleanup_changes(mg);
	}
	break;

	default:
	log_error("apply_changes invalid state %d", cg->state);
	}
	}

	/* We send messages with the info from kernel uevents or mount.gfs ipc,
	and then process the uevent/ipc upon receiving the message for it, so
	that it can be processed in the same order by all nodes. */

	/* Handle a recovery uevent for the given lock table.  The mountgroup
	   name is the part of table after the first ':'; a table without ':' is
	   rejected with an error.  Outside first recovery, the result of the
	   local journal recovery is recorded and sent out as a recovery_result
	   message; during first recovery, a first_recovery_done message is sent
	   once sysfs "first_done" reads non-zero. */
	void process_recovery_uevent(char *table)
	{
		struct mountgroup *mg;
		struct journal *j;
		char *name;
		int jid, recover_status, first_done;
		int rv;

		/* without this check a missing ':' would make name an invalid
		   pointer (NULL + 1) */
		name = strstr(table, ":");
		if (!name) {
			log_error("process_recovery_uevent no name in table %s", table);
			return;
		}
		name++;

		mg = find_mg(name);
		if (!mg) {
			log_error("process_recovery_uevent mg not found %s", table);
			return;
		}

		rv = read_sysfs_int(mg, "recover_done", &jid);
		if (rv < 0) {
			log_error("process_recovery_uevent recover_done read %d", rv);
			return;
		}

		rv = read_sysfs_int(mg, "recover_status", &recover_status);
		if (rv < 0) {
			log_error("process_recovery_uevent recover_status read %d", rv);
			return;
		}

		if (!mg->first_recovery_needed) {
			if (!mg->local_recovery_busy) {
				/* This will happen in two known situations:
				   - we get a recovery_done uevent for our own journal
				     when we mount  (jid == mg->our_jid)
				   - the first mounter will read first_done and clear
				     first_recovery_needed before seeing the change
				     uevent from others_may_mount */
				log_group(mg, "process_recovery_uevent jid %d ignore",
					  jid);
				return;
			}

			mg->local_recovery_busy = 0;

			if (mg->local_recovery_jid != jid) {
				log_error("process_recovery_uevent jid %d exp %d",
					  jid, mg->local_recovery_jid);
				return;
			}

			j = find_journal(mg, jid);
			if (!j) {
				log_error("process_recovery_uevent no journal %d", jid);
				return;
			}

			log_group(mg, "process_recovery_uevent jid %d status %d",
				  jid, recover_status);

			j->local_recovery_done = 1;
			j->local_recovery_result = recover_status;

			/* j->needs_recovery will be cleared when we receive this
			   recovery_result message */

			send_recovery_result(mg, jid, recover_status);
		} else {

			/*
			 * Assumption here is that only the first mounter will get
			 * uevents when first_recovery_needed is set.
			 */

			/* make a local record of jid and recover_status; we may want
			   to check below that we've seen uevents for all jids
			   during first recovery before sending first_recovery_done. */

			log_group(mg, "process_recovery_uevent jid %d status %d "
				  "ignore during first_recovery", jid, recover_status);

			rv = read_sysfs_int(mg, "first_done", &first_done);
			if (rv < 0) {
				log_error("process_recovery_uevent first_done read %d",
					  rv);
				return;
			}

			if (first_done) {
				log_group(mg, "process_recovery_uevent first_done");
				send_first_recovery_done(mg);
			}
		}

		process_mountgroup(mg);
	}

	static void start_journal_recovery(struct mountgroup *mg, int jid)
	{
	int rv;

	log_group(mg, "start_journal_recovery jid %d", jid);

	rv = set_sysfs(mg, "recover", jid);
	if (rv < 0) {
	log_error("start_journal_recovery %d error %d", jid, rv);
	return;
	}

	mg->local_recovery_busy = 1;
	mg->local_recovery_jid = jid;
	}

	/* Returns 1 when no journal on mg->journals has needs_recovery set,
	   0 otherwise.  Every journal still needing recovery is logged. */
	static int wait_recoveries_done(struct mountgroup *mg)
	{
		struct journal *jrn;
		int pending = 0;

		list_for_each_entry(jrn, &mg->journals, list) {
			if (!jrn->needs_recovery)
				continue;
			log_group(mg, "wait_recoveries jid %d unrecovered",
				  jrn->jid);
			pending++;
		}

		if (!pending) {
			log_group(mg, "wait_recoveries done");
			return 1;
		}
		return 0;
	}

	/* pick a jid that has not been successfully recovered by someone else
	(received recovery_result success message) and hasn't been recovered
	by us (local record); if nothing to recover, return 0 */

	static int pick_journal_to_recover(struct mountgroup mg, int jid)
	{
	struct journal *j;

	list_for_each_entry(j, &mg->journals, list) {
	if (j->needs_recovery && !j->local_recovery_done) {
	*jid = j->jid;
	return 1;
	}
	}
	return 0;
	}

	/* processing that happens after all changes have been dealt with */

	/* Called from process_mountgroup() once a change has been started and
	   no further changes are pending.  Either starts the kernel for the
	   first mounter, kicks off the next local journal recovery, or starts
	   the kernel once all journals are recovered. */
	static void recover_and_start(struct mountgroup *mg)
	{
		int jid;

		if (mg->first_recovery_needed) {
			if (mg->first_recovery_master == our_nodeid &&
			    !mg->mount_client_notified) {
				log_group(mg, "recover_and_start first start_kernel");
				mg->first_mounter = 1; /* adds first=1 to hostdata */
				start_kernel(mg); /* includes reply to mount.gfs */
			}
			return;
		}

		/* The normal non-first-recovery mode.  When a recovery_done message
		   is received, check whether any more journals need recovery.  If
		   so, start recovery on the next one, if not, start the kernel. */

		if (!wait_recoveries_done(mg)) {
			/* we can only recover if our own mount completed without
			   error, we are not a spectator, and no local recovery is
			   already running */
			if (!mg->kernel_mount_done || mg->kernel_mount_error)
				return;
			if (mg->spectator)
				return;
			if (mg->local_recovery_busy)
				return;
			if (pick_journal_to_recover(mg, &jid))
				start_journal_recovery(mg, jid);
		} else {
			if (!mg->kernel_stopped)
				return;
			log_group(mg, "recover_and_start start_kernel");
			start_kernel(mg);
		}
	}

	/* One processing pass for a mountgroup: advance any pending change,
	   then, if a change has been started and nothing is pending any more,
	   run recover_and_start(). */
	static void process_mountgroup(struct mountgroup *mg)
	{
		if (!list_empty(&mg->changes))
			apply_changes(mg);

		/* apply_changes() may have emptied the list, so look again */
		if (!mg->started_change)
			return;
		if (!list_empty(&mg->changes))
			return;

		recover_and_start(mg);
	}

	/* Run process_mountgroup() on every entry of the global mountgroups
	   list.  The _safe iterator needs two pointer cursors; the '*'
	   declarators had been lost and are restored here. */
	void process_mountgroups(void)
	{
		struct mountgroup *mg, *safe;

		list_for_each_entry_safe(mg, safe, &mountgroups, list)
			process_mountgroup(mg);
	}

	static int add_change(struct mountgroup *mg,
	struct cpg_address *member_list, int member_list_entries,
	struct cpg_address *left_list, int left_list_entries,
	struct cpg_address *joined_list, int joined_list_entries,
	struct change **cg_out)
	{
	struct change *cg;
	struct member *memb;
	int i, error;

	cg = malloc(sizeof(struct change));
	if (!cg)
	goto fail_nomem;
	memset(cg, 0, sizeof(struct change));
	INIT_LIST_HEAD(&cg->members);
	INIT_LIST_HEAD(&cg->removed);
	INIT_LIST_HEAD(&cg->saved_messages);
	cg->state = CGST_WAIT_CONDITIONS;
	cg->seq = ++mg->change_seq;
	if (!cg->seq)
	cg->seq = ++mg->change_seq;

	cg->member_count = member_list_entries;
	cg->joined_count = joined_list_entries;
	cg->remove_count = left_list_entries;

	for (i = 0; i < member_list_entries; i++) {
	memb = malloc(sizeof(struct member));
	if (!memb)
	goto fail_nomem;
	memset(memb, 0, sizeof(struct member));
	memb->nodeid = member_list[i].nodeid;
	list_add_tail(&memb->list, &cg->members);
	}

	for (i = 0; i < left_list_entries; i++) {
	memb = malloc(sizeof(struct member));
	if (!memb)
	goto fail_nomem;
	memset(memb, 0, sizeof(struct member));
	memb->nodeid = left_list[i].nodeid;
	if (left_list[i].reason == CPG_REASON_NODEDOWN \|\|
	left_list[i].reason == CPG_REASON_PROCDOWN) {
	memb->failed = 1;
	cg->failed_count++;
	}
	list_add_tail(&memb->list, &cg->removed);

	if (memb->failed)
	node_history_fail(mg, memb->nodeid, cg,
	left_list[i].reason);
	else
	node_history_left(mg, memb->nodeid, cg);

	log_group(mg, "add_change cg %u remove nodeid %d reason %d",
	cg->seq, memb->nodeid, left_list[i].reason);

	if (left_list[i].reason == CPG_REASON_PROCDOWN)
	kick_node_from_cluster(memb->nodeid);
	}

	for (i = 0; i < joined_list_entries; i++) {
	memb = find_memb(cg, joined_list[i].nodeid);
	if (!memb) {
	log_error("no member %d", joined_list[i].nodeid);
	error = -ENOENT;
	goto fail;
	}
	memb->added = 1;

	if (memb->nodeid == our_nodeid)
	cg->we_joined = 1;
	else
	node_history_init(mg, memb->nodeid, cg);

	log_group(mg, "add_change cg %u joined nodeid %d", cg->seq,
	memb->nodeid);
	}

	if (cg->we_joined) {
	log_group(mg, "add_change cg %u we joined", cg->seq);
	list_for_each_entry(memb, &cg->members, list)
	node_history_init(mg, memb->nodeid, cg);
	}

	log_group(mg, "add_change cg %u counts member %d joined %d remove %d "
	"failed %d", cg->seq, cg->member_count, cg->joined_count,
	cg->remove_count, cg->failed_count);

	list_add(&cg->list, &mg->changes);
	*cg_out = cg;
	return 0;

	fail_nomem:
	log_error("no memory");
	error = -ENOMEM;
	fail:
	free_cg(cg);
	return error;
	}

	/* Returns 1 if our_nodeid appears among the first left_list_entries
	   entries of left_list, 0 otherwise. */
	static int we_left(struct cpg_address *left_list, int left_list_entries)
	{
		int idx = 0;

		while (idx < left_list_entries) {
			if (left_list[idx].nodeid == our_nodeid)
				return 1;
			idx++;
		}
		return 0;
	}

	/* cpg configuration-change callback (installed via cpg_callbacks).
	   Looks up the mountgroup for the cpg handle.  If we are leaving and
	   our own nodeid is in left_list, tears the mountgroup down and frees
	   it; otherwise records the membership change with add_change() and
	   runs process_mountgroup().  Nothing is processed if add_change()
	   fails. */
	static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
	struct cpg_address *member_list, int member_list_entries,
	struct cpg_address *left_list, int left_list_entries,
	struct cpg_address *joined_list, int joined_list_entries)
	{
	struct mountgroup *mg;
	struct change *cg;
	int rv;

	mg = find_mg_handle(handle);
	if (!mg) {
	log_error("confchg_cb no mountgroup for cpg %s",
	group_name->value);
	return;
	}

	if (mg->leaving && we_left(left_list, left_list_entries)) {
	/* we called cpg_leave(), and this should be the final
	cpg callback we receive */
	log_group(mg, "confchg for our leave");
	dlmc_fs_unregister(dlmcontrol_fd, mg->name);
	cpg_finalize(mg->cpg_handle);
	client_dead(mg->cpg_client);
	/* unlink from the global list before freeing; mg must not be
	touched after free_mg() */
	list_del(&mg->list);
	free_mg(mg);
	return;
	}

	rv = add_change(mg, member_list, member_list_entries,
	left_list, left_list_entries,
	joined_list, joined_list_entries, &cg);
	if (rv)
	return;

	process_mountgroup(mg);
	}

	/* cpg message-delivery callback (installed via cpg_callbacks).
	   Converts the gfs_header at the start of data from little-endian to
	   host order in place, rejects messages whose major version differs
	   from protocol_active[0] or whose header nodeid differs from the cpg
	   sender nodeid, then dispatches on hd->type.  MOUNT_DONE and
	   RECOVERY_RESULT messages are saved rather than processed while
	   mg->started_count is zero.  Ends with process_mountgroup() for every
	   message that passed the checks, including unknown types.
	   NOTE(review): len is not compared against sizeof(struct gfs_header)
	   before the header fields are read -- confirm senders always include
	   a full header.
	   NOTE(review): lines starting with '+' are diff markers from the
	   patch this text was taken from (they add the GFS_MSG_REMOUNT case). */
	static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
	uint32_t nodeid, uint32_t pid, void *data, int len)
	{
	struct mountgroup *mg;
	struct gfs_header *hd;

	mg = find_mg_handle(handle);
	if (!mg) {
	log_error("deliver_cb no mg for cpg %s", group_name->value);
	return;
	}

	hd = (struct gfs_header *)data;

	/* byte-swap the header in place; handlers see host-order fields */
	hd->version[0] = le16_to_cpu(hd->version[0]);
	hd->version[1] = le16_to_cpu(hd->version[1]);
	hd->version[2] = le16_to_cpu(hd->version[2]);
	hd->type = le16_to_cpu(hd->type);
	hd->nodeid = le32_to_cpu(hd->nodeid);
	hd->to_nodeid = le32_to_cpu(hd->to_nodeid);
	hd->global_id = le32_to_cpu(hd->global_id);
	hd->flags = le32_to_cpu(hd->flags);
	hd->msgdata = le32_to_cpu(hd->msgdata);

	/* only the major version has to match */
	if (hd->version[0] != protocol_active[0]) {
	log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
	nodeid, hd->version[0], hd->version[1],
	hd->version[2], protocol_active[0],
	protocol_active[1], protocol_active[2]);
	return;
	}

	if (hd->nodeid != nodeid) {
	log_error("bad msg nodeid %d %d", hd->nodeid, nodeid);
	return;
	}

	switch (hd->type) {
	case GFS_MSG_START:
	receive_start(mg, hd, len);
	break;
	case GFS_MSG_MOUNT_DONE:
	if (!mg->started_count)
	save_message(mg, hd, len);
	else
	receive_mount_done(mg, hd, len);
	break;
	case GFS_MSG_FIRST_RECOVERY_DONE:
	receive_first_recovery_done(mg, hd, len);
	break;
	case GFS_MSG_RECOVERY_RESULT:
	if (!mg->started_count)
	save_message(mg, hd, len);
	else
	receive_recovery_result(mg, hd, len);
	break;
	+ case GFS_MSG_REMOUNT:
	+ receive_remount(mg, hd, len);
	+ break;
	default:
	log_error("unknown msg type %d", hd->type);
	}

	process_mountgroup(mg);
	}

	/* Callback table passed to cpg_initialize() in gfs_join_mountgroup()
	   and setup_cpg(): message delivery and membership changes. */
	static cpg_callbacks_t cpg_callbacks = {
	.cpg_deliver_fn = deliver_cb,
	.cpg_confchg_fn = confchg_cb,
	};

	/* Client callback registered with client_add() for a mountgroup's cpg
	   fd: dispatches all pending cpg events for the mountgroup that owns
	   client index ci, then refreshes flow control status. */
	static void process_mountgroup_cpg(int ci)
	{
		struct mountgroup *mg = find_mg_ci(ci);
		cpg_error_t rc;

		if (!mg) {
			log_error("process_mountgroup_cpg no mountgroup for ci %d", ci);
			return;
		}

		rc = cpg_dispatch(mg->cpg_handle, CPG_DISPATCH_ALL);
		if (rc == CPG_OK) {
			update_flow_control_status();
			return;
		}

		log_error("cpg_dispatch error %d", rc);
	}

	/* Join the cpg "gfs:<mg->name>" for a mountgroup.
	   Registers the fs with dlm_controld, creates a cpg handle, adds its fd
	   as a client, fills in the cpg-related fields of mg and joins the cpg,
	   retrying once a second while cpg_join() reports TRY_AGAIN.
	   Returns 0 on success, the dlmc_fs_register() error if registration
	   fails, and -ENOTCONN for any cpg failure (after undoing the earlier
	   steps).
	   Fixes: the cpg handle used to be finalized twice when cpg_join()
	   failed (once inline, once at fail_client); the name is now formatted
	   with a bounded snprintf(). */
	int gfs_join_mountgroup(struct mountgroup *mg)
	{
		cpg_error_t error;
		cpg_handle_t h;
		struct cpg_name name;
		int i = 0, fd, ci, rv;

		/* I think this registration with dlm_controld could be done
		   just about anywhere before we do the mount(2). */
		rv = dlmc_fs_register(dlmcontrol_fd, mg->name);
		if (rv) {
			log_error("dlmc_fs_register failed %d", rv);
			return rv;
		}

		error = cpg_initialize(&h, &cpg_callbacks);
		if (error != CPG_OK) {
			log_error("cpg_initialize error %d", error);
			goto fail;
		}

		cpg_fd_get(h, &fd);

		ci = client_add(fd, process_mountgroup_cpg, NULL);

		mg->cpg_handle = h;
		mg->cpg_client = ci;
		mg->cpg_fd = fd;
		mg->kernel_stopped = 1;
		mg->joining = 1;

		memset(&name, 0, sizeof(name));
		/* bounded: truncates rather than overrunning name.value */
		snprintf(name.value, sizeof(name.value), "gfs:%s", mg->name);
		name.length = strlen(name.value) + 1;

		/* TODO: allow global_id to be set in cluster.conf? */
		mg->id = cpgname_to_crc(name.value, name.length);

	retry:
		error = cpg_join(h, &name);
		if (error == CPG_ERR_TRY_AGAIN) {
			sleep(1);
			/* log every tenth retry only */
			if (!(++i % 10))
				log_error("cpg_join error retrying");
			goto retry;
		}
		if (error != CPG_OK) {
			log_error("cpg_join error %d", error);
			/* the handle is finalized exactly once, at fail_client */
			goto fail_client;
		}

		return 0;

	fail_client:
		client_dead(ci);
		cpg_finalize(h);
	fail:
		dlmc_fs_unregister(dlmcontrol_fd, mg->name);
		return -ENOTCONN;
	}

	/* If mount(2) fails, we'll often get two leaves, one from seeing the remove
	uevent, and the other from mount.gfs. I suspect they could arrive in either
	order. We can just ignore the second. The second would either not find
	the mg here, or would see mg->leaving of 1 from the first. */

	void gfs_leave_mountgroup(char *mgname, int mnterr)
	{
	struct mountgroup *mg;
	cpg_error_t error;
	struct cpg_name name;
	int i = 0;

	mg = find_mg(mgname);
	if (!mg) {
	log_debug("leave: %s not found", mgname);
	return;
	}

	if (mg->leaving) {
	log_debug("leave: %s already leaving", mgname);
	return;
	}

	mg->leaving = 1;

	memset(&name, 0, sizeof(name));
	sprintf(name.value, "gfs:%s", mg->name);
	name.length = strlen(name.value) + 1;

	retry:
	error = cpg_leave(mg->cpg_handle, &name);
	if (error == CPG_ERR_TRY_AGAIN) {
	sleep(1);
	if (!(++i % 10))
	log_error("cpg_leave error retrying");
	goto retry;
	}
	if (error != CPG_OK)
	log_error("cpg_leave error %d", error);
	}

	int setup_cpg(void)
	{
	cpg_error_t error;

	error = cpg_initialize(&libcpg_handle, &cpg_callbacks);
	if (error != CPG_OK) {
	log_error("setup_cpg cpg_initialize error %d", error);
	return -1;
	}

	/* join "gfs_controld" cpg to interact with other daemons in
	the cluster before we start processing uevents? Could this
	also help in handling transient partitions? */

	return 0;
	}

	/* Connect via dlmc_fs_connect(), store the result in dlmcontrol_fd and
	   return that same value. */
	int setup_dlmcontrol(void)
	{
		int fd = dlmc_fs_connect();

		dlmcontrol_fd = fd;
		return fd;
	}

	int set_mountgroup_info(struct mountgroup mg, struct gfsc_mountgroup out)
	{
	struct change cg, last = NULL;

	strncpy(out->name, mg->name, GFS_MOUNTGROUP_LEN);
	out->global_id = mg->id;

	if (mg->joining)
	out->flags \|= GFSC_MF_JOINING;
	if (mg->leaving)
	out->flags \|= GFSC_MF_LEAVING;
	if (mg->kernel_stopped)
	out->flags \|= GFSC_MF_KERNEL_STOPPED;
	if (mg->kernel_mount_done)
	out->flags \|= GFSC_MF_KERNEL_MOUNT_DONE;
	if (mg->kernel_mount_error)
	out->flags \|= GFSC_MF_KERNEL_MOUNT_ERROR;
	if (mg->first_recovery_needed)
	out->flags \|= GFSC_MF_FIRST_RECOVERY_NEEDED;
	if (mg->first_recovery_msg)
	out->flags \|= GFSC_MF_FIRST_RECOVERY_MSG;
	if (mg->local_recovery_busy)
	out->flags \|= GFSC_MF_LOCAL_RECOVERY_BUSY;

	if (!mg->started_change)
	goto next;

	cg = mg->started_change;

	out->cg_prev.member_count = cg->member_count;
	out->cg_prev.joined_count = cg->joined_count;
	out->cg_prev.remove_count = cg->remove_count;
	out->cg_prev.failed_count = cg->failed_count;
	out->cg_prev.combined_seq = cg->combined_seq;
	out->cg_prev.seq = cg->seq;

	next:
	if (list_empty(&mg->changes))
	goto out;

	list_for_each_entry(cg, &mg->changes, list)
	last = cg;

	cg = list_first_entry(&mg->changes, struct change, list);

	out->cg_next.member_count = cg->member_count;
	out->cg_next.joined_count = cg->joined_count;
	out->cg_next.remove_count = cg->remove_count;
	out->cg_next.failed_count = cg->failed_count;
	out->cg_next.combined_seq = last->seq;
	out->cg_next.seq = cg->seq;

	/* FIXME: use real definitions for these conditions
	(also in dlm_controld) */

	if (cg->state == CGST_WAIT_CONDITIONS)
	out->cg_next.wait_condition = 4;
	if (!mg->kernel_mount_done)
	out->cg_next.wait_condition = 1;
	if (mg->dlm_notify_nodeid)
	out->cg_next.wait_condition = 2;
	if (poll_dlm)
	out->cg_next.wait_condition = 3;

	if (cg->state == CGST_WAIT_MESSAGES)
	out->cg_next.wait_messages = 1;
	out:
	return 0;
	}

	/* Fill a gfsc_node query record for nodeid.
	   Membership flags come from the member entry in cg (cg may be NULL,
	   in which case the node is treated as a non-member); the remaining
	   fields come from the node history, if any.  Flags are OR-ed in, so
	   the caller supplies a zeroed record.  Always returns 0.
	   The pointer declarators and the '|=' operators had been garbled in
	   this block and are restored here. */
	static int _set_node_info(struct mountgroup *mg, struct change *cg, int nodeid,
				  struct gfsc_node *node)
	{
		struct member *m = NULL;
		struct node *n;

		node->nodeid = nodeid;

		if (cg)
			m = find_memb(cg, nodeid);
		if (!m)
			goto history;

		node->flags |= GFSC_NF_MEMBER;

		if (m->start)
			node->flags |= GFSC_NF_START;
		if (m->disallowed)
			node->flags |= GFSC_NF_DISALLOWED;

	history:
		n = get_node_history(mg, nodeid);
		if (!n)
			goto out;

		node->jid = n->jid;

		if (n->kernel_mount_done)
			node->flags |= GFSC_NF_KERNEL_MOUNT_DONE;
		if (n->kernel_mount_error)
			node->flags |= GFSC_NF_KERNEL_MOUNT_ERROR;
		if (n->check_dlm)
			node->flags |= GFSC_NF_CHECK_DLM;
		if (n->ro)
			node->flags |= GFSC_NF_READONLY;
		if (n->spectator)
			node->flags |= GFSC_NF_SPECTATOR;

		node->added_seq = n->added_seq;
		node->removed_seq = n->removed_seq;
		node->failed_reason = n->failed_reason;
	out:
		return 0;
	}

	int set_node_info(struct mountgroup mg, int nodeid, struct gfsc_node node)
	{
	struct change *cg;

	if (!list_empty(&mg->changes)) {
	cg = list_first_entry(&mg->changes, struct change, list);
	return _set_node_info(mg, cg, nodeid, node);
	}

	return _set_node_info(mg, mg->started_change, nodeid, node);
	}

	/* Build an array with one gfsc_mountgroup record per entry of the
	   global mountgroups list.
	   - count: receives the number of records
	   - mgs_out: receives the malloc'ed, zero-initialised array; freeing
	     it is left to the caller
	   Returns 0 on success, -ENOMEM if the array cannot be allocated.
	   The pointer declarators had been lost from the signature and the
	   locals; the body writes through both parameters, so they are
	   restored here.
	   NOTE(review): with an empty list this calls malloc(0), which may
	   return NULL and be reported as -ENOMEM. */
	int set_mountgroups(int *count, struct gfsc_mountgroup **mgs_out)
	{
		struct mountgroup *mg;
		struct gfsc_mountgroup *mgs, *mgp;
		int mg_count = 0;

		list_for_each_entry(mg, &mountgroups, list)
			mg_count++;

		mgs = malloc(mg_count * sizeof(struct gfsc_mountgroup));
		if (!mgs)
			return -ENOMEM;
		memset(mgs, 0, mg_count * sizeof(struct gfsc_mountgroup));

		mgp = mgs;
		list_for_each_entry(mg, &mountgroups, list) {
			set_mountgroup_info(mg, mgp++);
		}

		*count = mg_count;
		*mgs_out = mgs;
		return 0;
	}

	int set_mountgroup_nodes(struct mountgroup mg, int option, int node_count,
	struct gfsc_node **nodes_out)
	{
	struct change *cg;
	struct node *n;
	struct gfsc_node nodes = NULL, nodep;
	struct member *memb;
	int count = 0;

	if (option == GFSC_NODES_ALL) {
	if (!list_empty(&mg->changes))
	cg = list_first_entry(&mg->changes, struct change,list);
	else
	cg = mg->started_change;

	list_for_each_entry(n, &mg->node_history, list)
	count++;

	} else if (option == GFSC_NODES_MEMBERS) {
	if (!mg->started_change)
	goto out;
	cg = mg->started_change;
	count = cg->member_count;

	} else if (option == GFSC_NODES_NEXT) {
	if (list_empty(&mg->changes))
	goto out;
	cg = list_first_entry(&mg->changes, struct change, list);
	count = cg->member_count;
	} else
	goto out;

	nodes = malloc(count * sizeof(struct gfsc_node));
	if (!nodes)
	return -ENOMEM;
	memset(nodes, 0, count * sizeof(struct gfsc_node));
	nodep = nodes;

	if (option == GFSC_NODES_ALL) {
	list_for_each_entry(n, &mg->node_history, list)
	_set_node_info(mg, cg, n->nodeid, nodep++);
	} else {
	list_for_each_entry(memb, &cg->members, list)
	_set_node_info(mg, cg, memb->nodeid, nodep++);
	}
	out:
	*node_count = count;
	*nodes_out = nodes;
	return 0;
	}

	diff --git a/group/gfs_controld/cpg-old.c b/group/gfs_controld/cpg-old.c
	index 56e4f089b..517e2229d 100644
	--- a/group/gfs_controld/cpg-old.c
	+++ b/group/gfs_controld/cpg-old.c
	@@ -1,2472 +1,2445 @@
	#include "gfs_daemon.h"
	#include "config.h"
	#include "cpg-old.h"
	#include "libgroup.h"

	/* Soft assertion: when x is false this only logs the failed expression
	   with its file and line through log_error(); it does not abort or
	   return, so execution continues past a failed ASSERT. */
	#define ASSERT(x) \
	do { \
	if (!(x)) { \
	log_error("Assertion failed on line %d of file %s\n" \
	"Assertion: \"%s\"\n", __LINE__, __FILE__, #x); \
	} \
	} while (0)

	/* NOTE(review): presumably the jid value of a member that has not
	   been assigned a journal yet -- confirm against where jids are set */
	#define JID_INIT -9

	/* mg_member opts bit field */

	/* distinct bits, so they can be combined and tested with '&' */
	enum {
	MEMB_OPT_RW = 1,
	MEMB_OPT_RO = 2,
	MEMB_OPT_SPECT = 4,
	MEMB_OPT_RECOVER = 8,
	};

	/* mg_member state: local_recovery_status, recovery_status */

	/* consecutive values starting at 1 */
	enum {
	RS_NEED_RECOVERY = 1,
	RS_SUCCESS,
	RS_GAVEUP,
	RS_NOFS,
	RS_READONLY,
	};

	/* handle passed to the group_leave() / group_start_done() calls below */
	extern group_handle_t gh;

	/* cpg message protocol
	1.0.0 is initial version
	2.0.0 is incompatible with 1.0.0 and allows plock ownership */
	static unsigned int protocol_v100[3] = {1, 0, 0};
	static unsigned int protocol_v200[3] = {2, 0, 0};
	/* version stamped into outgoing message headers by
	   send_group_message_old() */
	static unsigned int protocol_active[3];

	static struct list_head withdrawn_mounts;
	static struct cpg_name daemon_name;


	/* forward declaration; the definition is further down the file */
	static void send_journals(struct mountgroup *mg, int nodeid);


	/* Map a MSG_* message type to its name for log output; any other value
	   yields "unknown".  The returned strings are literals and must not be
	   modified or freed. */
	static char *msg_name(int type)
	{
		char *label = "unknown";

		switch (type) {
		case MSG_JOURNAL:
			label = "MSG_JOURNAL";
			break;
		case MSG_OPTIONS:
			label = "MSG_OPTIONS";
			break;
		case MSG_REMOUNT:
			label = "MSG_REMOUNT";
			break;
		case MSG_PLOCK:
			label = "MSG_PLOCK";
			break;
		case MSG_MOUNT_STATUS:
			label = "MSG_MOUNT_STATUS";
			break;
		case MSG_RECOVERY_STATUS:
			label = "MSG_RECOVERY_STATUS";
			break;
		case MSG_RECOVERY_DONE:
			label = "MSG_RECOVERY_DONE";
			break;
		case MSG_WITHDRAW:
			label = "MSG_WITHDRAW";
			break;
		}
		return label;
	}

	/* Multicast buf (len bytes) to the cpg behind handle h with agreed
	   ordering.  While cpg reports TRY_AGAIN the send is retried after a
	   1000 microsecond sleep, with no upper bound on retries; every 100th
	   retry is logged.  type is only used to name the message in logs.
	   Returns 0 on success, -1 on any other cpg error. */
	static int _send_message(cpg_handle_t h, void *buf, int len, int type)
	{
		struct iovec iov;
		cpg_error_t error;
		int retries = 0;

		iov.iov_base = buf;
		iov.iov_len = len;

		for (;;) {
			error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1);
			if (error != CPG_ERR_TRY_AGAIN)
				break;
			retries++;
			usleep(1000);
			if (!(retries % 100))
				log_error("cpg_mcast_joined retry %d %s",
					  retries, msg_name(type));
		}

		if (error != CPG_OK) {
			log_error("cpg_mcast_joined error %d handle %llx %s",
				  error, (unsigned long long)h, msg_name(type));
			return -1;
		}

		if (retries)
			log_debug("cpg_mcast_joined retried %d %s",
				  retries, msg_name(type));

		return 0;
	}

	int send_group_message_old(struct mountgroup mg, int len, char buf)
	{
	struct gdlm_header hd = (struct gdlm_header ) buf;
	int type = hd->type;

	hd->version[0] = cpu_to_le16(protocol_active[0]);
	hd->version[1] = cpu_to_le16(protocol_active[1]);
	hd->version[2] = cpu_to_le16(protocol_active[2]);
	hd->type = cpu_to_le16(hd->type);
	hd->nodeid = cpu_to_le32(hd->nodeid);
	hd->to_nodeid = cpu_to_le32(hd->to_nodeid);
	memcpy(hd->name, mg->name, strlen(mg->name));

	return _send_message(libcpg_handle, buf, len, type);
	}

	static struct mg_member find_memb_nodeid(struct mountgroup mg, int nodeid)
	{
	struct mg_member *memb;

	list_for_each_entry(memb, &mg->members, list) {
	if (memb->nodeid == nodeid)
	return memb;
	}
	return NULL;
	}

	static struct mg_member find_memb_jid(struct mountgroup mg, int jid)
	{
	struct mg_member *memb;

	list_for_each_entry(memb, &mg->members, list) {
	if (memb->jid == jid)
	return memb;
	}
	return NULL;
	}

	/* Send the join result to the mount client unless the reply is being
	   delayed (a delay only applies while the result is zero).  With a
	   zero result the client is marked as notified.  With a non-zero
	   result we leave the group: immediately if our own member entry is
	   finished, otherwise via group_leave_on_finish.
	   NOTE(review): the lookup of our own member is not checked for NULL
	   before ->finished is read. */
	static void notify_mount_client(struct mountgroup *mg)
	{
		struct mg_member *self;

		if (mg->mount_client_delay && !mg->mount_client_result) {
			log_group(mg, "notify_mount_client delayed");
			return;
		}

		client_reply_join_full(mg, mg->mount_client_result);

		if (!mg->mount_client_result) {
			mg->mount_client_notified = 1;
			return;
		}

		log_group(mg, "leaving due to mount error: %d",
			  mg->mount_client_result);

		self = find_memb_nodeid(mg, our_nodeid);
		if (!self->finished) {
			log_group(mg, "delay leave until after join");
			mg->group_leave_on_finish = 1;
			return;
		}

		group_leave(gh, mg->name);
	}

	/* we can receive recovery_status messages from other nodes doing start before
	we actually process the corresponding start callback ourselves */

	void save_message_old(struct mountgroup mg, char buf, int len, int from,
	int type)
	{
	struct save_msg *sm;

	sm = malloc(sizeof(struct save_msg) + len);
	if (!sm)
	return;
	memset(sm, 0, sizeof(struct save_msg) + len);

	memcpy(&sm->buf, buf, len);
	sm->type = type;
	sm->len = len;
	sm->nodeid = from;

	log_group(mg, "save %s from %d len %d", msg_name(type), from, len);

	list_add_tail(&sm->list, &mg->saved_messages);
	}

	/* Returns the nodeid of the first member carrying MEMB_OPT_RECOVER in
	   its opts, or 0 if no member has it. */
	static int first_mounter_recovery(struct mountgroup *mg)
	{
		struct mg_member *cur;
		int found = 0;

		list_for_each_entry(cur, &mg->members, list) {
			if (!(cur->opts & MEMB_OPT_RECOVER))
				continue;
			found = cur->nodeid;
			break;
		}
		return found;
	}

	static int local_first_mounter_recovery(struct mountgroup *mg)
	{
	int nodeid;

	nodeid = first_mounter_recovery(mg);
	if (nodeid == our_nodeid)
	return 1;
	return 0;
	}

	int remote_first_mounter_recovery(struct mountgroup *mg)
	{
	int nodeid;

	nodeid = first_mounter_recovery(mg);
	if (nodeid && (nodeid != our_nodeid))
	return 1;
	return 0;
	}

	/* Log and call group_start_done() for the start event number recorded
	   in mg->start_event_nr. */
	static void start_done(struct mountgroup *mg)
	{
	log_group(mg, "start_done %d", mg->start_event_nr);
	group_start_done(gh, mg->name, mg->start_event_nr);
	}

	/* Send a header-only MSG_WITHDRAW message from this node, with
	   to_nodeid 0.  Does nothing if the buffer cannot be allocated; the
	   send result is not checked. */
	void send_withdraw_old(struct mountgroup *mg)
	{
		int len = sizeof(struct gdlm_header);
		struct gdlm_header *hd;
		char *buf;

		/* zero-filled allocation */
		buf = calloc(1, len);
		if (!buf)
			return;

		hd = (struct gdlm_header *)buf;
		hd->to_nodeid = 0;
		hd->nodeid = our_nodeid;
		hd->type = MSG_WITHDRAW;

		log_group(mg, "send_withdraw");

		send_group_message_old(mg, len, buf);

		free(buf);
	}

	static void receive_withdraw(struct mountgroup mg, char buf, int len, int from)
	{
	struct mg_member *memb;

	memb = find_memb_nodeid(mg, from);
	if (!memb) {
	log_group(mg, "receive_withdraw no member %d", from);
	return;
	}
	log_group(mg, "receive_withdraw from %d", from);
	memb->withdrawing = 1;

	if (from == our_nodeid)
	group_leave(gh, mg->name);
	}

	#define SEND_RS_INTS 3

	/* Send a MSG_RECOVERY_STATUS message listing every entry of
	   mg->members_gone whose local_recovery_status is RS_SUCCESS.  The
	   payload after the header is SEND_RS_INTS little-endian ints per
	   entry: nodeid, jid, status.  A message is sent even when no entry
	   qualifies.  Does nothing if the buffer cannot be allocated. */
	static void send_recovery_status(struct mountgroup *mg)
	{
		struct gdlm_header *hd;
		struct mg_member *gone;
		int len, *slot, n = 0;
		char *buf;

		/* first pass: count the entries to size the message */
		list_for_each_entry(gone, &mg->members_gone, list) {
			if (gone->local_recovery_status == RS_SUCCESS)
				n++;
		}

		len = sizeof(struct gdlm_header) + (n * SEND_RS_INTS * sizeof(int));

		buf = calloc(1, len);
		if (!buf)
			return;

		hd = (struct gdlm_header *)buf;
		hd->type = MSG_RECOVERY_STATUS;
		hd->nodeid = our_nodeid;
		hd->to_nodeid = 0;
		slot = (int *) (buf + sizeof(struct gdlm_header));

		/* second pass: emit one triple per successful entry */
		list_for_each_entry(gone, &mg->members_gone, list) {
			if (gone->local_recovery_status != RS_SUCCESS)
				continue;
			*slot++ = cpu_to_le32(gone->nodeid);
			*slot++ = cpu_to_le32(gone->jid);
			*slot++ = cpu_to_le32(gone->local_recovery_status);
		}

		log_group(mg, "send_recovery_status for %d nodes len %d", n, len);

		send_group_message_old(mg, len, buf);

		free(buf);
	}

	/* Note: we can get more than one node reporting success in recovering
	the journal for a failed node. The first has really recovered it,
	the rest have found the fs clean and report success. */

	static void _receive_recovery_status(struct mountgroup mg, char buf, int len,
	int from)
	{
	struct mg_member *memb;
	int *p, n, i, nodeid, jid, status, found = 0;

	n = (len - sizeof(struct gdlm_header)) / (SEND_RS_INTS * sizeof(int));

	p = (int *) (buf + sizeof(struct gdlm_header));

	for (i = 0; i < n; i++) {
	nodeid = le32_to_cpu(p[i * SEND_RS_INTS]);
	jid = le32_to_cpu(p[i * SEND_RS_INTS + 1]);
	status = le32_to_cpu(p[i * SEND_RS_INTS + 2]);

	ASSERT(status == RS_SUCCESS);

	found = 0;
	list_for_each_entry(memb, &mg->members_gone, list) {
	if (memb->nodeid != nodeid)
	continue;
	ASSERT(memb->jid == jid);
	ASSERT(memb->recovery_status == RS_NEED_RECOVERY \|\|
	memb->recovery_status == RS_SUCCESS);
	memb->recovery_status = status;
	found = 1;
	break;
	}

	log_group(mg, "receive_recovery_status from %d len %d "
	"nodeid %d jid %d status %d found %d",
	from, len, nodeid, jid, status, found);
	}

	if (from == our_nodeid)
	start_done(mg);
	}

	/* Replay and free every saved MSG_RECOVERY_STATUS message; saved
	   messages of other types stay on the list.
	   The '*' declarators of the two iterator cursors had been lost and
	   are restored here. */
	static void process_saved_recovery_status(struct mountgroup *mg)
	{
		struct save_msg *sm, *sm2;

		if (list_empty(&mg->saved_messages))
			return;

		log_group(mg, "process_saved_recovery_status");

		/* _safe iteration: entries are unlinked and freed in the loop */
		list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
			if (sm->type != MSG_RECOVERY_STATUS)
				continue;
			_receive_recovery_status(mg, sm->buf, sm->len, sm->nodeid);
			list_del(&sm->list);
			free(sm);
		}
	}

	/* Choose the next first mounter: the eligible member with the lowest
	   nodeid gets MEMB_OPT_RECOVER set in its opts.  A member is skipped
	   if its jid is -2 or JID_INIT, if it is a spectator, read-only or
	   withdrawing, or if its mount status has already been received.
	   Logs when no member is eligible.
	   Fixes: lost pointer declarators and garbled '||' / '|=' restored;
	   the literal -9 is spelled JID_INIT (same value). */
	static void assign_next_first_mounter(struct mountgroup *mg)
	{
		struct mg_member *memb, *next = NULL;
		int low = -1;

		list_for_each_entry(memb, &mg->members, list) {
			/* -2 is what _receive_mount_status() assigns to a first
			   mounter whose mount failed */
			if (memb->jid == -2)
				continue;
			if (memb->jid == JID_INIT)
				continue;
			if (memb->spectator || memb->readonly || memb->withdrawing ||
			    memb->ms_kernel_mount_done)
				continue;
			if (low == -1 || memb->nodeid < low) {
				next = memb;
				low = memb->nodeid;
			}
		}

		if (next) {
			log_group(mg, "next first mounter is %d jid %d opts %x",
				  next->nodeid, next->jid, next->opts);
			next->opts |= MEMB_OPT_RECOVER;
			ASSERT(next->jid >= 0);
		} else
			log_group(mg, "no next mounter available yet");
	}

	#define SEND_MS_INTS 4

	/* Send a MSG_MOUNT_STATUS message carrying SEND_MS_INTS little-endian
	   ints after the header: first_mounter, kernel_mount_error and two
	   unused zero slots.  Does nothing if the buffer cannot be allocated. */
	void send_mount_status_old(struct mountgroup *mg)
	{
		struct gdlm_header *hd;
		int *vals;
		int len = sizeof(struct gdlm_header) + (SEND_MS_INTS * sizeof(int));
		char *buf;

		buf = calloc(1, len);
		if (!buf)
			return;

		hd = (struct gdlm_header *)buf;
		hd->type = MSG_MOUNT_STATUS;
		hd->nodeid = our_nodeid;
		hd->to_nodeid = 0;

		vals = (int *) (buf + sizeof(struct gdlm_header));

		vals[0] = cpu_to_le32(mg->first_mounter);
		vals[1] = cpu_to_le32(mg->kernel_mount_error);
		vals[2] = 0; /* unused */
		vals[3] = 0; /* unused */

		log_group(mg, "send_mount_status kernel_mount_error %d "
			  "first_mounter %d",
			  mg->kernel_mount_error,
			  mg->first_mounter);

		send_group_message_old(mg, len, buf);

		free(buf);
	}

	static void _receive_mount_status(struct mountgroup mg, char buf, int len,
	int from)
	{
	struct mg_member memb, us;
	int *p;

	p = (int *) (buf + sizeof(struct gdlm_header));

	memb = find_memb_nodeid(mg, from);
	if (!memb) {
	log_group(mg, "_receive_mount_status no node %d", from);
	return;
	}

	memb->ms_kernel_mount_done = 1;
	memb->ms_first_mounter = le32_to_cpu(p[0]);
	memb->ms_kernel_mount_error = le32_to_cpu(p[1]);

	log_group(mg, "_receive_mount_status from %d kernel_mount_error %d "
	"first_mounter %d opts %x", from,
	memb->ms_kernel_mount_error, memb->ms_first_mounter,
	memb->opts);

	if (memb->opts & MEMB_OPT_RECOVER) {
	ASSERT(memb->ms_first_mounter);
	}
	if (memb->ms_first_mounter) {
	ASSERT(memb->opts & MEMB_OPT_RECOVER);
	}

	if (memb->ms_first_mounter) {
	memb->opts &= ~MEMB_OPT_RECOVER;

	if (!memb->ms_kernel_mount_error) {
	/* the first mounter has successfully mounted, we can
	go ahead and mount now */

	if (mg->mount_client_delay) {
	mg->mount_client_delay = 0;
	notify_mount_client(mg);
	}
	} else {
	/* first mounter mount failed, next low node should be
	made first mounter */

	memb->jid = -2;
	if (from == our_nodeid)
	mg->our_jid = -2;

	assign_next_first_mounter(mg);

	/* if we became the next first mounter, then notify
	mount client */

	us = find_memb_nodeid(mg, our_nodeid);
	if (us->opts & MEMB_OPT_RECOVER) {
	log_group(mg, "we are next first mounter");
	mg->first_mounter = 1;
	mg->first_mounter_done = 0;
	mg->mount_client_delay = 0;
	notify_mount_client(mg);
	}
	}
	}
	}

	static void receive_mount_status(struct mountgroup mg, char buf, int len,
	int from)
	{
	log_group(mg, "receive_mount_status from %d len %d last_cb %d",
	from, len, mg->last_callback);

	if (!mg->got_our_options) {
	log_group(mg, "ignore mount_status from %d", from);
	return;
	}

	if (!mg->got_our_journals)
	save_message_old(mg, buf, len, from, MSG_MOUNT_STATUS);
	else
	_receive_mount_status(mg, buf, len, from);
	}

	/* We delay processing mount_status messages until we receive the journals
	message for our own mount. Our journals message is a snapshot of the memb
	list at the time our options message is received on the remote node. We
	ignore any messages that would change the memb list prior to seeing our own
	options message, and we save any messages that would change the memb list
	after seeing our own options message and before we receive the memb list
	from the journals message. */

	/* Replay and free every saved MSG_MOUNT_STATUS message; saved messages
	   of other types stay on the list.
	   The '*' declarators of the two iterator cursors had been lost and
	   are restored here. */
	static void process_saved_mount_status(struct mountgroup *mg)
	{
		struct save_msg *sm, *sm2;

		if (list_empty(&mg->saved_messages))
			return;

		log_group(mg, "process_saved_mount_status");

		/* _safe iteration: entries are unlinked and freed in the loop */
		list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
			if (sm->type != MSG_MOUNT_STATUS)
				continue;
			_receive_mount_status(mg, sm->buf, sm->len, sm->nodeid);
			list_del(&sm->list);
			free(sm);
		}
	}

	static void receive_recovery_status(struct mountgroup mg, char buf, int len,
	int from)
	{
	switch (mg->last_callback) {
	case DO_STOP:
	save_message_old(mg, buf, len, from, MSG_RECOVERY_STATUS);
	break;
	case DO_START:
	_receive_recovery_status(mg, buf, len, from);
	break;
	default:
	log_group(mg, "receive_recovery_status %d last_callback %d",
	from, mg->last_callback);
	}
	}

	/* tell others that all journals are recovered; they should clear
	memb's from members_gone, clear needs_recovery and unblock locks */

	/* Send a header-only MSG_RECOVERY_DONE message from this node, with
	   to_nodeid 0.  Does nothing if the buffer cannot be allocated; the
	   send result is not checked. */
	static void send_recovery_done(struct mountgroup *mg)
	{
		int len = sizeof(struct gdlm_header);
		struct gdlm_header *hd;
		char *buf;

		/* zero-filled allocation */
		buf = calloc(1, len);
		if (!buf)
			return;

		hd = (struct gdlm_header *)buf;
		hd->to_nodeid = 0;
		hd->nodeid = our_nodeid;
		hd->type = MSG_RECOVERY_DONE;

		send_group_message_old(mg, len, buf);

		free(buf);
	}

	static void receive_recovery_done(struct mountgroup mg, char buf, int len,
	int from)
	{
	struct mg_member memb, safe;

	log_group(mg, "receive_recovery_done from %d needs_recovery %d",
	from, mg->needs_recovery);

	list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
	log_group(mg, "receive_recovery_done clear jid %d nodeid %d",
	memb->jid, memb->nodeid);
	list_del(&memb->list);
	free(memb);
	}

	mg->needs_recovery = 0;
	mg->kernel_stopped = 0; /* for queries */
	set_sysfs(mg, "block", 0);
	}

	/* Send a MSG_REMOUNT message whose payload, after the header, is the
	   string "ro" or "rw" in a MAX_OPTIONS_LEN area.  Does nothing if the
	   buffer cannot be allocated.
	   NOTE(review): this is a diff hunk; '-' lines are the old
	   send_remount(mg, ro) and '+' lines the new send_remount_old(mg, ma).
	   The '+' signature line has lost its '*' declarators in this text
	   (ma is used as ma->options).
	   NOTE(review): in the new version ro is derived with
	   strstr(ma->options, "ro"), which matches "ro" anywhere in the option
	   string, not only as a separate option -- confirm that is intended. */
	-static void send_remount(struct mountgroup *mg, int ro)
	+void send_remount_old(struct mountgroup mg, struct gfsc_mount_args ma)
	{
	struct gdlm_header *hd;
	- int len;
	char *buf;
	+ int len;
	+ int ro = strstr(ma->options, "ro") ? 1 : 0;

	len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;

	buf = malloc(len);
	if (!buf)
	return;
	memset(buf, 0, len);

	hd = (struct gdlm_header *)buf;
	hd->type = MSG_REMOUNT;
	hd->nodeid = our_nodeid;
	hd->to_nodeid = 0;

	strcpy(buf+sizeof(struct gdlm_header), ro ? "ro" : "rw");

	- log_group(mg, "send_remount len %d \"%s\"", len,
	+ log_group(mg, "send_remount_old len %d \"%s\"", len,
	buf+sizeof(struct gdlm_header));

	send_group_message_old(mg, len, buf);

	free(buf);
	}

	/* Handle a remount message from node "from".  The option string after
	   the header is searched for "rw" first and then "ro"; if neither is
	   found the result is -EINVAL and the member is left unchanged.
	   Otherwise the sender's member entry gets its readonly/rw fields and
	   its MEMB_OPT_RO/MEMB_OPT_RW bits updated.  For our own message the
	   mountgroup's rw/ro are updated on success and the result is sent to
	   the remount client.  Unknown senders are logged and ignored.
	   NOTE(review): this is a diff hunk; the '-'/'+' pair near the end
	   changes the client_reply_remount() call.  The signature line has
	   lost its '*' declarators and '\|=' stands for '|=' in this text.
	   NOTE(review): len is not checked, so the strstr() calls rely on the
	   payload being NUL-terminated -- confirm against the sender. */
	static void receive_remount(struct mountgroup mg, char buf, int len, int from)
	{
	struct mg_member *memb;
	char *options;
	int rw = 0, ro = 0;
	int result = 0;

	options = (char *) (buf + sizeof(struct gdlm_header));

	memb = find_memb_nodeid(mg, from);
	if (!memb) {
	log_error("receive_remount: unknown nodeid %d", from);
	return;
	}

	/* "rw" takes precedence when both substrings are present */
	if (strstr(options, "rw"))
	rw = 1;
	else if (strstr(options, "ro"))
	ro = 1;
	else {
	result = -EINVAL;
	goto out;
	}

	/* FIXME: check if we've even fully completed our normal mount yet
	(received our own mount-status?) if not, then disallow remount */

	/* FIXME: going ro->rw may mean we can now do journal or first-mounter
	recovery that we couldn't do before. */

	memb->readonly = ro;
	memb->rw = !ro;

	if (ro) {
	memb->opts &= ~MEMB_OPT_RW;
	memb->opts \|= MEMB_OPT_RO;
	} else {
	memb->opts &= ~MEMB_OPT_RO;
	memb->opts \|= MEMB_OPT_RW;
	}
	out:
	if (from == our_nodeid) {
	if (!result) {
	mg->rw = memb->rw;
	mg->ro = memb->readonly;
	}
	- client_reply_remount(mg, result);
	+ client_reply_remount(mg, mg->remount_client, result);
	}

	log_group(mg, "receive_remount from %d rw=%d ro=%d opts=%x",
	from, memb->rw, memb->readonly, memb->opts);
	}

	/* Copy our own mount mode from mg onto our member entry.  At most one
	   of ro / spectator / rw is applied, tested in that order; the matching
	   MEMB_OPT_* bit is OR-ed into memb->opts.  Asserts that our own nodeid
	   is present in the member list. */
	static void set_our_memb_options(struct mountgroup *mg)
	{
		struct mg_member *memb;

		memb = find_memb_nodeid(mg, our_nodeid);
		ASSERT(memb);

		if (mg->ro) {
			memb->readonly = 1;
			memb->opts |= MEMB_OPT_RO;
		} else if (mg->spectator) {
			memb->spectator = 1;
			memb->opts |= MEMB_OPT_SPECT;
		} else if (mg->rw) {
			memb->rw = 1;
			memb->opts |= MEMB_OPT_RW;
		}
	}

	/* Send a MSG_OPTIONS message carrying our mount option string
	   (mg->mount_args.options) in a MAX_OPTIONS_LEN payload after the
	   header.  Does nothing if the buffer cannot be allocated. */
	static void send_options(struct mountgroup *mg)
	{
		int len = sizeof(struct gdlm_header) + MAX_OPTIONS_LEN;
		struct gdlm_header *hd;
		char *payload;
		char *buf;

		buf = malloc(len);
		if (!buf)
			return;
		memset(buf, 0, len);

		hd = (struct gdlm_header *)buf;
		hd->type = MSG_OPTIONS;
		hd->nodeid = our_nodeid;
		hd->to_nodeid = 0;

		/* at most MAX_OPTIONS_LEN-1 bytes are copied into the zeroed
		   buffer, so the payload always ends in a NUL */
		payload = buf + sizeof(struct gdlm_header);
		strncpy(payload, mg->mount_args.options, MAX_OPTIONS_LEN-1);

		log_group(mg, "send_options len %d \"%s\"", len, payload);

		send_group_message_old(mg, len, buf);

		free(buf);
	}

	/* We set the new member's jid to the lowest unused jid. If we're the lowest
	existing member (by nodeid), then send jid info to the new node. */

	/* Look at rw/ro/spectator status of all existing mounters and whether
	we need to do recovery. Based on that, decide if the current mount
	mode (ro/spectator) is permitted; if not, set jid = -2. If spectator
	mount and it's ok, set jid = -1. If ro or rw mount and it's ok, set
	real jid. */

	static int assign_journal(struct mountgroup mg, struct mg_member new)
	{
	struct mg_member memb, memb_recover = NULL, *memb_mounted = NULL;
	int i, total, rw_count, ro_count, spect_count, invalid_count;

	total = rw_count = ro_count = spect_count = invalid_count = 0;

	list_for_each_entry(memb, &mg->members, list) {
	if (memb->nodeid == new->nodeid)
	continue;
	total++;
	if (memb->jid == -2)
	invalid_count++;
	else if (memb->spectator)
	spect_count++;
	else if (memb->rw)
	rw_count++;
	else if (memb->readonly)
	ro_count++;

	if (memb->opts & MEMB_OPT_RECOVER) {
	memb_recover = memb;
	log_group(mg, "assign_journal: memb %d has OPT_RECOVER",
	memb->nodeid);
	}

	if (memb->ms_kernel_mount_done && !memb->ms_kernel_mount_error)
	memb_mounted = memb;
	}

	log_group(mg, "assign_journal: total %d iv %d rw %d ro %d spect %d "
	"needs_recovery %d", total, invalid_count, rw_count,
	ro_count, spect_count, mg->needs_recovery);

	if (new->spectator) {
	log_group(mg, "assign_journal: new spectator allowed");
	new->jid = -1;
	goto out;
	}

	for (i = 0; i < 1024; i++) {
	memb = find_memb_jid(mg, i);
	if (!memb) {
	new->jid = i;
	break;
	}
	}

	/* Repeat first-mounter recovery: the fs has been mounted and in-use,
	but nodes have failed and none of the current mounters has been able
	to do recovery (all remaining nodes may be ro/spect for example).
	This puts us into the special "needs_recovery" state where new
	mounters are asked to do first-mounter recovery of the fs while
	the current mounters sit in a blocked state. */

	if (mg->needs_recovery) {
	if (!memb_recover) {
	log_group(mg, "assign_journal: needs_recovery: "
	"new memb %d gets OPT_RECOVER",
	new->nodeid);
	new->opts \|= MEMB_OPT_RECOVER;
	} else {
	log_group(mg, "assign_journal: needs_recovery: "
	"new memb %d memb %d has OPT_RECOVER",
	new->nodeid, memb_recover->nodeid);
	}
	goto out;
	}

	/* Initial first-mounter recovery: the fs is coming online, the first
	mg member assumes first-mounter role and other nodes join the mg
	while the first-mounter is working. These non-first mounters wait
	for the first-mounter to finish before notifying mount.gfs. If the
	first-mounter fails, one of them will become the first-mounter. */

	/* it shouldn't be possible to have someone doing first mounter
	recovery and also have someone with the fs fully mounted */

	if (memb_mounted && memb_recover) {
	log_group(mg, "memb_mounted %d memb_recover %d",
	memb_mounted->nodeid, memb_recover->nodeid);
	ASSERT(0);
	}

	/* someone has successfully mounted the fs which means the fs doesn't
	need first mounter recovery */

	if (memb_mounted) {
	log_group(mg, "assign_journal: no first recovery needed %d",
	memb_mounted->nodeid);
	goto out;
	}

	/* someone is currently doing first mounter recovery, they'll send
	mount_status when they're done letting everyone know the result */

	if (memb_recover) {
	log_group(mg, "assign_journal: %d doing first recovery",
	memb_recover->nodeid);
	goto out;
	}

	/* when we received our journals, no one was flagged with OPT_RECOVER
	which means no first mounter recovery is needed or is current */

	if (mg->global_first_recover_done) {
	log_group(mg, "assign_journal: global_first_recover_done");
	goto out;
	}

	/* no one has done kernel mount successfully and no one is doing first
	mounter recovery, the new node gets to try first mounter recovery */

	log_group(mg, "kernel_mount_done %d kernel_mount_error %d "
	"first_mounter %d first_mounter_done %d",
	mg->kernel_mount_done, mg->kernel_mount_error,
	mg->first_mounter, mg->first_mounter_done);

	log_group(mg, "assign_journal: new memb %d gets OPT_RECOVER for: "
	"fs not mounted", new->nodeid);
	new->opts \|= MEMB_OPT_RECOVER;

	out:
	log_group(mg, "assign_journal: new member %d got jid %d opts %x",
	new->nodeid, new->jid, new->opts);

	if (mg->master_nodeid == our_nodeid) {
	store_plocks(mg, new->nodeid);
	send_journals(mg, new->nodeid);
	}
	return 0;
	}

	/* Apply an options message from node 'from': set the sender's mount
	   mode from the option string after the header (spectator, then rw,
	   then ro, by substring match) and run assign_journal() for it.
	   Logs an error and returns if the sender is not a known member. */
	static void _receive_options(struct mountgroup *mg, char *buf, int len,
				     int from)
	{
		struct mg_member *memb;
		char *options;

		options = (char *) (buf + sizeof(struct gdlm_header));

		memb = find_memb_nodeid(mg, from);
		if (!memb) {
			log_error("unknown nodeid %d for options message", from);
			return;
		}

		if (strstr(options, "spectator")) {
			memb->spectator = 1;
			memb->opts |= MEMB_OPT_SPECT;
		} else if (strstr(options, "rw")) {
			memb->rw = 1;
			memb->opts |= MEMB_OPT_RW;
		} else if (strstr(options, "ro")) {
			memb->readonly = 1;
			memb->opts |= MEMB_OPT_RO;
		}

		log_group(mg, "_receive_options from %d rw=%d ro=%d spect=%d opts=%x",
			  from, memb->rw, memb->readonly, memb->spectator, memb->opts);

		assign_journal(mg, memb);
	}

	static void receive_options(struct mountgroup mg, char buf, int len, int from)
	{
	struct gdlm_header hd = (struct gdlm_header )buf;
	struct mg_member *memb;

	log_group(mg, "receive_options from %d len %d last_cb %d",
	from, len, mg->last_callback);

	if (hd->nodeid == our_nodeid) {
	mg->got_our_options = 1;
	mg->save_plocks = 1;
	return;
	}

	if (!mg->got_our_options) {
	log_group(mg, "ignore options from %d", from);
	return;
	}

	/* we can receive an options message before getting the start
	that adds the mounting node that sent the options, or
	we can receive options messages before we get the journals
	message for out own mount */

	memb = find_memb_nodeid(mg, from);

	if (!memb \|\| !mg->got_our_journals)
	save_message_old(mg, buf, len, from, MSG_OPTIONS);
	else
	_receive_options(mg, buf, len, from);
	}

	/* Replay every saved MSG_OPTIONS message through _receive_options(),
	   removing each from mg->saved_messages and freeing it.  Saved messages
	   of other types are left on the list. */
	static void process_saved_options(struct mountgroup *mg)
	{
		struct save_msg *sm, *sm2;

		if (list_empty(&mg->saved_messages))
			return;

		log_group(mg, "process_saved_options");

		/* _safe iteration: processed entries are unlinked and freed */
		list_for_each_entry_safe(sm, sm2, &mg->saved_messages, list) {
			if (sm->type != MSG_OPTIONS)
				continue;
			_receive_options(mg, sm->buf, sm->len, sm->nodeid);
			list_del(&sm->list);
			free(sm);
		}
	}

	#define NUM 3

	/* send nodeid/jid/opts of every member to nodeid */

	/* Send a MSG_JOURNAL message addressed to 'nodeid'.  The payload is
	   NUM little-endian ints per member -- nodeid, jid, opts -- in members
	   list order; the buffer is sized from mg->memb_count.  Does nothing if
	   the buffer cannot be allocated. */
	static void send_journals(struct mountgroup *mg, int nodeid)
	{
		struct gdlm_header *hd;
		struct mg_member *memb;
		int *ids;
		char *buf;
		int len, i = 0;

		len = sizeof(struct gdlm_header) + (mg->memb_count * NUM * sizeof(int));

		buf = malloc(len);
		if (!buf)
			return;
		memset(buf, 0, len);

		hd = (struct gdlm_header *)buf;
		hd->type = MSG_JOURNAL;
		hd->nodeid = our_nodeid;
		hd->to_nodeid = nodeid;

		ids = (int *) (buf + sizeof(struct gdlm_header));

		/* one (nodeid, jid, opts) triple per member */
		list_for_each_entry(memb, &mg->members, list) {
			ids[i++] = cpu_to_le32(memb->nodeid);
			ids[i++] = cpu_to_le32(memb->jid);
			ids[i++] = cpu_to_le32(memb->opts);
		}

		log_group(mg, "send_journals to %d len %d count %d", nodeid, len, i);

		send_group_message_old(mg, len, buf);

		free(buf);
	}

	/* Act on the jid assigned to us (mg->our_jid).  Either the mount
	   client is notified right away, or -- when another node is doing
	   first-mounter recovery -- the notification is delayed. */
	static void received_our_jid(struct mountgroup *mg)
	{
		log_group(mg, "received_our_jid %d", mg->our_jid);

		/* jid -2 means we are not permitted to mount the fs; probably
		   because we're trying to mount readonly but the next mounter
		   is required to be rw */
		if (mg->our_jid == -2) {
			mg->mount_client_result = -EUCLEAN;
			notify_mount_client(mg);
			return;
		}

		/* fs needs recovery and existing mounters can't recover it,
		   i.e. they're spectator/readonly or the first mounter's
		   mount(2) failed, so we're told to do first-mounter recovery
		   on the fs. */
		if (local_first_mounter_recovery(mg)) {
			log_group(mg, "we're told to do first mounter recovery");
			mg->first_mounter = 1;
			mg->first_mounter_done = 0;
			mg->mount_client_delay = 0;
			mg->save_plocks = 0;
			notify_mount_client(mg);
			return;
		}

		if (remote_first_mounter_recovery(mg)) {
			/* delay notifying mount client until we get a successful
			   mount status from the first mounter */
			log_group(mg, "other node doing first mounter recovery, "
				  "set mount_client_delay");
			mg->mount_client_delay = 1;
			mg->save_plocks = 0;
			return;
		}

		/* ordinary mount: load plock state, then notify the client */
		retrieve_plocks(mg);
		mg->save_plocks = 0;
		process_saved_plocks(mg);
		notify_mount_client(mg);
	}

	static void _receive_journals(struct mountgroup mg, char buf, int len,
	int from)
	{
	struct mg_member memb, memb2;
	struct gdlm_header *hd;
	int *ids, count, i, nodeid, jid, opts;
	int current_first_recover = 0;

	hd = (struct gdlm_header *)buf;

	count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));
	ids = (int *) (buf + sizeof(struct gdlm_header));

	for (i = 0; i < count; i++) {
	nodeid = le32_to_cpu(ids[i * NUM]);
	jid = le32_to_cpu(ids[i * NUM + 1]);
	opts = le32_to_cpu(ids[i * NUM + 2]);

	log_debug("receive nodeid %d jid %d opts %x",
	nodeid, jid, opts);

	memb = find_memb_nodeid(mg, nodeid);
	memb2 = find_memb_jid(mg, jid);

	if (!memb \|\| memb2) {
	log_error("invalid journals message "
	"nodeid %d jid %d opts %x",
	nodeid, jid, opts);
	}
	if (!memb)
	continue;

	memb->jid = jid;

	if (nodeid == our_nodeid) {
	mg->our_jid = jid;
	/* set_our_memb_options() sets rest */
	if (opts & MEMB_OPT_RECOVER)
	memb->opts \|= MEMB_OPT_RECOVER;
	} else {
	memb->opts = opts;
	if (opts & MEMB_OPT_RO)
	memb->readonly = 1;
	else if (opts & MEMB_OPT_RW)
	memb->rw = 1;
	else if (opts & MEMB_OPT_SPECT)
	memb->spectator = 1;
	}

	if (opts & MEMB_OPT_RECOVER)
	current_first_recover = 1;
	}

	/* FIXME: use global_first_recover_done more widely instead of
	as a single special case */
	if (!current_first_recover)
	mg->global_first_recover_done = 1;

	process_saved_mount_status(mg);

	/* we delay processing any options messages from new mounters
	until after we receive the journals message for our own mount */

	process_saved_options(mg);

	received_our_jid(mg);
	}

	static void receive_journals(struct mountgroup mg, char buf, int len,
	int from)
	{
	struct gdlm_header hd = (struct gdlm_header )buf;
	struct mg_member *memb;
	int count;

	count = (len - sizeof(struct gdlm_header)) / (NUM * sizeof(int));

	log_group(mg, "receive_journals from %d to %d len %d count %d cb %d",
	from, hd->to_nodeid, len, count, mg->last_callback);

	/* just like we can receive an options msg from a newly added node
	before we get the start adding it, we can receive the journals
	message sent to it before we get the start adding it */

	memb = find_memb_nodeid(mg, hd->to_nodeid);
	if (!memb) {
	log_group(mg, "receive_journals from %d to unknown %d",
	from, hd->to_nodeid);
	return;
	}
	memb->needs_journals = 0;

	if (hd->to_nodeid && hd->to_nodeid != our_nodeid)
	return;

	if (mg->got_our_journals) {
	log_group(mg, "receive_journals from %d duplicate", from);
	return;
	}
	mg->got_our_journals = 1;

	_receive_journals(mg, buf, len, from);
	}

	static void add_ordered_member(struct mountgroup mg, struct mg_member new)
	{
	struct mg_member *memb = NULL;
	struct list_head *tmp;
	struct list_head *newlist = &new->list;
	struct list_head *head = &mg->members;

	list_for_each(tmp, head) {
	memb = list_entry(tmp, struct mg_member, list);
	if (new->nodeid < memb->nodeid)
	break;
	}

	if (!memb)
	list_add_tail(newlist, head);
	else {
	/* FIXME: can use list macro here */
	newlist->prev = tmp->prev;
	newlist->next = tmp;
	tmp->prev->next = newlist;
	tmp->prev = newlist;
	}
	}

	/* Allocate a zeroed member entry for 'nodeid' with jid JID_INIT, insert
	   it into the ordered members list and bump memb_count.  The entry is
	   flagged needs_journals unless mg->init is set.
	   Returns 0, or -ENOMEM if allocation fails. */
	static int add_member(struct mountgroup *mg, int nodeid)
	{
		struct mg_member *entry = malloc(sizeof(*entry));

		if (!entry)
			return -ENOMEM;
		memset(entry, 0, sizeof(struct mg_member));

		entry->nodeid = nodeid;
		entry->jid = JID_INIT;

		add_ordered_member(mg, entry);
		mg->memb_count++;

		if (!mg->init)
			entry->needs_journals = 1;

		return 0;
	}

	/* Return 1 if 'nodeid' appears on mg->members, 0 otherwise. */
	static int is_member(struct mountgroup *mg, int nodeid)
	{
		struct mg_member *cur;
		int present = 0;

		list_for_each_entry(cur, &mg->members, list) {
			if (cur->nodeid != nodeid)
				continue;
			present = 1;
			break;
		}
		return present;
	}

	/* Return 1 if 'nodeid' appears on mg->members_gone, 0 otherwise. */
	static int is_removed(struct mountgroup *mg, int nodeid)
	{
		struct mg_member *cur;
		int gone = 0;

		list_for_each_entry(cur, &mg->members_gone, list) {
			if (cur->nodeid != nodeid)
				continue;
			gone = 1;
			break;
		}
		return gone;
	}

	/* New mounters may be waiting for a journals message that a failed node (as
	master) would have sent. If the master failed and we're the new master,
	then send a journals message to any nodes for whom we've not seen a journals
	message. We also need to checkpoint the plock state for the new nodes to
	read after they get their journals message. */

	/* Send a journals message to every member still flagged needs_journals.
	   store_plocks() is called once, just before the first such send, with
	   that first member's nodeid. */
	static void resend_journals(struct mountgroup *mg)
	{
		struct mg_member *cur;
		int plocks_stored = 0;

		list_for_each_entry(cur, &mg->members, list) {
			if (cur->needs_journals) {
				if (!plocks_stored) {
					store_plocks(mg, cur->nodeid);
					plocks_stored = 1;
				}

				log_group(mg, "resend_journals to %d", cur->nodeid);
				send_journals(mg, cur->nodeid);
			}
		}
	}

	/* The master node is the member of the group with the lowest nodeid who
	was also a member of the last "finished" group, i.e. a member of the
	group the last time it got a finish callback. The job of the master
	is to send state info to new nodes joining the group, and doing that
	requires that the master has all the state to send -- a new joining
	node that has the lowest nodeid doesn't have any state, which is why
	we add the "finished" requirement. */

	/* Recompute mg->master_nodeid (lowest nodeid among members with
	   'finished' set) and mg->low_nodeid (lowest nodeid among all members).
	   Either is -1 when no member qualifies. */
	static void update_master_nodeid(struct mountgroup *mg)
	{
		struct mg_member *memb;
		int new = -1, low = -1;

		list_for_each_entry(memb, &mg->members, list) {
			if (low == -1 || memb->nodeid < low)
				low = memb->nodeid;
			/* only finished members are candidates for master */
			if (!memb->finished)
				continue;
			if (new == -1 || memb->nodeid < new)
				new = memb->nodeid;
		}
		mg->master_nodeid = new;
		mg->low_nodeid = low;
	}

	/* This can happen before we receive a journals message for our mount. */

	/* Reconcile mg->members with the node list of a new configuration.
	   Members missing from nodeids[0..num_nodes) are moved to members_gone
	   and flagged for journal recovery where appropriate; nodeids not yet
	   on the list are added.  *pos_out and *neg_out receive the number of
	   members added and removed.  The master nodeid is then recomputed,
	   and master-failover and first-mounter handover are handled. */
	static void recover_members(struct mountgroup *mg, int num_nodes,
				    int *nodeids, int *pos_out, int *neg_out)
	{
		struct mg_member *memb, *safe, *memb_gone_recover = NULL;
		int i, found, id, pos = 0, neg = 0, prev_master_nodeid;
		int master_failed = 0;

		/* move departed nodes from members list to members_gone */

		list_for_each_entry_safe(memb, safe, &mg->members, list) {
			found = 0;
			for (i = 0; i < num_nodes; i++) {
				if (memb->nodeid == nodeids[i]) {
					found = 1;
					break;
				}
			}

			if (!found) {
				neg++;

				list_move(&memb->list, &mg->members_gone);
				memb->gone_event = mg->start_event_nr;
				memb->gone_type = mg->start_type;
				mg->memb_count--;

				memb->tell_gfs_to_recover = 0;
				memb->recovery_status = 0;
				memb->local_recovery_status = 0;

				/* - journal cb for failed or withdrawing nodes
				   - failed node was assigned a journal
				   - no journal cb if failed node was spectator
				   - no journal cb if we've already done a journal cb */

				if ((memb->gone_type == GROUP_NODE_FAILED ||
				     memb->withdrawing) &&
				    memb->jid != JID_INIT &&
				    memb->jid != -2 &&
				    !memb->spectator &&
				    !memb->wait_gfs_recover_done) {
					memb->tell_gfs_to_recover = 1;
					memb->recovery_status = RS_NEED_RECOVERY;
					memb->local_recovery_status = RS_NEED_RECOVERY;
				}

				log_group(mg, "remove member %d tell_gfs_to_recover %d "
					  "(%d,%d,%d,%d,%d,%d)",
					  memb->nodeid, memb->tell_gfs_to_recover,
					  mg->spectator,
					  mg->start_type,
					  memb->withdrawing,
					  memb->jid,
					  memb->spectator,
					  memb->wait_gfs_recover_done);

				if (mg->master_nodeid == memb->nodeid &&
				    memb->gone_type == GROUP_NODE_FAILED)
					master_failed = 1;

				if (memb->opts & MEMB_OPT_RECOVER)
					memb_gone_recover = memb;
			}
		}

		/* add new nodes to members list */

		for (i = 0; i < num_nodes; i++) {
			id = nodeids[i];
			if (is_member(mg, id))
				continue;
			add_member(mg, id);
			pos++;
			log_group(mg, "add member %d", id);
		}

		prev_master_nodeid = mg->master_nodeid;
		update_master_nodeid(mg);

		*pos_out = pos;
		*neg_out = neg;

		log_group(mg, "total members %d master_nodeid %d prev %d",
			  mg->memb_count, mg->master_nodeid, prev_master_nodeid);


		/* The master failed and we're the new master, we need to:

		   - unlink the ckpt that the failed master had open so new ckpts
		     can be created down the road
		   - resend journals msg to any nodes that needed one from the
		     failed master
		   - store plocks in ckpt for the new mounters to read when they
		     get the journals msg from us */

		if (neg && master_failed &&
		    (prev_master_nodeid != -1) &&
		    (prev_master_nodeid != mg->master_nodeid) &&
		    (our_nodeid == mg->master_nodeid)) {
			log_group(mg, "unlink ckpt for failed master %d",
				  prev_master_nodeid);
			unlink_checkpoint(mg);
			resend_journals(mg);
		}

		/* Do we need a new first mounter?

		   If we've not gotten a journals message yet (implies we're mounting)
		   and there's only one node left in the group (us, after removing the
		   failed node), then it's possible that the failed node was doing
		   first mounter recovery, so we need to become first mounter.

		   If we've received a journals message, we can check if the failed
		   node was doing first mounter recovery (MEMB_OPT_RECOVER set) and
		   if so select the next first mounter. */

		if (!neg)
			return;

		if (!mg->got_our_journals && mg->memb_count == 1) {
			log_group(mg, "we are left alone, act as first mounter");
			unlink_checkpoint(mg);
			memb = find_memb_nodeid(mg, our_nodeid);
			memb->jid = 0;
			memb->opts |= MEMB_OPT_RECOVER;
			mg->our_jid = 0;
			mg->first_mounter = 1;
			mg->first_mounter_done = 0;
			mg->got_our_options = 1;
			mg->got_our_journals = 1;
			mg->mount_client_delay = 0;
			notify_mount_client(mg);
			return;
		}

		if (memb_gone_recover) {
			log_group(mg, "failed node %d had MEMB_OPT_RECOVER",
				  memb_gone_recover->nodeid);
			memb_gone_recover->tell_gfs_to_recover = 0;
		}

		if (memb_gone_recover && mg->got_our_journals) {
			assign_next_first_mounter(mg);
			memb = find_memb_nodeid(mg, our_nodeid);
			if (memb->opts & MEMB_OPT_RECOVER) {
				log_group(mg, "first mounter failed, we get "
					  "MEMB_OPT_RECOVER");
				unlink_checkpoint(mg);
				memb->opts |= MEMB_OPT_RECOVER;
				mg->first_mounter = 1;
				mg->first_mounter_done = 0;
				mg->mount_client_delay = 0;
				notify_mount_client(mg);
			}
		}
	}

	int gfs_join_mountgroup_old(struct mountgroup mg, struct gfsc_mount_args ma)
	{
	int rv;

	if (strlen(ma->options) > MAX_OPTIONS_LEN-1) {
	log_error("join: options too long %zu", strlen(ma->options));
	return -EMLINK;
	}

	rv = group_join(gh, mg->name);
	if (rv)
	return -ENOTCONN;
	return 0;
	}

	/* recover_members() discovers which nodes need journal recovery
	and moves the memb structs for those nodes into members_gone
	and sets memb->tell_gfs_to_recover on them */

	/* we don't want to tell gfs-kernel to do journal recovery for a failed
	node in a number of cases:
	- we're a spectator or readonly mount
	- gfs-kernel is currently withdrawing
	- we're mounting and haven't received a journals message yet
	- we're mounting and got a kernel mount error back from mount.gfs
	- we're mounting and haven't notified mount.gfs yet (to do mount(2))
	- we're mounting and got_kernel_mount is 0, i.e. we've not seen a uevent
	related to the kernel mount yet
	(some of the mounting checks should be obviated by others)

	the problem we're trying to avoid here is telling gfs-kernel to do
	recovery when it can't for some reason and then waiting forever for
	a recovery_done signal that will never arrive. */

	static void recover_journals(struct mountgroup *mg)
	{
	struct mg_member *memb;
	int rv;

	if (mg->spectator \|\|
	mg->ro \|\|
	mg->withdraw \|\|
	mg->our_jid == JID_INIT \|\|
	mg->kernel_mount_error \|\|
	!mg->mount_client_notified \|\|
	!mg->got_kernel_mount \|\|
	!mg->kernel_mount_done) {
	log_group(mg, "recover_journals: unable %d,%d,%d,%d,%d,%d,%d,%d",
	mg->spectator,
	mg->ro,
	mg->withdraw,
	mg->our_jid,
	mg->kernel_mount_error,
	mg->mount_client_notified,
	mg->got_kernel_mount,
	mg->kernel_mount_done);

	list_for_each_entry(memb, &mg->members_gone, list) {
	log_group(mg, "member gone %d jid %d "
	"tell_gfs_to_recover %d",
	memb->nodeid, memb->jid,
	memb->tell_gfs_to_recover);

	if (memb->tell_gfs_to_recover) {
	memb->tell_gfs_to_recover = 0;
	memb->local_recovery_status = RS_READONLY;
	}
	}
	start_done(mg);
	return;
	}

	/* we feed one jid into the kernel for recovery instead of all
	at once because we need to get the result of each independently
	through the single recovery_done sysfs file */

	list_for_each_entry(memb, &mg->members_gone, list) {
	if (memb->wait_gfs_recover_done) {
	log_group(mg, "delay new gfs recovery, "
	"wait_gfs_recover_done for nodeid %d jid %d",
	memb->nodeid, memb->jid);
	return;
	}
	}

	list_for_each_entry(memb, &mg->members_gone, list) {
	if (!memb->tell_gfs_to_recover)
	continue;

	log_group(mg, "recover journal %d nodeid %d",
	memb->jid, memb->nodeid);

	rv = set_sysfs(mg, "recover", memb->jid);
	if (rv < 0) {
	memb->local_recovery_status = RS_NOFS;
	continue;
	}
	memb->tell_gfs_to_recover = 0;
	memb->wait_gfs_recover_done = 1;
	return;
	}

	/* no more journals to attempt to recover, if we've been successful
	recovering any then send out status, if not then start_done...
	receiving no status message from us before start_done means we
	didn't successfully recover any journals. If we send out status,
	then delay start_done until we get our own message (so all nodes
	will get the status before finish) */

	list_for_each_entry(memb, &mg->members_gone, list) {
	if (memb->local_recovery_status == RS_SUCCESS) {
	send_recovery_status(mg);
	log_group(mg, "delay start_done until status recvd");
	return;
	}
	}

	start_done(mg);
	}

	/* In some cases, we may be joining a mountgroup with needs_recovery
	set (there are journals that need recovery and current members can't
	recover them because they're ro). In this case, we're told to act
	like the first mounter to cause gfs to try to recover all journals
	when it mounts. When gfs does this, we'll get recovery_done's for
	the individual journals it recovers (ignored) and finally, if all
	journals are ok, an others_may_mount/first_done. */

	/* When gfs does first-mount recovery, the mount(2) fails if it can't
	recover one of the journals. If we get o_m_m, then we know it was
	able to successfully recover all the journals. */

	/* When we're the first mounter, gfs does recovery on all the journals
	and does "recovery_done" callbacks when it finishes each. We ignore
	these and wait for gfs to be finished with all at which point it calls
	others_may_mount() and first_done is set. */

	/* Read the "first_done" sysfs value; when it is non-zero, mark
	   first-mounter recovery as done and send the recovery-done message.
	   Returns the negative read_sysfs_int() error, or 0. */
	static int kernel_recovery_done_first(struct mountgroup *mg)
	{
		int first_done;
		int rv;

		rv = read_sysfs_int(mg, "first_done", &first_done);
		if (rv < 0)
			return rv;

		log_group(mg, "kernel_recovery_done_first first_done %d", first_done);

		if (mg->kernel_mount_done)
			log_group(mg, "FIXME: assuming kernel_mount_done comes after "
				  "first_done");

		if (!first_done)
			return 0;

		mg->first_mounter_done = 1;
		send_recovery_done(mg);
		return 0;
	}

	/* Return 1 if any entry on mg->members_gone has wait_gfs_recover_done
	   set, 0 otherwise. */
	static int need_kernel_recovery_done(struct mountgroup *mg)
	{
		struct mg_member *cur;
		int pending = 0;

		list_for_each_entry(cur, &mg->members_gone, list) {
			if (!cur->wait_gfs_recover_done)
				continue;
			pending = 1;
			break;
		}
		return pending;
	}

	/* Note: when a readonly node fails we do consider its journal (and the
	fs) to need recovery... not sure this is really necessary, but
	the readonly node did "own" a journal so it seems proper to recover
	it even if the node wasn't writing to it. So, if there are 3 ro
	nodes mounting the fs and one fails, gfs on the remaining 2 will
	remain blocked until an rw node mounts, and the next mounter must
	be rw. */

	/* Handle a recovery uevent for the fs whose lock table name is 'table';
	   the mount group name is the part after the first ':'.
	   During first-mounter recovery the event is passed to
	   kernel_recovery_done_first().  Otherwise the finished jid is read
	   from "recover_done", matched against a members_gone entry that is
	   waiting for it, its result is read from "recover_status" and
	   recorded, and recover_journals() is run again.
	   Returns 0 on success or when the event is ignored, -1 for a
	   malformed table name or unknown mount group, or a negative sysfs
	   read error. */
	int process_recovery_uevent_old(char *table)
	{
		struct mountgroup *mg;
		struct mg_member *memb;
		char *name;
		char *ss;
		int rv, jid_done, status, found = 0;

		/* strstr() returns NULL when there is no ':'; reject such a name
		   rather than computing a pointer from NULL */
		name = strstr(table, ":");
		if (!name) {
			log_error("recovery_done: invalid table name %s", table);
			return -1;
		}
		name++;

		mg = find_mg(name);
		if (!mg) {
			log_error("recovery_done: unknown mount group %s", table);
			return -1;
		}

		if (mg->first_mounter && !mg->first_mounter_done)
			return kernel_recovery_done_first(mg);

		rv = read_sysfs_int(mg, "recover_done", &jid_done);
		if (rv < 0)
			return rv;

		/* on a match the loop breaks with memb pointing at the entry;
		   memb is only used below when found is set */
		list_for_each_entry(memb, &mg->members_gone, list) {
			if (memb->jid == jid_done) {
				if (memb->wait_gfs_recover_done) {
					memb->wait_gfs_recover_done = 0;
					found = 1;
				}
				break;
			}
		}

		/* We need to ignore recovery_done callbacks in the case where there
		   are a bunch of recovery_done callbacks for the first mounter, but
		   we detect "first_done" before we've processed all the
		   recovery_done's. */

		if (!found) {
			log_group(mg, "recovery_done jid %d ignored, first %d,%d",
				  jid_done, mg->first_mounter, mg->first_mounter_done);
			return 0;
		}

		rv = read_sysfs_int(mg, "recover_status", &status);
		if (rv < 0) {
			log_group(mg, "recovery_done jid %d nodeid %d sysfs error %d",
				  memb->jid, memb->nodeid, rv);
			memb->local_recovery_status = RS_NOFS;
			goto out;
		}

		switch (status) {
		case LM_RD_GAVEUP:
			/*
			 * This is unfortunate; it's needed for bz 442451 where
			 * gfs-kernel fails to acquire the journal lock on all nodes
			 * because a withdrawing node has not yet called
			 * dlm_release_lockspace() to free its journal lock.  With
			 * this, all nodes should repeatedly try to recover the
			 * journal of the withdrawn node until the withdrawing node
			 * clears its dlm locks, and gfs on each of the remaining nodes
			 * succeeds in doing the recovery.
			 */

			if (memb->withdrawing) {
				log_group(mg, "recovery_done jid %d nodeid %d retry "
					  "for withdraw", memb->jid, memb->nodeid);
				memb->tell_gfs_to_recover = 1;
				memb->wait_gfs_recover_done = 0;
				usleep(500000);
			}

			memb->local_recovery_status = RS_GAVEUP;
			ss = "gaveup";
			break;
		case LM_RD_SUCCESS:
			memb->local_recovery_status = RS_SUCCESS;
			ss = "success";
			break;
		default:
			log_error("recovery_done: jid %d nodeid %d unknown status %d",
				  memb->jid, memb->nodeid, status);
			ss = "unknown";
		}

		log_group(mg, "recovery_done jid %d nodeid %d %s",
			  memb->jid, memb->nodeid, ss);

		/* sanity check */
		if (need_kernel_recovery_done(mg))
			log_error("recovery_done: should be no pending gfs recoveries");

	 out:
		recover_journals(mg);
		return 0;
	}

	-int remount_mountgroup_old(int ci, struct gfsc_mount_args *ma)
	-{
	- struct mountgroup *mg;
	- char *name = strstr(ma->table, ":") + 1;
	- int ro = 0, rw = 0;
	-
	- log_debug("remount: %s ci %d", name, ci);
	-
	- if (!strncmp(ma->options, "ro", 2))
	- ro = 1;
	- else
	- rw = 1;
	-
	- mg = find_mg(name);
	- if (!mg) {
	- log_error("remount: %s not found", name);
	- return -1;
	- }
	-
	- /* no change */
	- if ((mg->ro && ro) \|\| (mg->rw && rw))
	- return 1;
	-
	- mg->remount_client = ci;
	- send_remount(mg, ro);
	- return 0;
	-}
	-
	/* Leave the mount group called 'name'.  An entry on withdrawn_mounts
	   with that name is simply unlinked and freed.  Otherwise the group is
	   marked leaving, a pending start_done() is filled in if we were still
	   waiting on kernel recovery, and group_leave() is called.  mnterr is
	   only used for a sanity-check log message. */
	void gfs_leave_mountgroup_old(char *name, int mnterr)
	{
		struct mountgroup *mg;

		list_for_each_entry(mg, &withdrawn_mounts, list) {
			if (!strcmp(mg->name, name)) {
				log_group(mg, "leave: for withdrawn fs");
				list_del(&mg->list);
				free(mg);
				return;
			}
		}

		mg = find_mg(name);
		if (!mg) {
			log_error("leave: %s not found", name);
			return;
		}

		/* sanity check: we should already have gotten the error from
		   the mount.gfs mount_done; so this shouldn't happen */
		if (mnterr && !mg->kernel_mount_error)
			log_error("leave: mount_error is new %d %d",
				  mg->kernel_mount_error, mnterr);

		mg->leaving = 1;

		/* Check to see if we're waiting for a kernel recovery_done to do a
		   start_done().  If so, call the start_done() here because we won't
		   be getting anything else from gfs-kernel which is now gone. */
		if (need_kernel_recovery_done(mg)) {
			log_group(mg, "leave: fill in start_done");
			start_done(mg);
		}

		group_leave(gh, mg->name);
	}

	/* When mounting a fs, we first join the mountgroup, then tell mount.gfs
	to proceed with the kernel mount. Once we're in the mountgroup, we
	can get a stop callback at any time, which requires us to block the
	fs by setting a sysfs file. If the kernel mount is slow, we can get
	a stop callback and try to set the sysfs file before the kernel mount
	has actually created the sysfs files for the fs. This function delays
	any further processing until the sysfs files exist. */

	/* This function returns 0 when the kernel mount is successfully detected
	and we know that do_stop() will be able to block the fs.
	This function returns a negative error if it detects the kernel mount
	has failed which means there's nothing to stop and do_stop() can assume
	an implicit stop. */

	/* wait for
	- kernel mount to get to the point of creating sysfs files we
	can read (and that do_stop can then use), or
	- kernel mount to fail causing mount.gfs to send us a MOUNT_DONE
	which we read in process_connection() */

	/* Poll until the "id" sysfs file of the fs can be read (returns 0) or
	   the kernel mount is known to have failed (returns the last negative
	   read_sysfs_int() result).  Sleeps 100ms between attempts. */
	static int wait_for_kernel_mount(struct mountgroup *mg)
	{
		int val;
		int rv;

		for (;;) {
			/* This is the standard way we leave this loop, where the
			   kernel mount gets to the point of creating the sysfs files
			   which we see by successfully reading "id".  With the
			   sysfs files in place, do_stop() will be able to block
			   the kernel. */
			rv = read_sysfs_int(mg, "id", &val);
			if (rv == 0)
				break;

			usleep(100000);

			/* kernel_mount_done is set by mount_done_old() which is
			   called by process_connection() if mount.gfs sends
			   MOUNT_DONE. */
			if (mg->kernel_mount_done) {
				/* mount(2) failed, stop becomes implicit */
				if (mg->kernel_mount_error)
					break;

				/* mount(2) was successful and we should be able
				   to read "id" very shortly... */
				continue;
			}

			/* this should either do nothing and return immediately, or
			   read a MOUNT_DONE from mount.gfs and call mount_done_old()
			   which will set kernel_mount_done and set
			   kernel_mount_error */
			process_connection(mg->mount_client);
		}

		return rv;
	}

	/* The processing of new mounters (send/recv options, send/recv journals,
	notify mount.gfs) is not very integrated with the stop/start/finish
	callbacks from libgroup. A start callback just notifies us of a new
	mounter and the options/journals messages drive things from there.
	Recovery for failed nodes _is_ controlled more directly by the
	stop/start/finish callbacks. So, processing new mounters happens
	independently of recovery and of the libgroup callbacks. One place
	where they need to intersect, though, is in stopping/suspending
	gfs-kernel:
	- When we get a stop callback, we need to be certain that gfs-kernel
	is blocked.
	- When a mounter notifies mount.gfs to go ahead, gfs-kernel will
	shortly begin running in an unblocked fashion as it goes through
	the kernel mounting process.
	Given this, we need to be sure that if gfs-kernel is supposed to be
	blocked, we don't notify mount.gfs to go ahead and do the kernel mount
	since that starts gfs-kernel in an unblocked state. */

	/* - if we're unmounting, the kernel is gone, so no problem.
	- if we've just mounted and notified mount.gfs, then wait for kernel
	mount and then block.
	- if we're mounting and have not yet notified mount.gfs, then set
	a flag that delays the notification until block is set to 0. */

	int do_stop(struct mountgroup *mg)
	{
		int err;

		if (mg->first_mounter && !mg->kernel_mount_done) {
			log_group(mg, "do_stop skip during first mount recovery");
		} else {
			while (1) {
				err = set_sysfs(mg, "block", 1);
				if (err == 0) {
					mg->kernel_stopped = 1; /* for queries */
					break;
				}

				/* Blocking gfs failed.  Possible reasons:
				   1. the kernel instance of gfs existed before but is
				      gone now, i.e. it was unmounted and is therefore
				      implicitly stopped
				   2. we are mounting and gfs has not created the
				      sysfs files for this fs yet
				   3. we are mounting and mount(2) returned an error
				   4. we are mounting but have not yet told mount.gfs
				      to go ahead with mount(2)
				   Case 2 can also turn into case 3 while we sit in
				   wait_for_kernel_mount(). */

				if (mg->got_kernel_mount) {
					log_group(mg, "do_stop skipped fs unmounted");
					break;
				}

				if (!mg->mount_client_notified) {
					log_group(mg, "do_stop causes mount_client_delay");
					mg->mount_client_delay = 1;
					break;
				}

				if (mg->kernel_mount_error) {
					log_group(mg, "do_stop ignore, failed mount");
					break;
				}

				log_group(mg, "do_stop wait for kernel mount");
				err = wait_for_kernel_mount(mg);
				if (err < 0)
					break;

				/* sysfs files are there now; retry the block */
			}
		}

		/* the stop is always acknowledged, whichever path was taken */
		group_stop_done(gh, mg->name);
		return 0;
	}

	/* After a start that initiated a recovery, everyone will go and see if they
	can do recovery and try if they can. If a node can't, it does start_done,
	if it tries and fails, it does start_done, if it tries and succeeds it
	sends a message and then does start_done once it receives it back. So,
	when we get a finish we know that we have all the results from the recovery
	cycle and can judge if everything is recovered properly or not. If so, we
	can unblock locks (in the finish), if not, we leave them blocked (in the
	finish).

	If we leave locks blocked in the finish, then they can only be unblocked
	after someone is able to do the recovery that's needed. So, leaving locks
	blocked in a finish because recovery hasn't worked puts us into a special
	state: the fs needs recovery, none of the current mounters has been able to
	recover it, all current mounters have locks blocked in gfs, new mounters
	are allowed, nodes can unmount, new mounters are asked to do first-mounter
	recovery, if one of them succeeds then we can all clear this special state
	and unblock locks (the unblock would happen upon recving the success
	message from the new pseudo-first mounter, not as part of a finish), future
	finishes would then go back to being able to unblock locks.

	While in this special state, a new node has been added and asked to do
	first-mounter recovery, other nodes can also be added while the new
	first-mounter is active. These other nodes don't notify mount.gfs.
	They'll receive the result of the first mounter and if it succeeded they'll
	notify mount.gfs, otherwise one of them will become the next first-mounter
	and notify mount.gfs. */

	/* Handle a finish callback: drop members_gone entries whose journals
	   no longer need recovery, mark current members finished, and unblock
	   the kernel unless some journal still needs recovery.  Always
	   returns 0. */

	int do_finish(struct mountgroup *mg)
	{
		/* both are list cursors and must be pointers: they are
		   dereferenced with -> and passed to free() below */
		struct mg_member *memb, *safe;

		log_group(mg, "finish %d needs_recovery %d", mg->last_finish,
			  mg->needs_recovery);

		/* members_gone list are the members that were removed from the
		   members list when processing a start.  members are removed
		   from members_gone if their journals have been recovered */

		list_for_each_entry_safe(memb, safe, &mg->members_gone, list) {
			if (!memb->recovery_status) {
				list_del(&memb->list);
				free(memb);
			} else if (memb->recovery_status == RS_SUCCESS) {
				ASSERT(memb->gone_event <= mg->last_finish);
				log_group(mg, "finish: recovered jid %d nodeid %d",
					  memb->jid, memb->nodeid);
				list_del(&memb->list);
				free(memb);
			} else {
				/* entry stays on the list so it can be retried */
				log_error("%s finish: needs recovery jid %d nodeid %d "
					  "status %d", mg->name, memb->jid,
					  memb->nodeid, memb->recovery_status);
				mg->needs_recovery = 1;
			}
		}

		list_for_each_entry(memb, &mg->members, list)
			memb->finished = 1;

		if (mg->group_leave_on_finish) {
			log_group(mg, "leaving group after delay for join to finish");
			group_leave(gh, mg->name);
			mg->group_leave_on_finish = 0;
			return 0;
		}

		if (!mg->needs_recovery) {
			mg->kernel_stopped = 0; /* for queries */
			set_sysfs(mg, "block", 0);

			/* we may have been holding back our local mount due to
			   being stopped/blocked */
			if (mg->mount_client_delay && !first_mounter_recovery(mg)) {
				mg->mount_client_delay = 0;
				notify_mount_client(mg);
			}
		} else
			log_group(mg, "finish: leave locks blocked for needs_recovery");

		return 0;
	}

	/*
	* - require the first mounter to be rw, not ro or spectator.
	*
	* - if rw mounter fails, leaving only spectator mounters,
	* require the next mounter to be rw, more ro/spectator mounts should
	* fail until the fs is mounted rw.
	*
	* - if last rw mounter fails and ro mounters are left (possibly with
	* some spectators), disallow any ro->rw remounts, leave gfs blocked,
	* require next mounter to be rw, have next mounter do first mount
	* gfs/journal recovery.
	*/

	/* called for the initial start on the node that's first to mount the fs.
	(it should be ok to let the first mounter be a spectator, gfs should do
	first recovery and bail out if there are any dirty journals) */

	/* FIXME: if journal recovery fails on any of the journals, we should
	fail the mount */

	/* Initial start on the node that is first to mount the fs.  A ro or
	   spectator first mounter is refused (jid -2, result -EUCLEAN); a rw
	   first mounter takes jid 0 and is flagged to do recovery.  In both
	   cases the start is acknowledged and mount.gfs is notified. */

	static void start_first_mounter(struct mountgroup *mg)
	{
		struct mg_member *memb;

		log_group(mg, "start_first_mounter");
		set_our_memb_options(mg);
		memb = find_memb_nodeid(mg, our_nodeid);
		ASSERT(memb);

		if (mg->ro || mg->spectator) {
			memb->jid = -2;
			mg->our_jid = -2;
			log_group(mg, "start_first_mounter not rw ro=%d spect=%d",
				  mg->ro , mg->spectator);
			mg->mount_client_result = -EUCLEAN;
		} else {
			memb->opts |= MEMB_OPT_RECOVER;
			memb->jid = 0;
			mg->our_jid = 0;
			mg->first_mounter = 1;
			mg->first_mounter_done = 0;
			mg->got_our_options = 1;
			mg->got_our_journals = 1;
		}
		start_done(mg);
		notify_mount_client(mg);
	}

	/* called for the initial start on a rw/ro mounter;
	the existing mounters are running start_participant() */

	/* Initial start for a non-spectator joiner when other members already
	   exist (see do_start): record our own member options, send them to
	   the group, then acknowledge the start. */
	static void start_participant_init(struct mountgroup *mg)
	{
	log_group(mg, "start_participant_init");
	set_our_memb_options(mg);
	send_options(mg);
	start_done(mg);
	}

	/* called for a non-initial start on a normal mounter.
	NB we can get here without having received a journals message for
	our (recent) mount yet in which case we don't know the jid or ro/rw
	status of any members, and don't know our own jid. */

	static void start_participant(struct mountgroup *mg, int pos, int neg)
	{
		log_group(mg, "start_participant pos=%d neg=%d", pos, neg);

		if (pos) {
			/* a node was added: ack the start, then handle options
			   messages we saved from nodes for whom we had not yet
			   received a start */
			start_done(mg);
			process_saved_options(mg);
			return;
		}

		if (neg) {
			/* a node was removed: kick off journal recovery and
			   replay any recovery status saved meanwhile */
			recover_journals(mg);
			process_saved_recovery_status(mg);
		}
	}

	/* called for the initial start on a spectator mounter,
	after _receive_journals() */

	static void start_spectator_init_2(struct mountgroup *mg)
	{
		log_group(mg, "start_spectator_init_2 our_jid=%d", mg->our_jid);

		/* A jid of -2 means we are not permitted to mount the fs,
		   probably because the next mounter must be rw.  The only
		   other value a spectator may hold here is -1. */

		if (mg->our_jid != -2) {
			ASSERT(mg->our_jid == -1);
		} else {
			mg->mount_client_result = -EUCLEAN;
		}

		notify_mount_client(mg);
	}

	/* called for the initial start on a spectator mounter */

	/* Initial start for a spectator joiner: record and send our options,
	   acknowledge the start, and register start_spectator_init_2 as the
	   second-stage handler in mg->start2_fn. */
	static void start_spectator_init(struct mountgroup *mg)
	{
	log_group(mg, "start_spectator_init");
	set_our_memb_options(mg);
	send_options(mg);
	start_done(mg);
	mg->start2_fn = start_spectator_init_2;
	}

	/* called for a non-initial start on a spectator mounter */

	static void start_spectator(struct mountgroup *mg, int pos, int neg)
	{
		log_group(mg, "start_spectator pos=%d neg=%d", pos, neg);

		if (pos) {
			/* node added: ack, then process saved options */
			start_done(mg);
			process_saved_options(mg);
			return;
		}

		if (neg) {
			/* node removed: recover journals, then process any
			   saved recovery status */
			recover_journals(mg);
			process_saved_recovery_status(mg);
		}
	}

	/* If nodeA fails, nodeB is recovering journalA and nodeB fails before
	finishing, then nodeC needs to tell gfs to recover both journalA and
	journalB. We do this by setting tell_gfs_to_recover back to 1 for
	any nodes that are still on the members_gone list. */

	static void reset_unfinished_recoveries(struct mountgroup *mg)
	{
		struct mg_member *gone;

		list_for_each_entry(gone, &mg->members_gone, list) {
			/* nothing to redo for entries with no recovery status,
			   or ones already marked as needing recovery */
			if (!gone->recovery_status)
				continue;
			if (gone->recovery_status == RS_NEED_RECOVERY)
				continue;

			log_group(mg, "retry unfinished recovery "
				  "jid %d nodeid %d",
				  gone->jid, gone->nodeid);

			/* put the entry back into the "tell gfs to recover" state */
			gone->tell_gfs_to_recover = 1;
			gone->recovery_status = RS_NEED_RECOVERY;
			gone->local_recovery_status = RS_NEED_RECOVERY;
		}
	}

	/*
	old method:
	A is rw mount, B mounts rw

	do_start do_start
	start_participant start_participant_init
	send_options
	receive_options
	start_participant_2
	discover_journals
	assign B a jid
	send_journals
	group_start_done
	receive_journals
	start_participant_init_2
	group_start_done
	do_finish do_finish

	new method: decouples stop/start/finish from mount processing
	A is rw mount, B mounts rw

	do_start do_start
	start_participant start_participant_init
	start_done send_options
	start_done
	do_finish do_finish

	receive_options
	assign_journal
	send_journals
	receive_journals
	start_participant_init_2
	notify_mount_client
	*/

	void do_start(struct mountgroup mg, int type, int member_count, int nodeids)
	{
	int pos = 0, neg = 0;

	mg->start_event_nr = mg->last_start;
	mg->start_type = type;

	log_group(mg, "start %d init %d type %d member_count %d",
	mg->last_start, mg->init, type, member_count);

	recover_members(mg, member_count, nodeids, &pos, &neg);
	reset_unfinished_recoveries(mg);

	if (mg->init) {
	if (member_count == 1)
	start_first_mounter(mg);
	else if (mg->spectator)
	start_spectator_init(mg);
	else
	start_participant_init(mg);
	mg->init = 0;
	} else {
	if (mg->spectator)
	start_spectator(mg, pos, neg);
	else
	start_participant(mg, pos, neg);
	}
	}

	/*
	What repercussions are there from umount shutting down gfs in the
	kernel before we leave the mountgroup? We can no longer participate
	in recovery even though we're in the group -- what are the end cases
	that we need to deal with where this causes a problem? i.e. there
	is a period of time where the mountgroup=A,B,C but the kernel fs
	is only active on A,B, not C. The mountgroup on A,B can't depend
	on the mg on C to necessarily be able to do some things (recovery).

	At least in part, it means that after we do an umount and have
	removed the instance of this fs in the kernel, we'll still get
	stop/start/finish callbacks from groupd for which we'll attempt
	and fail to: block/unblock gfs kernel activity, initiate gfs
	journal recovery, get recovery-done signals from the kernel.

	We don't want to hang groupd event processing by failing to send
	an ack (stop_done/start_done) back to groupd when it needs one
	to proceed. In the case where we get a start for a failed node
	that needs journal recovery, we have a problem because we wait to
	call group_start_done() until gfs in the kernel signals that
	the journal recovery is done. If we've unmounted, gfs isn't there
	any more to give us this signal and we'll never call start_done.

	update: we should be dealing with all these issues correctly now. */

	int do_terminate(struct mountgroup *mg)
	{
		purge_plocks(mg, 0, 1);

		if (!mg->withdraw) {
			/* ordinary unmount: the mountgroup is gone for good */
			log_group(mg, "termination of our unmount leave");
			list_del(&mg->list);
			free(mg);
			return 0;
		}

		/* withdraw: tell the kernel, then park the mountgroup on the
		   withdrawn_mounts list instead of freeing it */
		log_group(mg, "termination of our withdraw leave");
		set_sysfs(mg, "withdraw", 1);
		list_move(&mg->list, &withdrawn_mounts);

		return 0;
	}

	/* The basic rule of withdraw is that we don't want to tell the kernel to drop
	all locks until we know gfs has been stopped/blocked on all nodes. They'll
	be stopped for our leave, we just need to know when they've all arrived
	there.

	A withdrawing node is very much like a readonly node, differences are
	that others recover its journal when they remove it from the group,
	and when it's been removed from the group (gets terminate for its leave),
	it tells the locally withdrawing gfs to clear out locks. */

	/* Start a withdraw for the fs named by table ("<cluster>:<fsname>"):
	   look up the mountgroup by the part after ':' and suspend its
	   device via run_dmsetup_suspend().

	   Returns 0 when the suspend was started or the withdraw feature is
	   disabled, -1 when table is malformed, the mountgroup is unknown, or
	   the suspend fails. */

	int do_withdraw_old(char *table)
	{
		struct mountgroup *mg;
		char *sep, *name;
		int rv;

		if (!cfgd_enable_withdraw) {
			log_error("withdraw feature not enabled");
			return 0;
		}

		/* strstr() returns NULL when there is no ':'; computing
		   NULL + 1 and handing it to find_mg() would be undefined */
		sep = table ? strstr(table, ":") : NULL;
		if (!sep) {
			log_error("do_withdraw invalid table name");
			return -1;
		}
		name = sep + 1;

		mg = find_mg(name);
		if (!mg) {
			log_error("do_withdraw no mountgroup %s", name);
			return -1;
		}

		rv = run_dmsetup_suspend(mg, mg->mount_args.dev);
		if (rv) {
			log_error("do_withdraw %s: dmsetup %s error %d", mg->name,
				  mg->mount_args.dev, rv);
			return -1;
		}

		dmsetup_wait = 1;
		return 0;
	}

	static void do_deliver(int nodeid, char *data, int len)
	{
	struct mountgroup *mg;
	struct gdlm_header *hd;

	hd = (struct gdlm_header *) data;

	mg = find_mg(hd->name);
	if (!mg) {
	/*
	log_error("cpg message from %d len %d no group %s",
	nodeid, len, hd->name);
	*/
	return;
	}

	hd->version[0] = le16_to_cpu(hd->version[0]);
	hd->version[1] = le16_to_cpu(hd->version[1]);
	hd->version[2] = le16_to_cpu(hd->version[2]);
	hd->type = le16_to_cpu(hd->type);
	hd->nodeid = le32_to_cpu(hd->nodeid);
	hd->to_nodeid = le32_to_cpu(hd->to_nodeid);

	/* FIXME: we need to look at how to gracefully fail when we end up
	with mixed incompat versions */

	if (hd->version[0] != protocol_active[0]) {
	log_error("reject message from %d version %u.%u.%u vs %u.%u.%u",
	nodeid, hd->version[0], hd->version[1],
	hd->version[2], protocol_active[0],
	protocol_active[1], protocol_active[2]);
	return;
	}

	/* If there are some group messages between a new node being added to
	the cpg group and being added to the app group, the new node should
	discard them since they're only relevant to the app group. */

	if (!mg->last_callback) {
	log_group(mg, "discard %s len %d from %d",
	msg_name(hd->type), len, nodeid);
	return;
	}

	switch (hd->type) {
	case MSG_JOURNAL:
	receive_journals(mg, data, len, nodeid);
	break;

	case MSG_OPTIONS:
	receive_options(mg, data, len, nodeid);
	break;

	case MSG_REMOUNT:
	receive_remount(mg, data, len, nodeid);
	break;

	case MSG_PLOCK:
	receive_plock(mg, data, len, nodeid);
	break;

	case MSG_MOUNT_STATUS:
	receive_mount_status(mg, data, len, nodeid);
	break;

	case MSG_RECOVERY_STATUS:
	receive_recovery_status(mg, data, len, nodeid);
	break;

	case MSG_RECOVERY_DONE:
	receive_recovery_done(mg, data, len, nodeid);
	break;

	case MSG_WITHDRAW:
	receive_withdraw(mg, data, len, nodeid);
	break;

	case MSG_PLOCK_OWN:
	receive_own(mg, data, len, nodeid);
	break;

	case MSG_PLOCK_DROP:
	receive_drop(mg, data, len, nodeid);
	break;

	case MSG_PLOCK_SYNC_LOCK:
	case MSG_PLOCK_SYNC_WAITER:
	receive_sync(mg, data, len, nodeid);
	break;

	default:
	log_error("unknown message type %d from %d",
	hd->type, hd->nodeid);
	}
	}

	/* cpg deliver callback: forwards the sender and payload to
	   do_deliver(); handle, group name and pid are not used. */
	static void deliver_cb(cpg_handle_t handle, struct cpg_name *group_name,
			       uint32_t nodeid, uint32_t pid, void *data, int data_len)
	{
		(void) handle;
		(void) group_name;
		(void) pid;

		do_deliver(nodeid, data, data_len);
	}

	/* Not sure if purging plocks (driven by confchg) needs to be synchronized with
	the other recovery steps (driven by libgroup) for a node, don't think so.
	Is it possible for a node to have been cleared from the members_gone list
	before this confchg is processed? */

	/* cpg configuration-change callback.  For every node on the left
	   list, purge that node's plocks from each mountgroup in which it is
	   a current member or a removed member.  The member and joined lists
	   are not examined. */

	static void confchg_cb(cpg_handle_t handle, struct cpg_name *group_name,
			struct cpg_address *member_list, int member_list_entries,
			struct cpg_address *left_list, int left_list_entries,
			struct cpg_address *joined_list, int joined_list_entries)
	{
		struct mountgroup *mg;
		int i, nodeid;

		for (i = 0; i < left_list_entries; i++) {
			nodeid = left_list[i].nodeid;
			list_for_each_entry(mg, &mountgroups, list) {
				if (is_member(mg, nodeid) || is_removed(mg, nodeid))
					purge_plocks(mg, nodeid, 0);
			}
		}
	}

	/* Callback table handed to cpg_initialize() in setup_cpg_old():
	   message delivery goes to deliver_cb, membership changes to
	   confchg_cb. */
	static cpg_callbacks_t callbacks = {
	.cpg_deliver_fn = deliver_cb,
	.cpg_confchg_fn = confchg_cb,
	};

	void process_cpg_old(int ci)
	{
	cpg_error_t error;

	error = cpg_dispatch(libcpg_handle, CPG_DISPATCH_ALL);
	if (error != CPG_OK) {
	log_error("cpg_dispatch error %d", error);
	return;
	}

	update_flow_control_status();
	}

	int setup_cpg_old(void)
	{
	cpg_error_t error;
	int fd = 0;

	INIT_LIST_HEAD(&withdrawn_mounts);

	if (cfgd_plock_ownership)
	memcpy(protocol_active, protocol_v200, sizeof(protocol_v200));
	else
	memcpy(protocol_active, protocol_v100, sizeof(protocol_v100));

	error = cpg_initialize(&libcpg_handle, &callbacks);
	if (error != CPG_OK) {
	log_error("cpg_initialize error %d", error);
	return -1;
	}

	cpg_fd_get(libcpg_handle, &fd);
	if (fd < 0) {
	log_error("cpg_fd_get error %d", error);
	return -1;
	}

	memset(&daemon_name, 0, sizeof(daemon_name));
	strcpy(daemon_name.value, "gfs_controld");
	daemon_name.length = 12;

	retry:
	error = cpg_join(libcpg_handle, &daemon_name);
	if (error == CPG_ERR_TRY_AGAIN) {
	log_debug("setup_cpg cpg_join retry");
	sleep(1);
	goto retry;
	}
	if (error != CPG_OK) {
	log_error("cpg_join error %d", error);
	cpg_finalize(libcpg_handle);
	return -1;
	}

	log_debug("cpg %d", fd);
	return fd;
	}

	diff --git a/group/gfs_controld/gfs_daemon.h b/group/gfs_controld/gfs_daemon.h
	index 3cda46725..135574fb4 100644
	--- a/group/gfs_controld/gfs_daemon.h
	+++ b/group/gfs_controld/gfs_daemon.h
	@@ -1,322 +1,323 @@
	#ifndef __GFS_DAEMON_DOT_H__
	#define __GFS_DAEMON_DOT_H__

	#include <sys/types.h>
	#include <asm/types.h>
	#include <sys/uio.h>
	#include <netinet/in.h>
	#include <sys/socket.h>
	#include <sys/un.h>
	#include <sys/ioctl.h>
	#include <sys/stat.h>
	#include <sys/utsname.h>
	#include <sys/poll.h>
	#include <sys/wait.h>
	#include <netinet/in.h>
	#include <arpa/inet.h>
	#include <net/if.h>
	#include <stdio.h>
	#include <errno.h>
	#include <string.h>
	#include <stdlib.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <netdb.h>
	#include <limits.h>
	#include <unistd.h>
	#include <time.h>
	#include <syslog.h>
	#include <sched.h>
	#include <signal.h>
	#include <sys/time.h>
	#include <dirent.h>
	#include <openais/saAis.h>
	#include <openais/saCkpt.h>
	#include <corosync/cpg.h>
	#include <corosync/engine/logsys.h>

	#include <linux/dlmconstants.h>
	#include "libgfscontrol.h"
	#include "gfs_controld.h"
	#include "list.h"
	#include "linux_endian.h"

	/* TODO: warn if
	DLM_LOCKSPACE_LEN (from dlmconstants.h) !=
	GFS_MOUNTGROUP_LEN (from libgfscontrol.h)
	*/

	/* Maximum members of a mountgroup, should match CPG_MEMBERS_MAX in
	corosync/cpg.h. There are no max defines in gfs-kernel for
	mountgroup members. (FIXME verify gfs-kernel/lock_dlm) */

	#define MAX_NODES 128

	/* Max string length printed on a line, for debugging/dump output. */

	#define MAXLINE 256

	/* group_mode */

	#define GROUP_LIBGROUP 2
	#define GROUP_LIBCPG 3

	extern int daemon_debug_opt;
	extern int daemon_quit;
	extern int poll_dlm;
	extern int poll_ignore_plock;
	extern int plock_fd;
	extern int plock_ci;
	extern struct list_head mountgroups;
	extern int cman_quorate;
	extern int our_nodeid;
	extern char *clustername;
	extern char daemon_debug_buf[256];
	extern char dump_buf[GFSC_DUMP_SIZE];
	extern int dump_point;
	extern int dump_wrap;
	extern char plock_dump_buf[GFSC_DUMP_SIZE];
	extern int plock_dump_len;
	extern int dmsetup_wait;
	extern cpg_handle_t libcpg_handle;
	extern int libcpg_flow_control_on;
	extern int group_mode;

	void daemon_dump_save(void);

	/* log_debug: format "<time> <message>\n" into daemon_debug_buf (at most
	   255 bytes are passed to snprintf), save it with daemon_dump_save(),
	   echo it to stderr when daemon_debug_opt is set, and send it to logsys
	   at LOG_DEBUG when cfgd_debug_logsys is set. */
	#define log_debug(fmt, args...) \
	do { \
	snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
	daemon_dump_save(); \
	if (daemon_debug_opt) \
	fprintf(stderr, "%s", daemon_debug_buf); \
	if (cfgd_debug_logsys) \
	log_printf(LOG_DEBUG, "%s", daemon_debug_buf); \
	} while (0)

	/* log_group: same as log_debug, with the name of group g inserted
	   after the timestamp.  g is evaluated once, as (g)->name. */
	#define log_group(g, fmt, args...) \
	do { \
	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
	(g)->name, ##args); \
	daemon_dump_save(); \
	if (daemon_debug_opt) \
	fprintf(stderr, "%s", daemon_debug_buf); \
	if (cfgd_debug_logsys) \
	log_printf(LOG_DEBUG, "%s", daemon_debug_buf); \
	} while (0)

	/* log_error: record the message through log_debug and additionally
	   send it to logsys at LOG_ERR.  The arguments are expanded twice, so
	   they should be free of side effects. */
	#define log_error(fmt, args...) \
	do { \
	log_debug(fmt, ##args); \
	log_printf(LOG_ERR, fmt, ##args); \
	} while (0)

	/* log_plock: format a group-tagged message into daemon_debug_buf and
	   print it to stderr only when both daemon_debug_opt and
	   cfgd_plock_debug are set.  Unlike log_group it does not call
	   daemon_dump_save() and never goes to logsys. */
	#define log_plock(g, fmt, args...) \
	do { \
	snprintf(daemon_debug_buf, 255, "%ld %s " fmt "\n", time(NULL), \
	(g)->name, ##args); \
	if (daemon_debug_opt && cfgd_plock_debug) \
	fprintf(stderr, "%s", daemon_debug_buf); \
	} while (0)

	/* Per-filesystem daemon state.  One instance is allocated and zeroed by
	   create_mg() and linked through "list" (the code iterates the global
	   mountgroups list with it).  Fields are grouped by which group mode
	   uses them, as the section comments below indicate. */
	struct mountgroup {
	struct list_head list;
	uint32_t id;
	struct gfsc_mount_args mount_args;
	/* fs name; one extra byte beyond GFS_MOUNTGROUP_LEN for the terminator */
	char name[GFS_MOUNTGROUP_LEN+1];
	/* set to 1 by create_mg() when group_mode is GROUP_LIBGROUP */
	int old_group_mode;

	/* mount_client is passed to process_connection(), so presumably a
	   client index for the mount.gfs connection -- TODO confirm */
	int mount_client;
	int mount_client_result;
	int mount_client_notified;
	/* set by do_stop() when mount.gfs has not been notified yet; cleared
	   by do_finish() right before it calls notify_mount_client() */
	int mount_client_delay;
	int remount_client;

	int withdraw;
	int dmsetup_wait;
	pid_t dmsetup_pid;
	int our_jid;
	int spectator;
	int ro;
	int rw;
	int joining;
	int leaving;
	int kernel_mount_error;
	int kernel_mount_done;
	int first_mounter;

	/* cpg-new stuff */

	cpg_handle_t cpg_handle;
	int cpg_client;
	int cpg_fd;
	/* "for queries": 1 after do_stop() blocks gfs, 0 after do_finish()
	   unblocks it */
	int kernel_stopped;
	uint32_t change_seq;
	uint32_t started_count;
	struct change *started_change;
	struct list_head changes;
	struct list_head node_history;
	struct list_head journals;
	int dlm_notify_nodeid;
	int first_recovery_needed;
	int first_recovery_master;
	int first_recovery_msg;
	int local_recovery_jid;
	int local_recovery_busy;

	/* cpg-old stuff for rhel5/stable2 compat */

	struct list_head members;
	/* members removed while processing a start; entries leave this list
	   in do_finish() once their journals are recovered */
	struct list_head members_gone;
	int memb_count;
	int last_stop;
	int last_start;
	int last_finish;
	int last_callback;
	int start_event_nr;
	int start_type;
	int group_leave_on_finish;
	/* 1 from create_mg() until do_start() has handled the first start */
	int init;
	int got_our_options;
	int got_our_journals;
	int delay_send_journals;
	int first_mount_pending_stop;
	int first_mounter_done;
	int global_first_recover_done;
	int emulate_first_mounter;
	int wait_first_done;
	/* set in do_finish() when a gone member's journal is still unrecovered */
	int needs_recovery;
	/* low_nodeid and master_nodeid both start at -1 (create_mg) */
	int low_nodeid;
	int master_nodeid;
	int got_kernel_mount;
	struct list_head saved_messages;
	/* second-stage start handler, e.g. start_spectator_init_2 */
	void *start2_fn;

	/* cpg-old plock stuff */

	int save_plocks;
	struct list_head plock_resources;
	uint32_t associated_ls_id;
	uint64_t cp_handle;
	time_t last_checkpoint_time;
	time_t last_plock_time;
	struct timeval drop_resources_last;
	};

	/* these need to match the kernel defines of the same name in lm_interface.h */

	#define LM_RD_GAVEUP 308
	#define LM_RD_SUCCESS 309

	/* config.c */
	int setup_ccs(void);
	void close_ccs(void);
	void read_ccs_name(char path, char name);
	void read_ccs_yesno(char path, int yes, int *no);
	void read_ccs_int(char path, int config_val);
	void read_ccs_nodir(struct mountgroup mg, char buf);

	/* cpg-new.c */
	int setup_cpg(void);
	int setup_dlmcontrol(void);
	void process_dlmcontrol(int ci);
	void process_recovery_uevent(char *table);
	void process_mountgroups(void);
	int gfs_join_mountgroup(struct mountgroup *mg);
	void gfs_leave_mountgroup(char *name, int mnterr);
	void gfs_mount_done(struct mountgroup *mg);
	+void send_remount(struct mountgroup mg, struct gfsc_mount_args ma);
	int set_mountgroup_info(struct mountgroup mg, struct gfsc_mountgroup out);
	int set_node_info(struct mountgroup mg, int nodeid, struct gfsc_node node);
	int set_mountgroups(int count, struct gfsc_mountgroup *mgs_out);
	int set_mountgroup_nodes(struct mountgroup mg, int option, int node_count,
	struct gfsc_node **nodes_out);

	/* cpg-old.c */
	int setup_cpg_old(void);
	void process_cpg_old(int ci);
	int gfs_join_mountgroup_old(struct mountgroup mg, struct gfsc_mount_args ma);
	void gfs_leave_mountgroup_old(char *name, int mnterr);
	int send_group_message_old(struct mountgroup mg, int len, char buf);
	void save_message_old(struct mountgroup mg, char buf, int len, int from,
	int type);
	void send_withdraw_old(struct mountgroup *mg);
	int process_recovery_uevent_old(char *table);
	void ping_kernel_mount_old(char *table);
	-int remount_mountgroup_old(int ci, struct gfsc_mount_args *ma);
	+void send_remount_old(struct mountgroup mg, struct gfsc_mount_args ma);
	void send_mount_status_old(struct mountgroup *mg);
	int do_stop(struct mountgroup *mg);
	int do_finish(struct mountgroup *mg);
	void do_start(struct mountgroup mg, int type, int member_count, int nodeids);
	int do_terminate(struct mountgroup *mg);
	int do_withdraw_old(char *table);

	/* group.c */
	int setup_groupd(void);
	void close_groupd(void);
	void process_groupd(int ci);
	int set_mountgroup_info_group(struct mountgroup *mg,
	struct gfsc_mountgroup *out);
	int set_node_info_group(struct mountgroup *mg, int nodeid,
	struct gfsc_node *node);
	int set_mountgroups_group(int count, struct gfsc_mountgroup *mgs_out);
	int set_mountgroup_nodes_group(struct mountgroup *mg, int option,
	int node_count, struct gfsc_node *nodes_out);
	void set_group_mode(void);

	/* main.c */
	int do_read(int fd, void *buf, size_t count);
	int do_write(int fd, void *buf, size_t count);
	void client_dead(int ci);
	int client_add(int fd, void (workfn)(int ci), void (deadfn)(int ci));
	int client_fd(int ci);
	void client_ignore(int ci, int fd);
	void client_back(int ci, int fd);
	struct mountgroup create_mg(char name);
	struct mountgroup find_mg(char name);
	struct mountgroup *find_mg_id(uint32_t id);
	-void client_reply_remount(struct mountgroup *mg, int result);
	+void client_reply_remount(struct mountgroup *mg, int ci, int result);
	void client_reply_join(int ci, struct gfsc_mount_args *ma, int result);
	void client_reply_join_full(struct mountgroup *mg, int result);
	void query_lock(void);
	void query_unlock(void);
	void process_connection(int ci);
	void cluster_dead(int ci);

	/* member_cman.c */
	int setup_cman(void);
	void close_cman(void);
	void process_cman(int ci);
	void kick_node_from_cluster(int nodeid);

	/* plock.c */
	int setup_plocks(void);
	void process_plocks(int ci);
	int limit_plocks(void);
	void receive_plock(struct mountgroup mg, char buf, int len, int from);
	void receive_own(struct mountgroup mg, char buf, int len, int from);
	void receive_sync(struct mountgroup mg, char buf, int len, int from);
	void receive_drop(struct mountgroup mg, char buf, int len, int from);
	void process_saved_plocks(struct mountgroup *mg);
	int unlink_checkpoint(struct mountgroup *mg);
	void store_plocks(struct mountgroup *mg, int nodeid);
	void retrieve_plocks(struct mountgroup *mg);
	void purge_plocks(struct mountgroup *mg, int nodeid, int unmount);
	int fill_plock_dump_buf(struct mountgroup *mg);

	/* util.c */
	int we_are_in_fence_domain(void);
	int set_sysfs(struct mountgroup mg, char field, int val);
	int read_sysfs_int(struct mountgroup mg, char field, int *val_out);
	int run_dmsetup_suspend(struct mountgroup mg, char dev);
	void update_dmsetup_wait(void);
	void update_flow_control_status(void);
	int check_uncontrolled_filesystems(void);

	/* logging.c */

	void init_logging(void);
	void setup_logging();
	void close_logging(void);

	#endif
	diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
	index 5881961a3..ba456d530 100644
	--- a/group/gfs_controld/main.c
	+++ b/group/gfs_controld/main.c
	@@ -1,1450 +1,1478 @@
	#include "gfs_daemon.h"
	#include "config.h"
	#include <pthread.h>
	#include "copyright.cf"

	#include <linux/netlink.h>

	#define LOCKFILE_NAME "/var/run/gfs_controld.pid"
	#define CLIENT_NALLOC 32

	static int client_maxi;
	static int client_size;
	static struct client *client;
	static struct pollfd *pollfd;
	static pthread_t query_thread;
	static pthread_mutex_t query_mutex;

	/* One slot of the client table that is kept parallel to the pollfd
	   array.  A slot with fd == -1 is free (see client_add). */
	struct client {
	int fd;
	/* callbacks supplied to client_add(), stored as untyped pointers;
	   deadfn falls back to client_dead when none is given */
	void *workfn;
	void *deadfn;
	struct mountgroup *mg;
	};

	static void do_leave(char *table, int mnterr);

	/* Read exactly count bytes from fd into buf, restarting reads that are
	   interrupted by a signal.  Returns 0 once count bytes have been read,
	   -1 on end-of-file or on any read error other than EINTR. */
	int do_read(int fd, void *buf, size_t count)
	{
		char *dst = buf;
		int done = 0;
		int n;

		while (done < count) {
			n = read(fd, dst + done, count - done);

			if (n == -1 && errno == EINTR)
				continue;

			/* end-of-file before count bytes, or a hard error */
			if (n == 0 || n == -1)
				return -1;

			done += n;
		}
		return 0;
	}

	/* Write count bytes from buf to fd, continuing after short writes and
	   restarting writes interrupted by a signal.  Returns 0 when all
	   bytes were written, or the negative write() result after logging
	   errno. */
	int do_write(int fd, void *buf, size_t count)
	{
		char *src = buf;
		int n, sent = 0;

		for (;;) {
			n = write(fd, src + sent, count);

			if (n == -1 && errno == EINTR)
				continue;

			if (n < 0) {
				log_error("write errno %d", errno);
				return n;
			}

			if (n == count)
				return 0;

			/* short write: advance past what went out and try again */
			count -= n;
			sent += n;
		}
	}

	/* Grow the client and pollfd arrays by CLIENT_NALLOC slots (or create
	   them on first use) and mark every new slot as free.

	   NOTE(review): an allocation failure is only logged; the loop below
	   still writes through the arrays, so a NULL result is dereferenced.
	   That behaviour is kept as-is because the function is void and
	   client_add() retries unconditionally after calling it. */

	static void client_alloc(void)
	{
		int i;

		if (!client) {
			client = malloc(CLIENT_NALLOC * sizeof(struct client));
			pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd));
		} else {
			client = realloc(client, (client_size + CLIENT_NALLOC) *
						 sizeof(struct client));
			pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) *
						 sizeof(struct pollfd));
			if (!pollfd)
				log_error("can't alloc for pollfd");
		}
		if (!client || !pollfd)
			log_error("can't alloc for client array");

		/* fd == -1 is what client_add() treats as a free slot */
		for (i = client_size; i < client_size + CLIENT_NALLOC; i++) {
			client[i].workfn = NULL;
			client[i].deadfn = NULL;
			client[i].fd = -1;
			pollfd[i].fd = -1;
			pollfd[i].revents = 0;
		}
		client_size += CLIENT_NALLOC;
	}

	void client_dead(int ci)
	{
	close(client[ci].fd);
	client[ci].workfn = NULL;
	client[ci].fd = -1;
	pollfd[ci].fd = -1;
	}

	/* Register fd in the first free client slot and arm it for POLLIN.
	   workfn is stored as the slot's work callback; deadfn is stored as
	   its dead callback, with client_dead used when deadfn is NULL.  The
	   tables are grown when no slot is free.  Returns the slot index. */
	int client_add(int fd, void (workfn)(int ci), void (deadfn)(int ci))
	{
		int slot;

		if (!client)
			client_alloc();

		for (;;) {
			for (slot = 0; slot < client_size; slot++) {
				if (client[slot].fd != -1)
					continue;

				client[slot].workfn = workfn;
				client[slot].deadfn = deadfn ? deadfn : client_dead;
				client[slot].fd = fd;
				pollfd[slot].fd = fd;
				pollfd[slot].events = POLLIN;
				if (slot > client_maxi)
					client_maxi = slot;
				return slot;
			}

			/* every slot is in use: grow the tables and scan again */
			client_alloc();
		}
	}

	int client_fd(int ci)
	{
	return client[ci].fd;
	}

	/* Stop polling client slot ci: its pollfd entry gets fd -1 and no
	   requested events.  The fd argument is not used. */
	void client_ignore(int ci, int fd)
	{
		struct pollfd *pfd = &pollfd[ci];

		pfd->fd = -1;
		pfd->events = 0;
	}

	/* Resume polling client slot ci on fd for POLLIN (the reverse of
	   client_ignore). */
	void client_back(int ci, int fd)
	{
		struct pollfd *pfd = &pollfd[ci];

		pfd->fd = fd;
		pfd->events = POLLIN;
	}

	/* Signal handler: only raises the daemon_quit flag; the signal number
	   is not examined. */
	static void sigterm_handler(int sig)
	{
		(void) sig;

		daemon_quit = 1;
	}

	/* Allocate and initialise a mountgroup called name.

	   The structure is zero-filled, its list heads are initialised, init
	   is set to 1 and master_nodeid/low_nodeid to -1.  old_group_mode is
	   set when the daemon runs in GROUP_LIBGROUP mode.  The new
	   mountgroup is not added to any list here.

	   Returns the new mountgroup, or NULL when allocation fails. */

	struct mountgroup *create_mg(char *name)
	{
		struct mountgroup *mg;

		mg = malloc(sizeof(struct mountgroup));
		if (!mg)
			return NULL;
		memset(mg, 0, sizeof(struct mountgroup));

		if (group_mode == GROUP_LIBGROUP)
			mg->old_group_mode = 1;

		INIT_LIST_HEAD(&mg->members);
		INIT_LIST_HEAD(&mg->members_gone);
		INIT_LIST_HEAD(&mg->plock_resources);
		INIT_LIST_HEAD(&mg->saved_messages);
		INIT_LIST_HEAD(&mg->changes);
		INIT_LIST_HEAD(&mg->journals);
		INIT_LIST_HEAD(&mg->node_history);
		mg->init = 1;
		mg->master_nodeid = -1;
		mg->low_nodeid = -1;

		/* name[] holds GFS_MOUNTGROUP_LEN+1 bytes and was zeroed above,
		   so the copy is always terminated even when name is truncated */
		strncpy(mg->name, name, GFS_MOUNTGROUP_LEN);

		return mg;
	}

	/* Look up a mountgroup on the global mountgroups list by exact name.
	   Returns the matching mountgroup, or NULL when there is none. */

	struct mountgroup *find_mg(char *name)
	{
		struct mountgroup *mg;

		list_for_each_entry(mg, &mountgroups, list) {
			/* equal length plus equal prefix is plain string equality */
			if (!strcmp(mg->name, name))
				return mg;
		}
		return NULL;
	}

	/* Look up a mountgroup on the global mountgroups list by its id.
	   Returns the first match, or NULL when no mountgroup has that id. */
	struct mountgroup *find_mg_id(uint32_t id)
	{
		struct mountgroup *cur;

		list_for_each_entry(cur, &mountgroups, list) {
			if (cur->id != id)
				continue;
			return cur;
		}
		return NULL;
	}

	#define MAXARGS 8

	/* Split buf in place into at most MAXARGS fields separated by sep.

	   Each separator found is overwritten with '\0'.  argv[0] points at
	   the start of buf and argv[i] at the text after the i-th separator.
	   Splitting stops at the want-th separator, so whatever follows it is
	   left untouched.  *argc receives the number of argv entries filled.

	   Returns a pointer to the text after the want-th separator; when
	   that separator was never reached, a pointer one past the
	   terminating '\0' of the last field.

	   argv must have room for MAXARGS entries. */

	static char *get_args(char *buf, int *argc, char **argv, char sep, int want)
	{
		char *p = buf, *rp = NULL;
		int i;

		argv[0] = p;

		for (i = 1; i < MAXARGS; i++) {
			p = strchr(buf, sep);
			if (!p)
				break;
			*p = '\0';

			if (want == i) {
				rp = p + 1;
				break;
			}

			argv[i] = p + 1;
			buf = p + 1;
		}
		*argc = i;

		/* we ended by hitting \0, return the point following that */
		if (!rp)
			rp = strchr(buf, '\0') + 1;

		return rp;
	}

	/* Try to read the "id" sysfs file of the fs named by table
	   ("<cluster>:<fsname>") and log the result.  Does nothing when table
	   is NULL, has no ':' or names no known mountgroup. */

	static void ping_kernel_mount(char *table)
	{
		struct mountgroup *mg;
		char *sep;
		int rv, val;

		/* strstr() yields NULL without a ':'; NULL + 1 is undefined */
		sep = table ? strstr(table, ":") : NULL;
		if (!sep)
			return;

		mg = find_mg(sep + 1);
		if (!mg)
			return;

		rv = read_sysfs_int(mg, "id", &val);

		log_group(mg, "ping_kernel_mount %d", rv);
	}

	/* Read one kernel uevent from client slot ci and act on gfs events.

	   The message is expected to look like "<action>@/fs/gfs/<table>" with
	   an optional "/lock_module" suffix.  Actions:
	     remove@   on the gfs kobject:          do_leave()
	     change@   on the lock_module kobject:  recovery uevent processing
	     offline@  on the lock_module kobject:  withdraw (libgroup mode only)
	     other     on the lock_module kobject:  ping_kernel_mount()
	   Messages that do not mention gfs, or that are dlm uevents, are
	   ignored. */

	static void process_uevent(int ci)
	{
		char buf[MAXLINE];
		char *argv[MAXARGS], *act;
		int rv, argc = 0;
		int lock_module = 0;

		memset(buf, 0, sizeof(buf));
		memset(argv, 0, sizeof(char *) * MAXARGS);

	 retry_recv:
		/* leave the last byte alone so buf stays NUL terminated for the
		   string functions below */
		rv = recv(client[ci].fd, buf, sizeof(buf) - 1, 0);
		/* the failure reason is in errno, not in recv's return value */
		if (rv == -1 && errno == EINTR)
			goto retry_recv;
		if (rv == -1 && errno == EAGAIN)
			return;
		if (rv < 0) {
			log_error("uevent recv error %d errno %d", rv, errno);
			return;
		}

		/* first we get the uevent for removing lock module kobject:
		     "remove@/fs/gfs/bull:x/lock_module"
		   second is the uevent for removing gfs kobject:
		     "remove@/fs/gfs/bull:x"
		*/

		if (!strstr(buf, "gfs"))
			return;

		/* if an fs is named "gfs", it results in dlm uevents
		   like "remove@/kernel/dlm/gfs" */

		if (strstr(buf, "kernel/dlm"))
			return;

		log_debug("uevent: %s", buf);

		if (strstr(buf, "lock_module"))
			lock_module = 1;

		get_args(buf, &argc, argv, '/', 4);
		if (argc != 4) {
			/* argv[3] (the table name) was not filled in; every
			   branch below needs it, so give up on this message */
			log_error("uevent message has %d args", argc);
			return;
		}
		act = argv[0];

		log_debug("kernel: %s %s", act, argv[3]);

		if (!strcmp(act, "remove@")) {
			/* We want to trigger the leave at the very end of the kernel's
			   unmount process, i.e. at the end of put_super(), so we do the
			   leave when the second uevent (from the gfs kobj) arrives. */

			if (lock_module)
				return;

			do_leave(argv[3], 0);

		} else if (!strcmp(act, "change@")) {
			if (!lock_module)
				return;

			if (group_mode == GROUP_LIBGROUP)
				process_recovery_uevent_old(argv[3]);
			else
				process_recovery_uevent(argv[3]);

		} else if (!strcmp(act, "offline@")) {
			if (!lock_module)
				return;

			if (group_mode == GROUP_LIBGROUP)
				do_withdraw_old(argv[3]);
			else
				log_error("TODO withdraw for libcpg");

		} else {
			if (!lock_module)
				return;

			ping_kernel_mount(argv[3]);
		}
	}

	/*
	 * Open a NETLINK_KOBJECT_UEVENT datagram socket and bind it with
	 * nl_pid set to our pid and nl_groups set to 1.
	 * Returns the socket fd, or the negative return value of the call
	 * that failed.
	 */
	static int setup_uevent(void)
	{
		struct sockaddr_nl addr;
		int fd, err;

		fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_KOBJECT_UEVENT);
		if (fd < 0) {
			log_error("uevent netlink socket");
			return fd;
		}

		memset(&addr, 0, sizeof(addr));
		addr.nl_family = AF_NETLINK;
		addr.nl_pid = getpid();
		addr.nl_groups = 1;

		err = bind(fd, (struct sockaddr *) &addr, sizeof(addr));
		if (err >= 0)
			return fd;

		log_error("uevent bind error %d errno %d", err, errno);
		close(fd);
		return err;
	}

	static void init_header(struct gfsc_header h, int cmd, char name, int result,
	int extra_len)
	{
	memset(h, 0, sizeof(struct gfsc_header));

	h->magic = GFSC_MAGIC;
	h->version = GFSC_VERSION;
	h->len = sizeof(struct gfsc_header) + extra_len;
	h->command = cmd;
	h->data = result;

	if (name)
	strncpy(h->name, name, GFS_MOUNTGROUP_LEN);
	}

	static void query_dump_debug(int fd)
	{
	struct gfsc_header h;
	int extra_len;
	int len;

	/* in the case of dump_wrap, extra_len will go in two writes,
	first the log tail, then the log head */
	if (dump_wrap)
	extra_len = GFSC_DUMP_SIZE;
	else
	extra_len = dump_point;

	init_header(&h, GFSC_CMD_DUMP_DEBUG, NULL, 0, extra_len);
	do_write(fd, &h, sizeof(h));

	if (dump_wrap) {
	len = GFSC_DUMP_SIZE - dump_point;
	do_write(fd, dump_buf + dump_point, len);
	len = dump_point;
	} else
	len = dump_point;

	/* NUL terminate the debug string */
	dump_buf[dump_point] = '\0';

	do_write(fd, dump_buf, len);
	}

	static void query_dump_plocks(int fd, char *name)
	{
	struct mountgroup *mg;
	struct gfsc_header h;
	int rv;

	mg = find_mg(name);
	if (!mg) {
	plock_dump_len = 0;
	rv = -ENOENT;
	} else {
	/* writes to plock_dump_buf and sets plock_dump_len */
	rv = fill_plock_dump_buf(mg);
	}

	init_header(&h, GFSC_CMD_DUMP_PLOCKS, name, rv, plock_dump_len);

	do_write(fd, &h, sizeof(h));

	if (plock_dump_len)
	do_write(fd, plock_dump_buf, plock_dump_len);
	}

	/*
	 * Combines a header and the data and sends it back to the client in
	 * a single do_write() call.
	 *
	 * fd:     client socket
	 * cmd:    command code placed in the header
	 * name:   mountgroup name placed in the header; may be NULL
	 * result: result code placed in the header
	 * buf:    payload to append after the header; may be NULL
	 * buflen: payload length; that many bytes follow the header and are
	 *         zero when buf is NULL
	 *
	 * Nothing is sent when the reply buffer cannot be allocated.
	 */
	static void do_reply(int fd, int cmd, char *name, int result, void *buf,
			     int buflen)
	{
		char *reply;
		int reply_len;

		reply_len = sizeof(struct gfsc_header) + buflen;
		reply = malloc(reply_len);
		if (!reply)
			return;
		memset(reply, 0, reply_len);

		init_header((struct gfsc_header *)reply, cmd, name, result, buflen);

		if (buf && buflen)
			memcpy(reply + sizeof(struct gfsc_header), buf, buflen);

		do_write(fd, reply, reply_len);

		free(reply);
	}

	/*
	 * Reply on fd with a struct gfsc_mountgroup describing mountgroup
	 * "name".  The result in the reply header is -ENOENT when the
	 * mountgroup is unknown, otherwise the return value of the
	 * set_mountgroup_info*() helper for the current group_mode.
	 */
	static void query_mountgroup_info(int fd, char *name)
	{
		struct mountgroup *mg;
		struct gfsc_mountgroup mountgroup;
		int rv;

		/* zero before any exit path: the struct is always sent back,
		   including on the -ENOENT path */
		memset(&mountgroup, 0, sizeof(mountgroup));

		mg = find_mg(name);
		if (!mg) {
			rv = -ENOENT;
			goto out;
		}

		mountgroup.group_mode = group_mode;

		if (group_mode == GROUP_LIBGROUP)
			rv = set_mountgroup_info_group(mg, &mountgroup);
		else
			rv = set_mountgroup_info(mg, &mountgroup);
	out:
		do_reply(fd, GFSC_CMD_MOUNTGROUP_INFO, name, rv,
			 (char *)&mountgroup, sizeof(mountgroup));
	}

	/*
	 * Reply on fd with a struct gfsc_node for node "nodeid" of mountgroup
	 * "name".  The result in the reply header is -ENOENT when the
	 * mountgroup is unknown, otherwise the return value of the
	 * set_node_info*() helper for the current group_mode.
	 */
	static void query_node_info(int fd, char *name, int nodeid)
	{
		struct mountgroup *mg;
		struct gfsc_node node;
		int rv;

		/* the struct is always sent back, so never leave it holding
		   stack garbage */
		memset(&node, 0, sizeof(node));

		mg = find_mg(name);
		if (!mg) {
			rv = -ENOENT;
			goto out;
		}

		if (group_mode == GROUP_LIBGROUP)
			rv = set_node_info_group(mg, nodeid, &node);
		else
			rv = set_node_info(mg, nodeid, &node);
	out:
		do_reply(fd, GFSC_CMD_NODE_INFO, name, rv,
			 (char *)&node, sizeof(node));
	}

	static void query_mountgroups(int fd, int max)
	{
	int mg_count = 0;
	struct gfsc_mountgroup *mgs = NULL;
	int rv, result;

	if (group_mode == GROUP_LIBGROUP)
	rv = set_mountgroups_group(&mg_count, &mgs);
	else
	rv = set_mountgroups(&mg_count, &mgs);

	if (rv < 0) {
	result = rv;
	mg_count = 0;
	goto out;
	}

	if (mg_count > max) {
	result = -E2BIG;
	mg_count = max;
	} else {
	result = mg_count;
	}
	out:
	do_reply(fd, GFSC_CMD_MOUNTGROUPS, NULL, result,
	(char )mgs, mg_count sizeof(struct gfsc_mountgroup));

	if (mgs)
	free(mgs);
	}

	static void query_mountgroup_nodes(int fd, char *name, int option, int max)
	{
	struct mountgroup *mg;
	int node_count = 0;
	struct gfsc_node *nodes = NULL;
	int rv, result;

	mg = find_mg(name);
	if (!mg) {
	result = -ENOENT;
	node_count = 0;
	goto out;
	}

	if (group_mode == GROUP_LIBGROUP)
	rv = set_mountgroup_nodes_group(mg, option, &node_count, &nodes);
	else
	rv = set_mountgroup_nodes(mg, option, &node_count, &nodes);

	if (rv < 0) {
	result = rv;
	node_count = 0;
	goto out;
	}

	/* node_count is the number of structs copied/returned; the caller's
	max may be less than that, in which case we copy as many as they
	asked for and return -E2BIG */

	if (node_count > max) {
	result = -E2BIG;
	node_count = max;
	} else {
	result = node_count;
	}
	out:
	do_reply(fd, GFSC_CMD_MOUNTGROUP_NODES, name, result,
	(char )nodes, node_count sizeof(struct gfsc_node));

	if (nodes)
	free(nodes);
	}

	-void client_reply_remount(struct mountgroup *mg, int result) /* old form, deleted by this diff */
	-{
	- struct gfsc_mount_args *ma = &mg->mount_args;
	-
	- log_group(mg, "client_reply_remount ci %d result %d",
	- mg->remount_client, result);
	-
	- do_reply(client[mg->remount_client].fd, GFSC_CMD_FS_REMOUNT,
	- mg->name, result, ma, sizeof(struct gfsc_mount_args)); /* reply goes to the client index saved in mg->remount_client */
	-
	- mg->remount_client = 0; /* saved client index is cleared after the reply */
	-}
	-
	/*
	 * Send the GFSC_CMD_FS_JOIN reply for mount args "ma" to client ci.
	 *
	 * The name in the reply header is the part of ma->table after the
	 * ':'.  do_join() also calls this on its error path for a table with
	 * no ':', so in that case the whole table string is used as the name
	 * rather than an invalid pointer.
	 */
	void client_reply_join(int ci, struct gfsc_mount_args *ma, int result)
	{
		char *colon = strchr(ma->table, ':');
		char *name = colon ? colon + 1 : ma->table;

		log_debug("client_reply_join %s ci %d result %d", name, ci, result);

		do_reply(client[ci].fd, GFSC_CMD_FS_JOIN,
			 name, result, ma, sizeof(struct gfsc_mount_args));
	}

	/*
	 * Build the hostdata mount option for mg and send the join reply to
	 * the mounting client (mg->mount_client).
	 *
	 * hostdata is only generated when result is 0: it carries the
	 * mountgroup id and first-mounter flag, the jid when our_jid is not
	 * negative, and whatever string read_ccs_nodir() produces.
	 */
	void client_reply_join_full(struct mountgroup *mg, int result)
	{
		char nodir_opt[32];

		if (!result) {
			if (mg->our_jid >= 0) {
				snprintf(mg->mount_args.hostdata, PATH_MAX,
					 "hostdata=jid=%d:id=%u:first=%d",
					 mg->our_jid, mg->id, mg->first_mounter);
			} else {
				snprintf(mg->mount_args.hostdata, PATH_MAX,
					 "hostdata=id=%u:first=%d",
					 mg->id, mg->first_mounter);
			}

			memset(nodir_opt, 0, sizeof(nodir_opt));

			read_ccs_nodir(mg, nodir_opt);
			if (nodir_opt[0] != '\0')
				strcat(mg->mount_args.hostdata, nodir_opt);
		}

		log_group(mg, "client_reply_join_full ci %d result %d hostdata %s",
			  mg->mount_client, result, mg->mount_args.hostdata);

		client_reply_join(mg->mount_client, &mg->mount_args, result);
	}

	/*
	 * Handle a GFSC_CMD_FS_JOIN request from client ci.
	 *
	 * Validates the mount args (lock protocol, reserved options, table
	 * format, name length, cluster name, fence domain membership, ro/rw
	 * versus spectator), creates the mountgroup, adds it to the
	 * mountgroups list and starts the group join.
	 *
	 * On success nothing is sent to the client here.  On any failure the
	 * error is sent back through client_reply_join().
	 */
	static void do_join(int ci, struct gfsc_mount_args *ma)
	{
		struct mountgroup *mg = NULL;
		char table2[PATH_MAX];
		char *cluster = NULL, *name = NULL;
		int rv;

		log_debug("join: %s %s %s %s %s %s", ma->dir, ma->type, ma->proto,
			  ma->table, ma->options, ma->dev);

		if (strcmp(ma->proto, "lock_dlm")) {
			log_error("join: lockproto %s not supported", ma->proto);
			rv = -EPROTONOSUPPORT;
			goto fail;
		}

		if (strstr(ma->options, "jid=") ||
		    strstr(ma->options, "first=") ||
		    strstr(ma->options, "id=")) {
			log_error("join: jid, first and id are reserved options");
			rv = -EOPNOTSUPP;
			goto fail;
		}

		/* table is <cluster>:<name> */

		/* copy one byte less than the buffer so the zeroed last byte
		   always terminates table2 */
		memset(table2, 0, sizeof(table2));
		strncpy(table2, ma->table, sizeof(table2) - 1);

		name = strstr(table2, ":");
		if (!name) {
			rv = -EBADFD;
			goto fail;
		}

		*name = '\0';
		name++;
		cluster = table2;

		if (strlen(name) > GFS_MOUNTGROUP_LEN) {
			rv = -ENAMETOOLONG;
			goto fail;
		}

		mg = find_mg(name);
		if (mg) {
			if (strcmp(mg->mount_args.dev, ma->dev)) {
				log_error("different fs dev %s with same name",
					  mg->mount_args.dev);
				rv = -EADDRINUSE;
			} else if (mg->leaving) {
				/* we're leaving the group */
				log_error("join: reject mount due to unmount");
				rv = -ESTALE;
			} else if (mg->mount_client || !mg->kernel_mount_done) {
				log_error("join: other mount in progress %d %d",
					  mg->mount_client, mg->kernel_mount_done);
				rv = -EBUSY;
			} else {
				log_group(mg, "join: already mounted");
				rv = -EALREADY;
			}
			/* mg is an existing mountgroup here and must not be freed */
			goto fail;
		}

		mg = create_mg(name);
		if (!mg) {
			rv = -ENOMEM;
			goto fail;
		}
		mg->mount_client = ci;
		memcpy(&mg->mount_args, ma, sizeof(struct gfsc_mount_args));

		if (strlen(cluster) != strlen(clustername) ||
		    strlen(cluster) == 0 || strcmp(cluster, clustername)) {
			log_error("join: fs requires cluster=\"%s\" current=\"%s\"",
				  cluster, clustername);
			rv = -EBADR;
			goto fail_free;
		}
		log_group(mg, "join: cluster name matches: %s", clustername);

		if (strstr(ma->options, "spectator")) {
			log_group(mg, "join: spectator mount");
			mg->spectator = 1;
		} else {
			if (!we_are_in_fence_domain()) {
				log_error("join: not in default fence domain");
				rv = -ENOANO;
				goto fail_free;
			}
		}

		if (!mg->spectator && strstr(ma->options, "rw"))
			mg->rw = 1;
		else if (strstr(ma->options, "ro")) {
			if (mg->spectator) {
				log_error("join: readonly invalid with spectator");
				rv = -EROFS;
				goto fail_free;
			}
			mg->ro = 1;
		}

		list_add(&mg->list, &mountgroups);

		if (group_mode == GROUP_LIBGROUP)
			rv = gfs_join_mountgroup_old(mg, ma);
		else
			rv = gfs_join_mountgroup(mg);

		if (rv) {
			log_error("join: group join error %d", rv);
			list_del(&mg->list);
			goto fail_free;
		}
		return;

	fail_free:
		free(mg);
	fail:
		client_reply_join(ci, ma, rv);
	}

	/*
	 * Leave the mountgroup named in table ("<cluster>:<name>").
	 * mnterr is passed through to the leave routine for the current
	 * group_mode.  A NULL table or one without ':' is logged and ignored.
	 */
	static void do_leave(char *table, int mnterr)
	{
		char *colon = table ? strchr(table, ':') : NULL;
		char *name;

		if (!colon) {
			log_error("leave: invalid table name");
			return;
		}
		name = colon + 1;

		log_debug("leave: %s mnterr %d", name, mnterr);

		if (group_mode == GROUP_LIBGROUP)
			gfs_leave_mountgroup_old(name, mnterr);
		else
			gfs_leave_mountgroup(name, mnterr);
	}

	/*
	 * Record that the kernel mount for the mountgroup named in table
	 * ("<cluster>:<name>") has finished with "result", then notify the
	 * group code for the current group_mode.
	 * A table without ':' or an unknown mountgroup is logged and ignored.
	 */
	static void do_mount_done(char *table, int result)
	{
		struct mountgroup *mg;
		char *colon = strchr(table, ':');
		char *name;

		if (!colon) {
			log_error("mount_done: invalid table name %s", table);
			return;
		}
		name = colon + 1;

		log_debug("mount_done: %s result %d", name, result);

		mg = find_mg(name);
		if (!mg) {
			log_error("mount_done: %s not found", name);
			return;
		}

		mg->mount_client = 0;
		mg->kernel_mount_done = 1;
		mg->kernel_mount_error = result;

		if (group_mode == GROUP_LIBGROUP)
			send_mount_status_old(mg);
		else
			gfs_mount_done(mg);
	}

	+/* Send the GFSC_CMD_FS_REMOUNT reply for mg, with mg's saved mount args
	+   as the payload, to client ci. */
	+void client_reply_remount(struct mountgroup *mg, int ci, int result)
	+{
	+	int reply_fd = client[ci].fd;
	+
	+	do_reply(reply_fd, GFSC_CMD_FS_REMOUNT, mg->name, result,
	+		 &mg->mount_args, sizeof(struct gfsc_mount_args));
	+}
	+
	+/*
	+ * Handle a GFSC_CMD_FS_REMOUNT request from client ci.
	+ *
	+ * mount.gfs creates a special ma->options string with only "ro" or "rw".
	+ *
	+ * An unknown mountgroup or a spectator mount is rejected with result -1.
	+ * A request that does not change the current ro state is answered
	+ * immediately with result 0.  Otherwise the remount is sent to the
	+ * group; in libgroup mode the reply is sent later by the receive path.
	+ */
	+static void do_remount(int ci, struct gfsc_mount_args *ma)
	+{
	+	struct mountgroup *mg;
	+	char *colon = strchr(ma->table, ':');
	+	/* fall back to the whole table string when there is no ':' */
	+	char *name = colon ? colon + 1 : ma->table;
	+	int ro = 0, result = 0;
	+
	+	log_debug("remount: %s ci %d options %s", name, ci, ma->options);
	+
	+	mg = find_mg(name);
	+	if (!mg) {
	+		log_error("remount: %s not found", name);
	+		result = -1;
	+		goto out;
	+	}
	+
	+	if (mg->spectator) {
	+		log_error("remount of spectator not allowed");
	+		result = -1;
	+		goto out;
	+	}
	+
	+	if (!strcmp(ma->options, "ro"))
	+		ro = 1;
	+
	+	if ((mg->ro && ro) || (!mg->ro && !ro))
	+		goto out;
	+
	+	if (group_mode == GROUP_LIBGROUP) {
	+		/* the receive calls client_reply_remount */
	+		mg->remount_client = ci;
	+		send_remount_old(mg, ma);
	+		return;
	+	}
	+
	+	send_remount(mg, ma);
	+out:
	+	/* client_reply_remount() dereferences mg, so when the lookup failed
	+	   the reply is built from the request itself */
	+	if (mg)
	+		client_reply_remount(mg, ci, result);
	+	else
	+		do_reply(client[ci].fd, GFSC_CMD_FS_REMOUNT, name, result,
	+			 ma, sizeof(struct gfsc_mount_args));
	+}
	+
	void process_connection(int ci)
	{
	struct gfsc_header h;
	struct gfsc_mount_args empty;
	struct gfsc_mount_args *ma;
	char *extra = NULL;
	int rv, extra_len;

	rv = do_read(client[ci].fd, &h, sizeof(h));
	if (rv < 0) {
	log_debug("connection %d read error %d", ci, rv);
	goto out;
	}

	if (h.magic != GFSC_MAGIC) {
	log_debug("connection %d magic error %x", ci, h.magic);
	goto out;
	}

	if ((h.version & 0xFFFF0000) != (GFSC_VERSION & 0xFFFF0000)) {
	log_debug("connection %d version error %x", ci, h.version);
	goto out;
	}

	if (h.len > sizeof(h)) {
	extra_len = h.len - sizeof(h);
	extra = malloc(extra_len);
	if (!extra) {
	log_error("process_connection no mem %d", extra_len);
	goto out;
	}
	memset(extra, 0, extra_len);

	rv = do_read(client[ci].fd, extra, extra_len);
	if (rv < 0) {
	log_debug("connection %d extra read error %d", ci, rv);
	goto out;
	}
	}

	ma = (struct gfsc_mount_args *)extra;

	if (!ma) {
	memset(&empty, 0, sizeof(empty));

	if (h.command == GFSC_CMD_FS_JOIN \|\|
	h.command == GFSC_CMD_FS_REMOUNT) {
	do_reply(client[ci].fd, h.command, h.name, -EINVAL,
	&empty, sizeof(empty));
	}
	log_debug("connection %d cmd %d no data", ci, h.command);
	goto out;
	}

	switch (h.command) {

	case GFSC_CMD_FS_JOIN:
	do_join(ci, ma);
	break;

	case GFSC_CMD_FS_LEAVE:
	do_leave(ma->table, h.data);
	break;

	case GFSC_CMD_FS_MOUNT_DONE:
	do_mount_done(ma->table, h.data);
	break;

	case GFSC_CMD_FS_REMOUNT:
	- if (group_mode == GROUP_LIBGROUP)
	- remount_mountgroup_old(ci, ma);
	-#if 0
	- /* FIXME */
	- else
	- remount_mountgroup(ci, ma);
	-#endif
	+ do_remount(ci, ma);
	break;

	default:
	log_error("process_connection %d unknown command %d",
	ci, h.command);
	}
	out:
	if (extra)
	free(extra);

	/* no client_dead(ci) here, since the connection for
	join/remount is reused */
	}

	static void process_listener(int ci)
	{
	int fd, i;

	fd = accept(client[ci].fd, NULL, NULL);
	if (fd < 0) {
	log_error("process_listener: accept error %d %d", fd, errno);
	return;
	}

	i = client_add(fd, process_connection, NULL);

	log_debug("client connection %d fd %d", i, fd);
	}

	/*
	 * Create an AF_LOCAL stream socket, bind it to sock_path and start
	 * listening with a backlog of 5.
	 *
	 * sock_path is copied in starting at sun_path[1], so sun_path[0] stays
	 * zero from the memset.
	 *
	 * Returns the listening fd, or the negative return value of the call
	 * that failed.
	 */
	static int setup_listener(char *sock_path)
	{
		struct sockaddr_un sa;
		socklen_t sa_len;
		int fd, err;

		/* we listen for new client connections on socket fd */

		fd = socket(AF_LOCAL, SOCK_STREAM, 0);
		if (fd < 0) {
			log_error("socket error %d %d", fd, errno);
			return fd;
		}

		memset(&sa, 0, sizeof(sa));
		sa.sun_family = AF_LOCAL;
		strcpy(&sa.sun_path[1], sock_path);
		sa_len = sizeof(sa_family_t) + strlen(sa.sun_path+1) + 1;

		err = bind(fd, (struct sockaddr *) &sa, sa_len);
		if (err < 0) {
			log_error("bind error %d %d", err, errno);
			goto fail;
		}

		err = listen(fd, 5);
		if (err < 0) {
			log_error("listen error %d %d", err, errno);
			goto fail;
		}
		return fd;

	fail:
		close(fd);
		return err;
	}

	/* Take query_mutex.  loop() holds it while dispatching client work and
	   process_queries() holds it while answering a query. */
	void query_lock(void)
	{
	pthread_mutex_lock(&query_mutex);
	}

	/* Release query_mutex taken by query_lock(). */
	void query_unlock(void)
	{
	pthread_mutex_unlock(&query_mutex);
	}

	/*
	 * This is a thread, so we have to be careful, don't call log_ functions.
	 * We need a thread to process queries because the main thread may block
	 * for long periods.
	 *
	 * arg points to an int holding the listening query socket.  Each
	 * accepted connection carries one request header; the request is
	 * answered under query_lock() and the connection is then closed.
	 * The function loops forever and never returns.
	 */
	static void *process_queries(void *arg)
	{
		struct gfsc_header h;
		int s = *((int *)arg);
		int f, rv;

		for (;;) {
			f = accept(s, NULL, NULL);
			/* nothing to read from or close when accept fails */
			if (f < 0)
				continue;

			rv = do_read(f, &h, sizeof(h));
			if (rv < 0) {
				goto out;
			}

			if (h.magic != GFSC_MAGIC) {
				goto out;
			}

			if ((h.version & 0xFFFF0000) != (GFSC_VERSION & 0xFFFF0000)) {
				goto out;
			}

			query_lock();

			switch (h.command) {
			case GFSC_CMD_DUMP_DEBUG:
				query_dump_debug(f);
				break;
			case GFSC_CMD_DUMP_PLOCKS:
				query_dump_plocks(f, h.name);
				break;
			case GFSC_CMD_MOUNTGROUP_INFO:
				query_mountgroup_info(f, h.name);
				break;
			case GFSC_CMD_NODE_INFO:
				query_node_info(f, h.name, h.data);
				break;
			case GFSC_CMD_MOUNTGROUPS:
				query_mountgroups(f, h.data);
				break;
			case GFSC_CMD_MOUNTGROUP_NODES:
				query_mountgroup_nodes(f, h.name, h.option, h.data);
				break;
			default:
				break;
			}
			query_unlock();

	out:
			close(f);
		}
	}

	/*
	 * Create the query listening socket, initialise query_mutex and start
	 * the thread that runs process_queries().
	 * Returns 0 on success, a negative value on failure.
	 */
	static int setup_queries(void)
	{
		/* static storage: the thread reads the socket through the pointer
		   passed below, possibly after this function has returned */
		static int query_sock;
		int rv;

		rv = setup_listener(GFSC_QUERY_SOCK_PATH);
		if (rv < 0)
			return rv;
		query_sock = rv;

		pthread_mutex_init(&query_mutex, NULL);

		/* pthread_create returns 0 or a positive error number, never a
		   negative value */
		rv = pthread_create(&query_thread, NULL, process_queries, &query_sock);
		if (rv) {
			log_error("can't create query thread");
			close(query_sock);
			return -rv;
		}
		return 0;
	}

	/* Dead-connection callback passed to client_add() for the cman, groupd
	   and old cpg connections in loop(): logs the failure and raises
	   daemon_quit.  The client index ci is not used. */
	void cluster_dead(int ci)
	{
	log_error("cluster is down, exiting");
	daemon_quit = 1;
	}

	/* Dead-connection callback for the dlm_controld connection: the loss is
	   only logged as an error while mountgroups exist; the client entry is
	   then handed to client_dead(). */
	static void dlmcontrol_dead(int ci)
	{
	if (!list_empty(&mountgroups))
	log_error("dlm_controld is gone");
	client_dead(ci);
	}

	/*
	 * Daemon main loop.
	 *
	 * Sets up the query thread, the client listener, cman, ccs, logging,
	 * the uevent socket and the group infrastructure for the selected
	 * group_mode, registering each fd with client_add().  It then polls
	 * all registered fds, calling a client's work function on POLLIN and
	 * its dead function on POLLERR/POLLHUP/POLLNVAL.
	 *
	 * The loop ends when a setup step or poll() fails, or when daemon_quit
	 * is set.  If poll() is interrupted while daemon_quit is set but
	 * mountgroups still exist, daemon_quit is cleared and polling goes on.
	 */
	static void loop(void)
	{
		int poll_timeout = -1;
		int rv, i;
		void (*workfn) (int ci);
		void (*deadfn) (int ci);

		rv = setup_queries();
		if (rv < 0)
			goto out;

		rv = setup_listener(GFSC_SOCK_PATH);
		if (rv < 0)
			goto out;
		client_add(rv, process_listener, NULL);

		rv = setup_cman();
		if (rv < 0)
			goto out;
		client_add(rv, process_cman, cluster_dead);

		rv = setup_ccs();
		if (rv < 0)
			goto out;

		setup_logging();

		rv = check_uncontrolled_filesystems();
		if (rv < 0)
			goto out;

		rv = setup_uevent();
		if (rv < 0)
			goto out;
		client_add(rv, process_uevent, NULL);

		group_mode = GROUP_LIBCPG;

		if (cfgd_groupd_compat) {
			rv = setup_groupd();
			if (rv < 0)
				goto out;
			client_add(rv, process_groupd, cluster_dead);

			group_mode = GROUP_LIBGROUP;
			if (cfgd_groupd_compat == 2)
				set_group_mode();
		}
		log_debug("group_mode %d compat %d", group_mode, cfgd_groupd_compat);

		if (group_mode == GROUP_LIBCPG) {

			/*
			 * The new, good, way of doing things using libcpg directly.
			 * code in: cpg-new.c
			 */

			rv = setup_cpg();
			if (rv < 0)
				goto out;

			rv = setup_dlmcontrol();
			if (rv < 0)
				goto out;
			client_add(rv, process_dlmcontrol, dlmcontrol_dead);

		} else if (group_mode == GROUP_LIBGROUP) {

			/*
			 * The old, bad, way of doing things using libgroup.
			 * code in: cpg-old.c group.c plock.c
			 */

			rv = setup_cpg_old();
			if (rv < 0)
				goto out;
			client_add(rv, process_cpg_old, cluster_dead);

			rv = setup_plocks();
			if (rv < 0)
				goto out;
			plock_fd = rv;
			plock_ci = client_add(rv, process_plocks, NULL);
		}

		for (;;) {
			rv = poll(pollfd, client_maxi + 1, poll_timeout);
			if (rv == -1 && errno == EINTR) {
				if (daemon_quit && list_empty(&mountgroups))
					goto out;
				daemon_quit = 0;
				continue;
			}
			if (rv < 0) {
				log_error("poll errno %d", errno);
				goto out;
			}

			/* FIXME: lock/unlock around operations that take a while */
			query_lock();

			for (i = 0; i <= client_maxi; i++) {
				if (client[i].fd < 0)
					continue;
				if (pollfd[i].revents & POLLIN) {
					workfn = client[i].workfn;
					workfn(i);
				}
				if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
					deadfn = client[i].deadfn;
					deadfn(i);
				}
			}

			if (daemon_quit)
				break;

			poll_timeout = -1;

			if (poll_dlm) {
				/* only happens for GROUP_LIBCPG */
				process_mountgroups();
				poll_timeout = 500;
			}

			if (poll_ignore_plock) {
				/* only happens for GROUP_LIBGROUP */
				if (!limit_plocks()) {
					poll_ignore_plock = 0;
					client_back(plock_ci, plock_fd);
				}
				poll_timeout = 1000;
			}

			if (dmsetup_wait) {
				update_dmsetup_wait();
				if (dmsetup_wait) {
					if (poll_timeout == -1)
						poll_timeout = 1000;
				} else {
					if (poll_timeout == 1000)
						poll_timeout = -1;
				}
			}

			query_unlock();
		}
	out:
		if (cfgd_groupd_compat)
			close_groupd();
		close_logging();
		close_ccs();
		close_cman();

		if (!list_empty(&mountgroups))
			log_error("mountgroups abandoned");
	}

	/*
	 * Make sure only one instance of the daemon runs: open or create
	 * LOCKFILE_NAME, take a write lock on the whole file, truncate it and
	 * write our pid into it.  Any failure prints a message and exits.
	 *
	 * The fd is not closed on the success path.
	 */
	static void lockfile(void)
	{
		int fd, error;
		struct flock lock;
		char buf[33];

		memset(buf, 0, sizeof(buf));

		fd = open(LOCKFILE_NAME, O_CREAT|O_WRONLY,
			  S_IRUSR|S_IWUSR|S_IRGRP|S_IROTH);
		if (fd < 0) {
			fprintf(stderr, "cannot open/create lock file %s\n",
				LOCKFILE_NAME);
			exit(EXIT_FAILURE);
		}

		/* l_start 0 with l_len 0 covers the whole file */
		lock.l_type = F_WRLCK;
		lock.l_start = 0;
		lock.l_whence = SEEK_SET;
		lock.l_len = 0;

		error = fcntl(fd, F_SETLK, &lock);
		if (error) {
			fprintf(stderr, "gfs_controld is already running\n");
			exit(EXIT_FAILURE);
		}

		error = ftruncate(fd, 0);
		if (error) {
			fprintf(stderr, "cannot clear lock file %s\n", LOCKFILE_NAME);
			exit(EXIT_FAILURE);
		}

		snprintf(buf, sizeof(buf), "%d\n", getpid());

		error = write(fd, buf, strlen(buf));
		if (error <= 0) {
			fprintf(stderr, "cannot write lock file %s\n", LOCKFILE_NAME);
			exit(EXIT_FAILURE);
		}
	}

	/* Print the command line help text to stdout.  The defaults shown come
	   from the DEFAULT_* macros. */
	static void print_usage(void)
	{
	printf("Usage:\n");
	printf("\n");
	printf("gfs_controld [options]\n");
	printf("\n");
	printf("Options:\n");
	printf("\n");
	printf(" -D Enable debugging code and don't fork\n");
	printf(" -L <num> Enable (1) or disable (0) debugging to logsys (default %d)\n", DEFAULT_DEBUG_LOGSYS);
	printf(" -g <num> groupd compatibility mode, 0 off, 1 on, 2 detect\n");
	printf(" 0: use libcpg, no backward compat, best performance\n");
	printf(" 1: use libgroup for compat with cluster2/rhel5\n");
	printf(" 2: use groupd to detect old, or mode 1, nodes that\n"
	" require compat, use libcpg if none found\n");
	printf(" Default is %d\n", DEFAULT_GROUPD_COMPAT);
	printf(" -w <num> Enable (1) or disable (0) withdraw\n");
	printf(" Default is %d\n", DEFAULT_ENABLE_WITHDRAW);
	printf(" -p <num> Enable (1) or disable (0) plock code\n");
	printf(" Default is %d\n", DEFAULT_ENABLE_PLOCK);
	printf(" -P Enable plock debugging\n");

	printf(" -l <limit> Limit the rate of plock operations\n");
	printf(" Default is %d, set to 0 for no limit\n", DEFAULT_PLOCK_RATE_LIMIT);
	printf(" -o <n> Enable (1) or disable (0) plock ownership\n");
	printf(" Default is %d\n", DEFAULT_PLOCK_OWNERSHIP);
	printf(" -t <ms> plock ownership drop resources time (milliseconds)\n");
	printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_TIME);
	printf(" -c <num> plock ownership drop resources count\n");
	printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_COUNT);
	printf(" -a <ms> plock ownership drop resources age (milliseconds)\n");
	printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_AGE);
	printf(" -h Print this help, then exit\n");
	printf(" -V Print program version information, then exit\n");
	}

	#define OPTION_STRING "L:DKg:w:f:q:d:p:Pl:o:t:c:a:hV"

	/*
	 * Parse the command line with getopt() and OPTION_STRING.
	 *
	 * Each recognised option sets its optd_* flag and stores the value in
	 * the matching cfgd_* variable.  -h and -V print and exit.  A getopt
	 * error, or an option letter with no case below, prints a message and
	 * exits with failure.
	 *
	 * When -L was not given, the GFS_CONTROLD_DEBUG environment variable,
	 * if set, supplies the logsys debug setting.
	 */
	static void read_arguments(int argc, char **argv)
	{
		char *env_debug;
		int optchar;

		while ((optchar = getopt(argc, argv, OPTION_STRING)) != EOF) {
			switch (optchar) {

			case 'D':
				daemon_debug_opt = 1;
				break;

			case 'L':
				optd_debug_logsys = 1;
				cfgd_debug_logsys = atoi(optarg);
				break;

			case 'g':
				optd_groupd_compat = 1;
				cfgd_groupd_compat = atoi(optarg);
				break;

			case 'w':
				optd_enable_withdraw = 1;
				cfgd_enable_withdraw = atoi(optarg);
				break;

			case 'p':
				optd_enable_plock = 1;
				cfgd_enable_plock = atoi(optarg);
				break;

			case 'P':
				optd_plock_debug = 1;
				cfgd_plock_debug = 1;
				break;

			case 'l':
				optd_plock_rate_limit = 1;
				cfgd_plock_rate_limit = atoi(optarg);
				break;

			case 'o':
				optd_plock_ownership = 1;
				cfgd_plock_ownership = atoi(optarg);
				break;

			case 't':
				optd_drop_resources_time = 1;
				cfgd_drop_resources_time = atoi(optarg);
				break;

			case 'c':
				optd_drop_resources_count = 1;
				cfgd_drop_resources_count = atoi(optarg);
				break;

			case 'a':
				optd_drop_resources_age = 1;
				cfgd_drop_resources_age = atoi(optarg);
				break;

			case 'h':
				print_usage();
				exit(EXIT_SUCCESS);

			case 'V':
				printf("gfs_controld %s (built %s %s)\n",
				       RELEASE_VERSION, __DATE__, __TIME__);
				printf("%s\n", REDHAT_COPYRIGHT);
				exit(EXIT_SUCCESS);

			case ':':
			case '?':
				fprintf(stderr, "Please use '-h' for usage.\n");
				exit(EXIT_FAILURE);

			default:
				fprintf(stderr, "unknown option: %c\n", optchar);
				exit(EXIT_FAILURE);
			}
		}

		if (optd_debug_logsys)
			return;

		env_debug = getenv("GFS_CONTROLD_DEBUG");
		if (env_debug) {
			optd_debug_logsys = 1;
			cfgd_debug_logsys = atoi(env_debug);
		}
	}

	/* Write val to /proc/self/oom_adj.  Nothing happens when the file
	   cannot be opened for writing. */
	static void set_oom_adj(int val)
	{
		FILE *oom_file = fopen("/proc/self/oom_adj", "w");

		if (oom_file) {
			fprintf(oom_file, "%i", val);
			fclose(oom_file);
		}
	}

	/* Switch this process to SCHED_RR at the maximum priority for that
	   policy.  Failures are logged and otherwise ignored. */
	static void set_scheduler(void)
	{
		struct sched_param param;
		int max_prio;

		max_prio = sched_get_priority_max(SCHED_RR);
		if (max_prio == -1) {
			log_error("could not get maximum scheduler priority err %d",
				  errno);
			return;
		}

		param.sched_priority = max_prio;
		if (sched_setscheduler(0, SCHED_RR, &param) == -1)
			log_error("could not set SCHED_RR priority %d err %d",
				  param.sched_priority, errno);
	}

	/*
	 * Daemon entry point: initialise the mountgroups list and logging,
	 * parse options, take the lock file, call daemon() unless
	 * daemon_debug_opt is set, install the SIGTERM handler, adjust
	 * scheduling and oom_adj, then run loop().
	 */
	int main(int argc, char **argv)
	{
		INIT_LIST_HEAD(&mountgroups);

		init_logging();
		read_arguments(argc, argv);
		lockfile();

		if (!daemon_debug_opt && daemon(0, 0) < 0) {
			perror("daemon error");
			exit(EXIT_FAILURE);
		}

		signal(SIGTERM, sigterm_handler);

		set_scheduler();
		set_oom_adj(-16);

		loop();

		return 0;
	}

	/*
	 * Append the string in daemon_debug_buf to dump_buf at dump_point.
	 * When dump_point reaches GFSC_DUMP_SIZE it goes back to 0 and
	 * dump_wrap is set.
	 */
	void daemon_dump_save(void)
	{
		const char *src = daemon_debug_buf;
		int remaining = strlen(daemon_debug_buf);

		while (remaining-- > 0) {
			dump_buf[dump_point] = *src++;
			dump_point++;

			if (dump_point == GFSC_DUMP_SIZE) {
				dump_point = 0;
				dump_wrap = 1;
			}
		}
	}

	/* Daemon-wide state defined in this file. */
	int daemon_debug_opt; /* set by -D; main() skips daemon() when set */
	int daemon_quit; /* set by sigterm_handler() and cluster_dead(); checked by loop() */
	int poll_ignore_plock; /* when set, loop() calls limit_plocks() and polls with a 1000 ms timeout */
	int poll_dlm; /* when set, loop() calls process_mountgroups() and polls with a 500 ms timeout */
	int plock_fd; /* fd returned by setup_plocks() in loop() */
	int plock_ci; /* client index returned by client_add() for plock_fd */
	struct list_head mountgroups; /* list searched by find_mg() and find_mg_id() */
	int cman_quorate;
	int our_nodeid;
	char *clustername; /* compared with the cluster part of the fs table in do_join() */
	char daemon_debug_buf[256]; /* string copied into dump_buf by daemon_dump_save() */
	char dump_buf[GFSC_DUMP_SIZE]; /* debug text sent to clients by query_dump_debug() */
	int dump_point; /* next write position in dump_buf */
	int dump_wrap; /* set once dump_point has wrapped back to 0 */
	char plock_dump_buf[GFSC_DUMP_SIZE]; /* sent to clients by query_dump_plocks() */
	int plock_dump_len; /* number of bytes of plock_dump_buf to send */
	int dmsetup_wait; /* while set, loop() calls update_dmsetup_wait() on each pass */
	cpg_handle_t libcpg_handle;
	int libcpg_flow_control_on;
	int group_mode; /* GROUP_LIBCPG or GROUP_LIBGROUP, chosen in loop() */

File Metadata

Mime Type: text/x-diff
Expires: Sat, Nov 23, 11:07 AM (1 d, 18 h)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1018646
Default Alt Text: (193 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions