No OneTemporary
Actions

Size

65 KB

Referenced Files

None

Subscribers

None

View Options

	diff --git a/crmd/throttle.c b/crmd/throttle.c
	index 169594b399..6327170e90 100644
	--- a/crmd/throttle.c
	+++ b/crmd/throttle.c
	@@ -1,734 +1,676 @@
	/*
	* Copyright (C) 2013 Andrew Beekhof <andrew@beekhof.net>
	*
	* This program is free software; you can redistribute it and/or
	* modify it under the terms of the GNU General Public
	* License as published by the Free Software Foundation; either
	* version 2 of the License, or (at your option) any later version.
	*
	* This software is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* General Public License for more details.
	*
	* You should have received a copy of the GNU General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include <crm_internal.h>

	#include <sys/types.h>
	#include <sys/stat.h>

	#include <unistd.h>
	#include <ctype.h>
	#include <dirent.h>

	#include <crm/crm.h>
	#include <crm/msg_xml.h>
	#include <crm/cluster.h>

	#include <crmd_fsa.h>
	#include <throttle.h>


	/* Throttle levels, listed from most to least severe.  Each level has its
	 * own hex digit so that throttle_mode() can OR several levels together
	 * and then pick the most severe one that is set. */
	enum throttle_state_e
	{
	throttle_extreme = 0x1000,
	throttle_high = 0x0100,
	throttle_med = 0x0010,
	throttle_low = 0x0001,
	throttle_none = 0x0000,
	};

	/* Per-node throttling record stored in the throttle_records hash table */
	struct throttle_record_s
	{
	int max; /* job maximum recorded for the node */
	enum throttle_state_e mode; /* throttle mode recorded for the node */
	char *node; /* heap-allocated node name; also used as the hash table key */
	};

	int throttle_job_max = 0;
	float throttle_load_target = 0.0;

	#define THROTTLE_FACTOR_LOW 1.2
	#define THROTTLE_FACTOR_MEDIUM 1.6
	#define THROTTLE_FACTOR_HIGH 2.0

	GHashTable *throttle_records = NULL;
	mainloop_timer_t *throttle_timer = NULL;

	/*
	 * Count the processors listed in /proc/cpuinfo.
	 *
	 * A non-zero count is kept in a function-local static and returned
	 * directly on later calls.  If the file cannot be opened, or contains no
	 * line beginning with "processor", a warning is logged and 1 is returned.
	 */
	int throttle_num_cores(void)
	{
	    static int cores = 0;
	    static const char prefix[] = "processor";
	    const char *cpufile = "/proc/cpuinfo";
	    char line[256];
	    FILE *fp = NULL;

	    if (cores != 0) {
	        return cores;
	    }

	    fp = fopen(cpufile, "r");
	    if (fp == NULL) {
	        int rc = errno;
	        crm_warn("Couldn't read %s, assuming a single processor: %s (%d)", cpufile, pcmk_strerror(rc), rc);
	        return 1;
	    }

	    /* One "processor" line is expected per logical CPU */
	    while (fgets(line, sizeof(line), fp) != NULL) {
	        if (strncmp(line, prefix, sizeof(prefix) - 1) == 0) {
	            cores++;
	        }
	    }
	    fclose(fp);

	    if (cores == 0) {
	        crm_warn("No processors found in %s, assuming 1", cpufile);
	        return 1;
	    }
	    return cores;
	}

	/*
	* \internal
	* \brief Return name of /proc file containing the CIB daemon's load statistics
	*
	* \return Newly allocated memory with file name on success, NULL otherwise
	*
	* \note It is the caller's responsibility to free the return value.
	* This will return NULL if the daemon is being run via valgrind.
	* This should be called only on Linux systems.
	*/
	static char *find_cib_loadfile(void)
	{
	- DIR *dp;
	- struct dirent *entry;
	- struct stat statbuf;
	- char *match = NULL;
	- char procpath[128];
	- char value[64];
	- char key[16];
	-
	- dp = opendir("/proc");
	- if (!dp) {
	- /* no proc directory to search through */
	- crm_notice("Can not read /proc directory to track existing components");
	- return NULL;
	- }
	-
	- /* Iterate through contents of /proc */
	- while ((entry = readdir(dp)) != NULL) {
	- FILE *file;
	- int pid;
	-
	- /* We're only interested in entries whose name is a PID,
	- * so skip anything non-numeric or that is too long.
	- *
	- * 114 = 128 - strlen("/proc/") - strlen("/status") - 1
	- */
	- pid = atoi(entry->d_name);
	- if ((pid <= 0) \|\| (strlen(entry->d_name) > 114)) {
	- continue;
	- }
	-
	- /* We're only interested in subdirectories */
	- strcpy(procpath, "/proc/");
	- strcat(procpath, entry->d_name);
	- if (lstat(procpath, &statbuf) \|\| !S_ISDIR(statbuf.st_mode)) {
	- continue;
	- }
	-
	- /* Read the first entry ("Name:") from the process's status file.
	- * We could handle the valgrind case if we parsed the cmdline file
	- * instead, but that's more of a pain than it's worth.
	- */
	- strcat(procpath, "/status");
	- file = fopen(procpath, "r");
	- if (!file) {
	- continue;
	- }
	- if (fscanf(file, "%15s%63s", key, value) != 2) {
	- fclose(file);
	- continue;
	- }
	- fclose(file);
	-
	- if (safe_str_eq("cib", value)) {
	- /* We found the CIB! */
	- match = crm_strdup_printf("/proc/%d/stat", pid);
	- break;
	- }
	- }
	/* The replacement below delegates the /proc scan to crm_procfs_pid_of();
	 * a result of 0 is treated as "CIB not found" and yields NULL.
	 * NOTE(review): confirm crm_procfs_pid_of() never returns a negative value. */
	+ int pid = crm_procfs_pid_of("cib");

	- closedir(dp);
	- return match;
	+ return pid? crm_strdup_printf("/proc/%d/stat", pid) : NULL;
	}

	/*
	 * Measure the CPU time consumed by the CIB daemon since the previous call.
	 *
	 * \param[out] load  Set to 0.0 on entry; on a successful follow-up call it
	 *                   receives (user + system ticks used) / ticks-per-second
	 *                   / seconds elapsed since the previous successful call.
	 *
	 * \return TRUE if the stat file was read and parsed, FALSE otherwise
	 *
	 * The first successful call only records a baseline and leaves *load at 0.
	 * The stat file name is cached in a static and looked up again after a
	 * failed fopen().
	 *
	 * The first 15 fields of /proc/[pid]/stat are parsed (see proc(5)):
	 *
	 *   (1)  pid     %d   process ID
	 *   (2)  comm    %s   executable file name, in parentheses
	 *   (3)  state   %c   one of "RSDZTW"
	 *   (4)  ppid    %d   PID of the parent
	 *   (5)  pgrp    %d   process group ID
	 *   (6)  session %d   session ID
	 *   (7)  tty_nr  %d   controlling terminal
	 *   (8)  tpgid   %d   foreground process group of the controlling terminal
	 *   (9)  flags   %u   kernel flags word (%lu before Linux 2.6.22)
	 *   (10) minflt  %lu  minor faults
	 *   (11) cminflt %lu  minor faults of waited-for children
	 *   (12) majflt  %lu  major faults
	 *   (13) cmajflt %lu  major faults of waited-for children
	 *   (14) utime   %lu  user-mode time in clock ticks (includes guest time)
	 *   (15) stime   %lu  kernel-mode time in clock ticks
	 *
	 * Tick counts are converted to seconds with sysconf(_SC_CLK_TCK).
	 */
	static bool throttle_cib_load(float *load)
	{
	    static char *loadfile = NULL;
	    static time_t last_call = 0;
	    static long ticks_per_s = 0;
	    static unsigned long last_utime, last_stime;

	    char buffer[64*1024];
	    FILE *stream = NULL;
	    time_t now = time(NULL);

	    if(load == NULL) {
	        return FALSE;
	    }
	    *load = 0.0;

	    if(loadfile == NULL) {
	        /* (Re)discover the CIB and start again from a clean baseline */
	        last_call = 0;
	        last_utime = 0;
	        last_stime = 0;
	        loadfile = find_cib_loadfile();
	        if (loadfile == NULL) {
	            crm_warn("Couldn't find CIB load file");
	            return FALSE;
	        }
	        ticks_per_s = sysconf(_SC_CLK_TCK);
	        crm_trace("Found %s", loadfile);
	    }

	    stream = fopen(loadfile, "r");
	    if(stream == NULL) {
	        int rc = errno;

	        crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
	        /* Forget the cached name so that the next call searches again */
	        free(loadfile); loadfile = NULL;
	        return FALSE;
	    }

	    if(fgets(buffer, sizeof(buffer), stream)) {
	        char comm[256] = { 0 };
	        char state = 0;
	        int rc = 0, pid = 0, ppid = 0, pgrp = 0, session = 0, tty_nr = 0, tpgid = 0;
	        unsigned long flags = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0, utime = 0, stime = 0;

	        /* The field width keeps comm within its buffer */
	        rc = sscanf(buffer, "%d %255[^ ] %c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
	                    &pid, comm, &state,
	                    &ppid, &pgrp, &session, &tty_nr, &tpgid,
	                    &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime);

	        if(rc != 15) {
	            crm_err("Only %d of 15 fields found in %s", rc, loadfile);
	            fclose(stream);
	            return FALSE;

	        } else if(last_call > 0
	                  && last_call < now
	                  && last_utime <= utime
	                  && last_stime <= stime
	                  && ticks_per_s > 0) {

	            time_t elapsed = now - last_call;
	            unsigned long delta_utime = utime - last_utime;
	            unsigned long delta_stime = stime - last_stime;

	            *load = (delta_utime + delta_stime); /* Cast to a float before division */
	            *load /= ticks_per_s;
	            *load /= elapsed;
	            crm_debug("cib load: %f (%lu ticks in %ds)", *load, delta_utime + delta_stime, (int) elapsed);

	        } else {
	            /* First sample, clock went backwards, or counters were reset */
	            crm_debug("Init %lu + %lu ticks at %lld (%ld tps)", utime, stime, (long long) now, ticks_per_s);
	        }

	        last_call = now;
	        last_utime = utime;
	        last_stime = stime;

	        fclose(stream);
	        return TRUE;
	    }

	    fclose(stream);
	    return FALSE;
	}

	/*
	 * Read the 1-minute load average from /proc/loadavg.
	 *
	 * \param[out] load  Receives the first number found in the file
	 *
	 * \return TRUE if a line was read, FALSE if load is NULL, the file could
	 *         not be opened, or nothing could be read from it
	 */
	static bool throttle_load_avg(float *load)
	{
	    const char *loadfile = "/proc/loadavg";
	    char line[256];
	    bool have_value = FALSE;
	    FILE *fp = NULL;

	    if (load == NULL) {
	        return FALSE;
	    }

	    fp = fopen(loadfile, "r");
	    if (fp == NULL) {
	        int rc = errno;
	        crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
	        return FALSE;
	    }

	    if (fgets(line, sizeof(line), fp) != NULL) {
	        char *eol = strchr(line, '\n');

	        /* Grab the 1-minute average, ignore the rest */
	        *load = strtof(line, NULL);

	        /* Trim the newline so the whole line can be logged tidily */
	        if (eol != NULL) {
	            *eol = '\0';
	        }

	        crm_debug("Current load is %f (full: %s)", *load, line);
	        have_value = TRUE;
	    }

	    fclose(fp);
	    return have_value;
	}

	/*
	 * Estimate the share of CPU time spent waiting for I/O from /proc/stat.
	 *
	 * \param[out] load     Receives iowait ticks divided by the sum of all
	 *                      parsed CPU tick counters, as a fraction
	 * \param[out] blocked  If non-NULL, receives the "procs_blocked" value
	 *                      when that key is present in the data read
	 *
	 * \return TRUE if a line was read from /proc/stat, FALSE otherwise
	 *
	 * NOTE(review): only the first line of the file is read with fgets(), so
	 * a "procs_blocked" entry on a later line is never seen and *blocked is
	 * left as the caller initialized it.  Confirm whether the whole file
	 * should be read instead.
	 */
	static bool throttle_io_load(float *load, unsigned int *blocked)
	{
	    char buffer[64*1024];
	    FILE *stream = NULL;
	    const char *loadfile = "/proc/stat";

	    if(load == NULL) {
	        return FALSE;
	    }

	    stream = fopen(loadfile, "r");
	    if(stream == NULL) {
	        int rc = errno;
	        crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
	        return FALSE;
	    }

	    if(fgets(buffer, sizeof(buffer), stream)) {
	        /* Borrowed from procps-ng's sysinfo.c */

	        char *b = NULL;
	        unsigned long long cpu_use = 0;
	        unsigned long long cpu_nic = 0;
	        unsigned long long cpu_sys = 0;
	        unsigned long long cpu_idl = 0;
	        unsigned long long cpu_iow = 0; /* not separated out until the 2.5.41 kernel */
	        unsigned long long cpu_xxx = 0; /* not separated out until the 2.6.0-test4 kernel */
	        unsigned long long cpu_yyy = 0; /* not separated out until the 2.6.0-test4 kernel */
	        unsigned long long cpu_zzz = 0; /* not separated out until the 2.6.11 kernel */

	        long long duse = 0;
	        long long dsys = 0;
	        long long didl = 0;
	        long long diow = 0;
	        long long dstl = 0;
	        long long Div = 0;

	        b = strstr(buffer, "cpu ");
	        if(b) {
	            /* Older kernels supply fewer fields; the rest stay at 0 */
	            sscanf(b, "cpu %llu %llu %llu %llu %llu %llu %llu %llu",
	                   &cpu_use, &cpu_nic, &cpu_sys, &cpu_idl, &cpu_iow, &cpu_xxx, &cpu_yyy, &cpu_zzz);
	        }

	        if(blocked) {
	            b = strstr(buffer, "procs_blocked ");
	            if(b) {
	                sscanf(b, "procs_blocked %u", blocked);
	            }
	        }

	        duse = cpu_use + cpu_nic;
	        dsys = cpu_sys + cpu_xxx + cpu_yyy;
	        didl = cpu_idl;
	        diow = cpu_iow;
	        dstl = cpu_zzz;
	        Div = duse + dsys + didl + diow + dstl;
	        if (!Div) {
	            /* Avoid dividing by zero when nothing could be parsed */
	            Div = 1;
	            didl = 1;
	        }

	        /* vmstat output:
	         *
	         * procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu----
	         * r b swpd free buff cache si so bi bo in cs us sy id wa
	         * 1 0 5537800 958592 204180 1737740 1 1 12 15 0 0 2 1 97 0
	         *
	         * The last four columns are calculated as (divo2 = Div / 2):
	         *
	         * (unsigned)( (100*duse + divo2) / Div ),
	         * (unsigned)( (100*dsys + divo2) / Div ),
	         * (unsigned)( (100*didl + divo2) / Div ),
	         * (unsigned)( (100*diow + divo2) / Div )
	         *
	         * Here the "wa" value is wanted as a fraction rather than a rounded
	         * percentage, so divide in floating point; an integer division
	         * could only ever produce 0 or 1.
	         */
	        *load = (float) diow / (float) Div;
	        crm_debug("Current IO load is %f", *load);

	        fclose(stream);
	        return TRUE;
	    }

	    fclose(stream);
	    return FALSE;
	}

	/*
	 * Map a load figure onto a throttle level.
	 *
	 * \param[in] load   Raw load value
	 * \param[in] desc   Description of the load, used in log messages
	 * \param[in] cores  Core count used to scale the load; a value <= 0
	 *                   leaves the load unscaled
	 *
	 * \return throttle_high, throttle_med, throttle_low or throttle_none,
	 *         depending on how far the scaled load exceeds
	 *         throttle_load_target times the THROTTLE_FACTOR_* constants
	 */
	static enum throttle_state_e
	throttle_handle_load(float load, const char *desc, int cores)
	{
	    float adjusted_load = load;

	    if (cores == 1) {
	        /* On a single core machine, a load of 1.0 is already too high */
	        adjusted_load = load * THROTTLE_FACTOR_MEDIUM;

	    } else if (cores > 1) {
	        /* Normalize the load to be per-core */
	        adjusted_load = load / cores;
	    }
	    /* Otherwise: no fudging of the supplied load value */

	    if (adjusted_load > THROTTLE_FACTOR_HIGH * throttle_load_target) {
	        crm_notice("High %s detected: %f", desc, load);
	        return throttle_high;
	    }

	    if (adjusted_load > THROTTLE_FACTOR_MEDIUM * throttle_load_target) {
	        crm_info("Moderate %s detected: %f", desc, load);
	        return throttle_med;
	    }

	    if (adjusted_load > THROTTLE_FACTOR_LOW * throttle_load_target) {
	        crm_debug("Noticeable %s detected: %f", desc, load);
	        return throttle_low;
	    }

	    crm_trace("Negligable %s detected: %f", desc, adjusted_load);
	    return throttle_none;
	}

	/*
	 * Work out the local node's current throttle level.
	 *
	 * The CIB's own CPU usage, the system load average and the I/O load are
	 * each converted to a throttle level; the levels are OR'd together and
	 * the most severe one set is returned.
	 *
	 * When throttle_load_target is not positive, only the CIB load is
	 * considered and the accumulated bits are returned as they are.
	 *
	 * \return The throttle level to advertise for this node
	 */
	static enum throttle_state_e
	throttle_mode(void)
	{
	    int cores;
	    float load;
	    unsigned int blocked = 0;
	    enum throttle_state_e mode = throttle_none;

	#ifdef ON_SOLARIS
	    return throttle_none;
	#endif

	    cores = throttle_num_cores();
	    if(throttle_cib_load(&load)) {
	        float cib_max_cpu = 0.95;
	        const char *desc = "CIB load";
	        /* The CIB is a single threaded task and thus cannot consume
	         * more than 100% of a CPU (and 1/cores of the overall system
	         * load).
	         *
	         * On a many cored system, the CIB might therefore be maxed out
	         * (causing operations to fail or appear to fail) even though
	         * the overall system load is still reasonable.
	         *
	         * Therefore the 'normal' thresholds can not apply here and we
	         * need a special case.
	         */
	        if(cores == 1) {
	            cib_max_cpu = 0.4;
	        }
	        if(throttle_load_target > 0.0 && throttle_load_target < cib_max_cpu) {
	            cib_max_cpu = throttle_load_target;
	        }

	        if(load > 1.5 * cib_max_cpu) {
	            /* Can only happen on machines with a low number of cores */
	            crm_notice("Extreme %s detected: %f", desc, load);
	            mode |= throttle_extreme;

	        } else if(load > cib_max_cpu) {
	            crm_notice("High %s detected: %f", desc, load);
	            mode |= throttle_high;

	        } else if(load > cib_max_cpu * 0.9) {
	            crm_info("Moderate %s detected: %f", desc, load);
	            mode |= throttle_med;

	        } else if(load > cib_max_cpu * 0.8) {
	            crm_debug("Noticeable %s detected: %f", desc, load);
	            mode |= throttle_low;

	        } else {
	            crm_trace("Negligable %s detected: %f", desc, load);
	        }
	    }

	    if(throttle_load_target <= 0) {
	        /* If we ever make this a valid value, the cluster will at least behave as expected */
	        return mode;
	    }

	    if(throttle_load_avg(&load)) {
	        mode |= throttle_handle_load(load, "CPU load", cores);
	    }

	    if(throttle_io_load(&load, &blocked)) {
	        /* The IO load is passed with cores=0 so that it is not scaled */
	        mode |= throttle_handle_load(load, "IO load", 0);
	        mode |= throttle_handle_load(blocked, "blocked IO ratio", cores);
	    }

	    /* Report only the most severe level that was detected */
	    if(mode & throttle_extreme) {
	        return throttle_extreme;
	    } else if(mode & throttle_high) {
	        return throttle_high;
	    } else if(mode & throttle_med) {
	        return throttle_med;
	    } else if(mode & throttle_low) {
	        return throttle_low;
	    }
	    return throttle_none;
	}

	/*
	 * Broadcast the local throttle mode, but only when it differs from the
	 * mode that was sent last time.
	 *
	 * \param[in] mode  Throttle mode to advertise
	 *
	 * The message also carries the current throttle_job_max.
	 */
	static void
	throttle_send_command(enum throttle_state_e mode)
	{
	    static enum throttle_state_e last = -1;
	    xmlNode *request = NULL;

	    if (mode == last) {
	        /* Nothing new to report */
	        return;
	    }

	    crm_info("New throttle mode: %.4x (was %.4x)", mode, last);
	    last = mode;

	    request = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
	    crm_xml_add_int(request, F_CRM_THROTTLE_MODE, mode);
	    crm_xml_add_int(request, F_CRM_THROTTLE_MAX, throttle_job_max);

	    send_cluster_message(NULL, crm_msg_crmd, request, TRUE);
	    free_xml(request);
	}

	/*
	 * Timer callback: evaluate the local throttle mode and send it out.
	 *
	 * Nothing is sent until fsa_our_dc_version compares as at least "3.0.8";
	 * once that has been seen, the version is not checked again.
	 *
	 * \param[in] data  Unused
	 *
	 * \return Always TRUE
	 */
	static gboolean
	throttle_timer_cb(gpointer data)
	{
	    static bool send_updates = FALSE;

	    if (!send_updates) {
	        if (compare_version(fsa_our_dc_version, "3.0.8") < 0) {
	            /* Optimize for the true case */
	            crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version);
	            return TRUE;
	        }
	        send_updates = TRUE;
	    }

	    throttle_send_command(throttle_mode());
	    return TRUE;
	}

	/*
	 * Value destructor for the throttle_records hash table: releases the
	 * record and the node name it owns.
	 *
	 * \param[in] p  Pointer to a struct throttle_record_s
	 */
	static void
	throttle_record_free(gpointer p)
	{
	    struct throttle_record_s *record = p;

	    free(record->node);
	    free(record);
	}

	/*
	 * Recalculate throttle_job_max.
	 *
	 * The value starts at twice the number of cores and is then overridden,
	 * in this order, by any of the following that parses to a positive
	 * integer: the supplied preference, the LRMD_MAX_CHILDREN environment
	 * variable, and the PCMK_node_action_limit environment variable.
	 *
	 * \param[in] preference  Global preference from the CIB, or NULL
	 */
	void
	throttle_update_job_max(const char *preference)
	{
	    /* Legacy env variable first, then the per-node override */
	    static const char *env_overrides[] = {
	        "LRMD_MAX_CHILDREN",
	        "PCMK_node_action_limit",
	    };
	    size_t lpc = 0;
	    int limit = 0;

	    throttle_job_max = 2 * throttle_num_cores();

	    if (preference != NULL) {
	        /* Global preference from the CIB */
	        limit = crm_int_helper(preference, NULL);
	        if (limit > 0) {
	            throttle_job_max = limit;
	        }
	    }

	    for (lpc = 0; lpc < sizeof(env_overrides) / sizeof(env_overrides[0]); lpc++) {
	        const char *value = getenv(env_overrides[lpc]);

	        if (value == NULL) {
	            continue;
	        }
	        limit = crm_int_helper(value, NULL);
	        if (limit > 0) {
	            throttle_job_max = limit;
	        }
	    }
	}


	/*
	 * Set up throttling: create the record table and the 30-second repeating
	 * timer on first use, reset the job maximum, and start the timer.
	 */
	void
	throttle_init(void)
	{
	    if (throttle_records == NULL) {
	        /* Keys are owned by the records, so only values need a destructor */
	        throttle_records = g_hash_table_new_full(crm_str_hash, g_str_equal,
	                                                 NULL, throttle_record_free);
	        throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE,
	                                            throttle_timer_cb, NULL);
	    }

	    throttle_update_job_max(NULL);
	    mainloop_timer_start(throttle_timer);
	}

	/*
	 * Tear down throttling: delete the timer and destroy the record table.
	 *
	 * Safe to call when throttle_init() was never called, or to call twice;
	 * both globals are reset to NULL.
	 */
	void
	throttle_fini(void)
	{
	    if (throttle_timer != NULL) {
	        mainloop_timer_del(throttle_timer);
	        throttle_timer = NULL;
	    }

	    /* g_hash_table_destroy() rejects a NULL table with a critical warning */
	    if (throttle_records != NULL) {
	        g_hash_table_destroy(throttle_records);
	        throttle_records = NULL;
	    }
	}


	/*
	 * Calculate the cluster-wide job limit.
	 *
	 * \param[in] l  Configured limit; 0 is treated as "no limit set"
	 *
	 * \return l, lowered to at most a quarter of the active peers for each
	 *         node recorded in extreme mode, or half of the active peers for
	 *         each node recorded in high mode (never below 1 once lowered)
	 */
	int
	throttle_get_total_job_limit(int l)
	{
	    /* Cluster-wide limit */
	    GHashTableIter iter;
	    int limit = l;
	    int peers = crm_active_peers();
	    struct throttle_record_s *r = NULL;

	    g_hash_table_iter_init(&iter, throttle_records);

	    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &r)) {
	        switch(r->mode) {

	            case throttle_extreme:
	                if(limit == 0 || limit > peers/4) {
	                    limit = QB_MAX(1, peers/4);
	                }
	                break;

	            case throttle_high:
	                if(limit == 0 || limit > peers/2) {
	                    limit = QB_MAX(1, peers/2);
	                }
	                break;

	            default:
	                break;
	        }
	    }

	    if(limit == l) {
	        /* crm_trace("No change to batch-limit=%d", limit); */

	    } else if(l == 0) {
	        crm_trace("Using batch-limit=%d", limit);

	    } else {
	        crm_trace("Using batch-limit=%d instead of %d", limit, l);
	    }
	    return limit;
	}

	/*
	 * Look up how many jobs may run on a node.
	 *
	 * \param[in] node  Node name
	 *
	 * \return Job limit derived from the node's recorded mode and maximum;
	 *         always at least 1
	 *
	 * A node without a record gets one created with throttle_low and the
	 * local throttle_job_max.
	 */
	int
	throttle_get_job_limit(const char *node)
	{
	    int limit = 1;
	    struct throttle_record_s *record = g_hash_table_lookup(throttle_records, node);

	    if (record == NULL) {
	        record = calloc(1, sizeof(struct throttle_record_s));
	        record->node = strdup(node);
	        record->mode = throttle_low;
	        record->max = throttle_job_max;
	        crm_trace("Defaulting to local values for unknown node %s", node);

	        g_hash_table_insert(throttle_records, record->node, record);
	    }

	    switch (record->mode) {
	        case throttle_extreme:
	        case throttle_high:
	            /* At least one job must always be allowed */
	            limit = 1;
	            break;

	        case throttle_med:
	            limit = QB_MAX(1, record->max / 4);
	            break;

	        case throttle_low:
	            limit = QB_MAX(1, record->max / 2);
	            break;

	        case throttle_none:
	            limit = QB_MAX(1, record->max);
	            break;

	        default:
	            crm_err("Unknown throttle mode %.4x on %s", record->mode, node);
	            break;
	    }
	    return limit;
	}

	/*
	 * Record the throttle mode and job maximum advertised by a node.
	 *
	 * \param[in] xml  Message carrying F_CRM_HOST_FROM, F_CRM_THROTTLE_MODE
	 *                 and F_CRM_THROTTLE_MAX
	 *
	 * A record is created for the sender if none exists yet.  Messages that
	 * do not name a sender are ignored with a warning.
	 */
	void
	throttle_update(xmlNode *xml)
	{
	    int max = 0;
	    int mode = 0; /* parsed as a plain int, then stored as the enum */
	    struct throttle_record_s *r = NULL;
	    const char *from = crm_element_value(xml, F_CRM_HOST_FROM);

	    if(from == NULL) {
	        /* Without a sender there is no key to store the record under */
	        crm_warn("Ignoring throttle update without a sender");
	        return;
	    }

	    crm_element_value_int(xml, F_CRM_THROTTLE_MODE, &mode);
	    crm_element_value_int(xml, F_CRM_THROTTLE_MAX, &max);

	    r = g_hash_table_lookup(throttle_records, from);

	    if(r == NULL) {
	        r = calloc(1, sizeof(struct throttle_record_s));
	        r->node = strdup(from);
	        g_hash_table_insert(throttle_records, r->node, r);
	    }

	    r->max = max;
	    r->mode = mode;

	    crm_debug("Host %s supports a maximum of %d jobs and throttle mode %.4x. New job limit is %d",
	              from, max, mode, throttle_get_job_limit(from));
	}

	diff --git a/lib/common/watchdog.c b/lib/common/watchdog.c
	index a70ba42c40..6395be8fc8 100644
	--- a/lib/common/watchdog.c
	+++ b/lib/common/watchdog.c
	@@ -1,308 +1,247 @@
	/*
	* Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
	* 2014 Andrew Beekhof <andrew@beekhof.net>
	*
	* This program is free software; you can redistribute it and/or
	* modify it under the terms of the GNU General Public
	* License as published by the Free Software Foundation; either
	* version 2.1 of the License, or (at your option) any later version.
	*
	* This software is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* General Public License for more details.
	*
	* You should have received a copy of the GNU General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
	*/

	#include <crm_internal.h>

	#include <sched.h>
	#include <syscall.h>
	#include <sys/ioctl.h>
	#include <sys/reboot.h>

	#include <sys/types.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <ctype.h>
	#include <dirent.h>

	#ifdef _POSIX_MEMLOCK
	# include <sys/mman.h>
	#endif

	static int sbd_pid = 0;

	/* Panic option bits; each value is a distinct bit so they can be combined.
	 * NOTE(review): not referenced in the visible part of this file - confirm
	 * whether these are still used anywhere. */
	enum pcmk_panic_flags
	{
	pcmk_panic_none = 0x00,
	pcmk_panic_delay = 0x01,
	pcmk_panic_kdump = 0x02,
	pcmk_panic_shutdown = 0x04,
	};

	#define SYSRQ "/proc/sys/kernel/sysrq"

	/*
	 * Make sure the sysrq functions needed for a panic are enabled.
	 *
	 * Runs at most once per process.  The current value of
	 * /proc/sys/kernel/sysrq is read; unless it is exactly 1, bits 8 and 128
	 * are OR'd in and the result is written back.  Failures are logged and
	 * otherwise ignored.
	 */
	void
	sysrq_init(void)
	{
	    static bool need_init = true;
	    FILE* procf;
	    int c;

	    if(!need_init) {
	        return;
	    }
	    need_init = false;

	    procf = fopen(SYSRQ, "r");
	    if (!procf) {
	        crm_perror(LOG_ERR, "Cannot open "SYSRQ" for read");
	        return;
	    }
	    if (fscanf(procf, "%d", &c) != 1) {
	        crm_perror(LOG_ERR, "Parsing "SYSRQ" failed");
	        c = 0;
	    }
	    fclose(procf);

	    /* NOTE(review): 1 presumably means "all functions enabled" - confirm */
	    if (c == 1) {
	        return;
	    }

	    /* 8 for debugging dumps of processes, 128 for reboot/poweroff */
	    c |= 136;
	    procf = fopen(SYSRQ, "w");
	    if (!procf) {
	        crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
	        return;
	    }
	    fprintf(procf, "%d", c);
	    fclose(procf);
	    return;
	}

	/*
	 * Write a single command character to /proc/sysrq-trigger.
	 *
	 * \param[in] t  Command character to send
	 *
	 * sysrq_init() is called first.  If the trigger file cannot be opened,
	 * the failure is logged and nothing is sent.
	 */
	static void
	sysrq_trigger(char t)
	{
	    FILE *trigger = NULL;

	    sysrq_init();

	    trigger = fopen("/proc/sysrq-trigger", "a");
	    if (trigger == NULL) {
	        crm_perror(LOG_ERR, "Opening sysrq-trigger failed");
	        return;
	    }

	    crm_info("sysrq-trigger: %c\n", t);
	    fprintf(trigger, "%c\n", t);
	    fclose(trigger);
	}


	-static pid_t
	-pcmk_locate_proc_entry(const char *name)
	-{
	- DIR *dp;
	- struct dirent *entry;
	- struct stat statbuf;
	-
	- dp = opendir("/proc");
	- if (!dp) {
	- /* no proc directory to search through */
	- crm_notice("Can not read /proc directory to track existing components");
	- return 0;
	- }
	-
	- while ((entry = readdir(dp)) != NULL) {
	- char procpath[128];
	- char value[64];
	- char key[16];
	- FILE *file;
	- int pid;
	-
	- strcpy(procpath, "/proc/");
	- /* strlen("/proc/") + strlen("/status") + 1 = 14
	- * 128 - 14 = 114 */
	- strncat(procpath, entry->d_name, 114);
	-
	- if (lstat(procpath, &statbuf)) {
	- continue;
	- }
	- if (!S_ISDIR(statbuf.st_mode) \|\| !isdigit(entry->d_name[0])) {
	- continue;
	- }
	-
	- strcat(procpath, "/status");
	-
	- file = fopen(procpath, "r");
	- if (!file) {
	- continue;
	- }
	- if (fscanf(file, "%15s%63s", key, value) != 2) {
	- fclose(file);
	- continue;
	- }
	- fclose(file);
	-
	- pid = atoi(entry->d_name);
	- if (pid <= 0) {
	- continue;
	- }
	-
	- if (safe_str_eq(name, value) && crm_pid_active(pid) == 1) {
	- crm_notice("Found %s at process %d", value, pid);
	- closedir(dp);
	- return pid;
	- }
	- }
	-
	- closedir(dp);
	- return 0;
	-}
	-
	/*
	 * Panic the local node without help from sbd.
	 *
	 * - Not root, parent still alive: exit with pcmk_err_panic so that the
	 *   parent sees the exit code.
	 * - Not root, no parent: look up pacemakerd via procfs, send it SIGQUIT,
	 *   then exit with pcmk_err_panic.
	 * - Otherwise: trigger sysrq 'b' and call reboot(); if that returns, log
	 *   the error and exit.
	 */
	static void
	pcmk_panic_local(void)
	{
	int rc = pcmk_ok;
	uid_t uid = geteuid();
	pid_t ppid = getppid();

	if(uid != 0 && ppid > 1) {
	/* We're a non-root pacemaker daemon (cib, crmd, pengine,
	* attrd, etc) with the original pacemakerd parent
	*
	* Of these, only crmd is likely to be initiating resets
	*/
	do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
	crm_exit(pcmk_err_panic);
	return;

	} else if (uid != 0) {
	/*
	* No permissions and no pacemakerd parent to escalate to
	* Track down the new pacemakerd process and send a signal instead
	*/
	union sigval signal_value;

	memset(&signal_value, 0, sizeof(signal_value));
	- ppid = pcmk_locate_proc_entry("pacemakerd");
	+ ppid = crm_procfs_pid_of("pacemakerd");
	do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);

	/* Only signal when a real pacemakerd PID was found */
	if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
	crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
	}
	/* The best we can do now is die */
	crm_exit(pcmk_err_panic);
	return;
	}

	/* We're either pacemakerd, or a pacemaker daemon running as root */

	sysrq_trigger('b');
	/* reboot(RB_HALT_SYSTEM); rc = errno; */
	reboot(RB_AUTOBOOT);
	/* Only reached if the reboot did not take effect */
	rc = errno;

	do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);

	if(ppid > 1) {
	/* child daemon */
	exit(pcmk_err_panic);
	} else {
	/* pacemakerd or orphan child */
	exit(DAEMON_RESPAWN_STOP);
	}
	}

	/*
	 * Panic the node by killing sbd.
	 *
	 * SIGKILL is queued to sbd_pid.  If that fails, pcmk_panic_local() is
	 * used instead.  The process then exits with pcmk_err_panic when it
	 * still has a parent other than init, or DAEMON_RESPAWN_STOP otherwise.
	 */
	static void
	pcmk_panic_sbd(void)
	{
	    union sigval signal_value;
	    pid_t ppid = getppid();

	    do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);

	    memset(&signal_value, 0, sizeof(signal_value));
	    /* TODO: Arrange for a slightly less brutal option? */
	    if (sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
	        crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
	        pcmk_panic_local();
	    }

	    /* ppid > 1: child daemon; otherwise pacemakerd or orphan child */
	    exit((ppid > 1) ? pcmk_err_panic : DAEMON_RESPAWN_STOP);
	}

	/*
	 * Panic the local node.
	 *
	 * \param[in] origin  Text identifying the caller, used in log messages
	 *
	 * If the "panic-delay" trace callsite has targets, the process exits
	 * with DAEMON_RESPAWN_STOP instead of panicking.  Otherwise sbd is
	 * signalled when its PID is known, and the node is panicked directly
	 * when it is not.
	 */
	void
	pcmk_panic(const char *origin)
	{
	    static struct qb_log_callsite *panic_cs = NULL;

	    if (panic_cs == NULL) {
	        panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
	    }

	    /* Refresh sbd_pid before deciding how to panic */
	    pcmk_locate_sbd();

	    if (panic_cs && panic_cs->targets) {
	        /* getppid() == 1 means our original parent no longer exists */
	        do_crm_log_always(LOG_EMERG,
	                          "Shutting down instead of panicing the node: origin=%s, sbd=%d, parent=%d",
	                          origin, sbd_pid, getppid());
	        crm_exit(DAEMON_RESPAWN_STOP);
	        return;
	    }

	    if (sbd_pid <= 1) {
	        do_crm_log_always(LOG_EMERG, "Panicing the system directly: %s", origin);
	        pcmk_panic_local();

	    } else {
	        do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
	        pcmk_panic_sbd();
	    }
	}

	/*
	 * Find the PID of the sbd daemon.
	 *
	 * \return Cached or newly found sbd PID, or 0 if none was found
	 *
	 * A PID above 1 that was found earlier is returned directly.  Otherwise
	 * the sbd.pid file under HA_STATE_DIR is consulted; procfs is searched
	 * only when sbd_pid is negative at that point.
	 */
	pid_t
	pcmk_locate_sbd(void)
	{
	char *pidfile = NULL;

	if(sbd_pid > 1) {
	return sbd_pid;
	}

	/* Look for the pid file */
	pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR);

	/* Read the pid file */
	if(pidfile) {
	int rc = crm_pidfile_inuse(pidfile, 1);
	if(rc < pcmk_ok && rc != -ENOENT) {
	sbd_pid = crm_read_pidfile(pidfile);
	/* The format has a %d, so the PID must be passed as an argument */
	crm_trace("SBD detected at pid=%d (file)", sbd_pid);
	}
	}

	if(sbd_pid < 0) {
	/* Fall back to /proc for systems that support it */
	- sbd_pid = pcmk_locate_proc_entry("sbd");
	- crm_trace("SBD detected at pid=%d (proc)");
	+ sbd_pid = crm_procfs_pid_of("sbd");
	+ crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
	}

	/* Normalize "not found" to 0 for callers */
	if(sbd_pid < 0) {
	sbd_pid = 0;
	}

	free(pidfile);
	return sbd_pid;
	}
	diff --git a/mcp/pacemaker.c b/mcp/pacemaker.c
	index 910d154193..e838d4db78 100644
	--- a/mcp/pacemaker.c
	+++ b/mcp/pacemaker.c
	@@ -1,1173 +1,1141 @@
	/*
	* Copyright (C) 2010 Andrew Beekhof <andrew@beekhof.net>
	*
	* This program is free software; you can redistribute it and/or
	* modify it under the terms of the GNU General Public
	* License as published by the Free Software Foundation; either
	* version 2 of the License, or (at your option) any later version.
	*
	* This software is distributed in the hope that it will be useful,
	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	* General Public License for more details.
	*
	* You should have received a copy of the GNU General Public
	* License along with this library; if not, write to the Free Software
	* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
	*/

	#include <crm_internal.h>
	#include <pacemaker.h>

	#include <pwd.h>
	#include <grp.h>
	#include <sys/stat.h>
	#include <sys/types.h>
	#include <sys/time.h>
	#include <sys/resource.h>
	#include <sys/reboot.h>

	#include <crm/msg_xml.h>
	#include <crm/common/ipcs.h>
	#include <crm/common/mainloop.h>
	#include <crm/cluster/internal.h>
	#include <crm/cluster.h>

	#include <dirent.h>
	#include <ctype.h>

	gboolean pcmk_quorate = FALSE;
	gboolean fatal_error = FALSE;
	GMainLoop *mainloop = NULL;

	#define PCMK_PROCESS_CHECK_INTERVAL 5

	const char *local_name = NULL;
	uint32_t local_nodeid = 0;
	crm_trigger_t *shutdown_trigger = NULL;
	const char *pid_file = "/var/run/pacemaker.pid";

	/* Description and runtime state of one child daemon managed by pacemakerd */
	typedef struct pcmk_child_s {
	int pid; /* PID of the running child; 0 when it is not running */
	long flag; /* crm_proc_* bit reported for this child in the process list */
	int start_seq; /* presumably the start order, with 0 meaning "not started" - TODO confirm */
	int respawn_count; /* incremented each time the child exits */
	gboolean respawn; /* whether the child is restarted after it exits */
	const char *name; /* name used in log messages */
	const char *uid; /* user name to look up for the child; NULL skips the lookup */
	const char *command; /* NULL means there is nothing to start or stop */

	gboolean active_before_startup; /* cleared when the child is started or exits */
	} pcmk_child_t;

	/* Index into the array below */
	#define pcmk_child_crmd 4
	#define pcmk_child_mgmtd 8
	/* INDENT-OFF */
	/* Columns: pid, flag, start_seq, respawn_count, respawn, name, uid, command
	 * (active_before_startup is left zero-initialized).
	 * The pcmk_child_crmd and pcmk_child_mgmtd index macros must keep matching
	 * the positions of the "crmd" and "mgmtd" rows. */
	static pcmk_child_t pcmk_children[] = {
	{ 0, crm_proc_none, 0, 0, FALSE, "none", NULL, NULL },
	{ 0, crm_proc_plugin, 0, 0, FALSE, "ais", NULL, NULL },
	{ 0, crm_proc_lrmd, 3, 0, TRUE, "lrmd", NULL, CRM_DAEMON_DIR"/lrmd" },
	{ 0, crm_proc_cib, 1, 0, TRUE, "cib", CRM_DAEMON_USER, CRM_DAEMON_DIR"/cib" },
	{ 0, crm_proc_crmd, 6, 0, TRUE, "crmd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/crmd" },
	{ 0, crm_proc_attrd, 4, 0, TRUE, "attrd", CRM_DAEMON_USER, CRM_DAEMON_DIR"/attrd" },
	{ 0, crm_proc_stonithd, 0, 0, TRUE, "stonithd", NULL, NULL },
	{ 0, crm_proc_pe, 5, 0, TRUE, "pengine", CRM_DAEMON_USER, CRM_DAEMON_DIR"/pengine" },
	{ 0, crm_proc_mgmtd, 0, 0, TRUE, "mgmtd", NULL, HB_DAEMON_DIR"/mgmtd" },
	{ 0, crm_proc_stonith_ng, 2, 0, TRUE, "stonith-ng", NULL, CRM_DAEMON_DIR"/stonithd" },
	};
	/* INDENT-ON */

	static gboolean start_child(pcmk_child_t * child);
	static gboolean check_active_before_startup_processes(gpointer user_data);
	void update_process_clients(crm_client_t *client);
	void update_process_peers(void);

	/*
	 * Choose the user that crmd is started as.
	 *
	 * \param[in] enable  If true, clear crmd's configured user; otherwise
	 *                    set it to CRM_DAEMON_USER
	 */
	void
	enable_crmd_as_root(gboolean enable)
	{
	    pcmk_children[pcmk_child_crmd].uid = enable ? NULL : CRM_DAEMON_USER;
	}

	/*
	 * Enable or disable mgmtd by setting its start sequence.
	 *
	 * \param[in] enable  If true, set mgmtd's start_seq to 7; otherwise 0
	 */
	void
	enable_mgmtd(gboolean enable)
	{
	    pcmk_children[pcmk_child_mgmtd].start_seq = enable ? 7 : 0;
	}

	/*
	 * Build the bitmask of processes currently active on this node.
	 *
	 * \return The cluster layer's process bit OR'd with the flag of every
	 *         child that has a non-zero PID
	 */
	static uint32_t
	get_process_list(void)
	{
	    int lpc = 0;
	    uint32_t procs = crm_get_cluster_proc();

	    for (lpc = 0; lpc < SIZEOF(pcmk_children); lpc++) {
	        if (pcmk_children[lpc].pid != 0) {
	            procs |= pcmk_children[lpc].flag;
	        }
	    }
	    return procs;
	}

	/*
	 * Handle bookkeeping after a child process has exited.
	 *
	 * \param[in,out] child  Child that exited
	 *
	 * The child is marked as not running and the updated process list is
	 * published.  Respawning is disabled once the respawn count exceeds
	 * MAX_RESPAWN.  Then, depending on state: continue an in-progress
	 * shutdown, panic the node (PCMK_fail_fast), or respawn the child.
	 */
	static void
	pcmk_process_exit(pcmk_child_t * child)
	{
	    child->pid = 0;
	    child->active_before_startup = FALSE;

	    /* Broadcast the fact that one of our processes died ASAP
	     *
	     * Try to get some logging of the cause out first though
	     * because we're probably about to get fenced
	     *
	     * Potentially do this only if respawn_count > N
	     * to allow for local recovery
	     */
	    update_node_processes(local_nodeid, NULL, get_process_list());

	    ++child->respawn_count;
	    if (child->respawn_count > MAX_RESPAWN) {
	        crm_err("Child respawn count exceeded by %s", child->name);
	        child->respawn = FALSE;
	    }

	    if (shutdown_trigger) {
	        /* A shutdown is in progress: let it carry on */
	        mainloop_set_trigger(shutdown_trigger);
	        update_node_processes(local_nodeid, NULL, get_process_list());
	        return;
	    }

	    if (!child->respawn) {
	        return;
	    }

	    if (crm_is_true(getenv("PCMK_fail_fast"))) {
	        crm_err("Rebooting system because of %s", child->name);
	        pcmk_panic(__FUNCTION__);

	    } else {
	        crm_notice("Respawning failed child process: %s", child->name);
	        start_child(child);
	    }
	}

	/*
	 * Mainloop callback invoked when a child daemon terminates.
	 *
	 * \param[in] p         Mainloop child entry
	 * \param[in] pid       PID of the child that terminated
	 * \param[in] core      Core-dump indicator, used in log messages
	 * \param[in] signo     Terminating signal, or 0 if the child exited
	 * \param[in] exitcode  Exit code when signo is 0
	 *
	 * Logs why the child went away, reacts to the DAEMON_RESPAWN_STOP and
	 * pcmk_err_panic exit codes, and then runs pcmk_process_exit().
	 */
	static void
	pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)
	{
	    pcmk_child_t *child = mainloop_child_userdata(p);
	    const char *name = mainloop_child_name(p);

	    if (signo) {
	        /* SIGKILL is only a warning; any other signal is an error */
	        if (signo == SIGKILL) {
	            crm_warn("The %s process (%d) terminated with signal %d (core=%d)", name, pid, signo, core);
	        } else {
	            crm_err("The %s process (%d) terminated with signal %d (core=%d)", name, pid, signo, core);
	        }

	    } else {
	        switch (exitcode) {
	            case pcmk_ok:
	                crm_info("The %s process (%d) exited: %s (%d)", name, pid, pcmk_strerror(exitcode), exitcode);
	                break;

	            case DAEMON_RESPAWN_STOP:
	                crm_warn("The %s process (%d) can no longer be respawned, shutting the cluster down.", name, pid);
	                child->respawn = FALSE;
	                fatal_error = TRUE;
	                pcmk_shutdown(SIGTERM);
	                break;

	            case pcmk_err_panic:
	                do_crm_log_always(LOG_EMERG, "The %s process (%d) instructed the machine to reset", name, pid);
	                child->respawn = FALSE;
	                fatal_error = TRUE;
	                pcmk_panic(__FUNCTION__);
	                pcmk_shutdown(SIGTERM);
	                break;

	            default:
	                crm_err("The %s process (%d) exited: %s (%d)", name, pid, pcmk_strerror(exitcode), exitcode);
	                break;
	        }
	    }

	    pcmk_process_exit(child);
	}

	/*
	 * Ask a child daemon to stop by sending it a signal.
	 *
	 * \param[in] child   Child to signal
	 * \param[in] signal  Signal to send; 0 selects SIGTERM
	 *
	 * \return Always TRUE
	 *
	 * Children without a command, or without a positive PID, are skipped.
	 */
	static gboolean
	stop_child(pcmk_child_t * child, int signal)
	{
	    if (child->command == NULL) {
	        crm_debug("Nothing to do for child \"%s\"", child->name);
	        return TRUE;
	    }

	    if (child->pid <= 0) {
	        crm_trace("Client %s not running", child->name);
	        return TRUE;
	    }

	    if (signal == 0) {
	        signal = SIGTERM;
	    }

	    /* Start from a clean errno so crm_perror() reports only kill()'s error */
	    errno = 0;
	    if (kill(child->pid, signal) != 0) {
	        crm_perror(LOG_ERR, "Stopping %s: Could not send -%d to process %d failed",
	                   child->name, signal, child->pid);

	    } else {
	        crm_notice("Stopping %s: Sent -%d to process %d", child->name, signal, child->pid);
	    }

	    return TRUE;
	}

	/* Argument vectors passed to execvp() by start_child(); their slots are
	 * filled in by the forked child just before it execs. */
	static char *opts_default[] = { NULL, NULL };
	static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };

	/*!
	 * \internal
	 * \brief Fork and exec one of the Pacemaker child daemons
	 *
	 * The parent registers the new pid with the mainloop and refreshes the local
	 * process list. The child starts a new session, optionally switches
	 * user/group, closes all inherited file descriptors, points stdin/stdout/
	 * stderr at /dev/null and execs the daemon (optionally under valgrind or
	 * callgrind, selected via PCMK_valgrind_enabled / PCMK_callgrind_enabled).
	 *
	 * \param[in,out] child  Child to start; pid and active_before_startup are
	 *                       updated
	 *
	 * \return TRUE in the parent if there was nothing to start or the fork
	 *         succeeded, FALSE if the configured user could not be looked up.
	 *         The forked child does not return from this function.
	 */
	static gboolean
	start_child(pcmk_child_t * child)
	{
	    uid_t uid = 0;
	    gid_t gid = 0;
	    rlim_t fd = 0;
	    rlim_t max_fd = 0;
	    struct rlimit oflimits;
	    gboolean use_valgrind = FALSE;
	    gboolean use_callgrind = FALSE;
	    const char *devnull = "/dev/null";
	    const char *env_valgrind = getenv("PCMK_valgrind_enabled");
	    const char *env_callgrind = getenv("PCMK_callgrind_enabled");
	    enum cluster_type_e stack = get_cluster_type();

	    child->active_before_startup = FALSE;

	    if (child->command == NULL) {
	        crm_info("Nothing to do for child \"%s\"", child->name);
	        return TRUE;
	    }

	    /* Each variable is either a boolean (applies to every child) or a
	     * string that is searched for this child's name. callgrind implies
	     * valgrind. */
	    if (env_callgrind != NULL && crm_is_true(env_callgrind)) {
	        use_callgrind = TRUE;
	        use_valgrind = TRUE;

	    } else if (env_callgrind != NULL && strstr(env_callgrind, child->name)) {
	        use_callgrind = TRUE;
	        use_valgrind = TRUE;

	    } else if (env_valgrind != NULL && crm_is_true(env_valgrind)) {
	        use_valgrind = TRUE;

	    } else if (env_valgrind != NULL && strstr(env_valgrind, child->name)) {
	        use_valgrind = TRUE;
	    }

	    if (use_valgrind && strlen(VALGRIND_BIN) == 0) {
	        crm_warn("Cannot enable valgrind for %s:"
	                 " The location of the valgrind binary is unknown", child->name);
	        use_valgrind = FALSE;
	    }

	    if (child->uid) {
	        if (crm_user_lookup(child->uid, &uid, &gid) < 0) {
	            crm_err("Invalid user (%s) for %s: not found", child->uid, child->name);
	            return FALSE;
	        }
	        crm_info("Using uid=%u and group=%u for process %s", uid, gid, child->name);
	    }

	    child->pid = fork();
	    CRM_ASSERT(child->pid != -1);

	    if (child->pid > 0) {
	        /* parent */
	        mainloop_child_add(child->pid, 0, child->name, child, pcmk_child_exit);

	        crm_info("Forked child %d for process %s%s", child->pid, child->name,
	                 use_valgrind ? " (valgrind enabled: " VALGRIND_BIN ")" : "");
	        update_node_processes(local_nodeid, NULL, get_process_list());
	        return TRUE;

	    } else {
	        /* Start a new session */
	        (void)setsid();

	        /* Setup the two alternate arg arrays */
	        opts_vgrind[0] = strdup(VALGRIND_BIN);
	        if (use_callgrind) {
	            opts_vgrind[1] = strdup("--tool=callgrind");
	            opts_vgrind[2] = strdup("--callgrind-out-file=" CRM_STATE_DIR "/callgrind.out.%p");
	            opts_vgrind[3] = strdup(child->command);
	            opts_vgrind[4] = NULL;
	        } else {
	            opts_vgrind[1] = strdup(child->command);
	            opts_vgrind[2] = NULL;
	            opts_vgrind[3] = NULL;
	            opts_vgrind[4] = NULL;
	        }
	        opts_default[0] = strdup(child->command);

	        if(gid) {
	            if(stack == pcmk_cluster_corosync) {
	                /* Drop root privileges completely
	                 *
	                 * We can do this because we set uidgid.gid.${gid}=1
	                 * via CMAP which allows these processes to connect to
	                 * corosync
	                 */
	                if (setgid(gid) < 0) {
	                    crm_perror(LOG_ERR, "Could not set group to %d", gid);
	                }

	                /* Keep the root group (so we can access corosync), but add the haclient group (so we can access ipc) */
	            } else if (initgroups(child->uid, gid) < 0) {
	                crm_err("Cannot initialize groups for %s: %s (%d)", child->uid, pcmk_strerror(errno), errno);
	            }
	        }

	        /* NOTE(review): a failed setgid()/setuid() is only logged and the
	         * exec below still happens with the current credentials; confirm
	         * this best-effort behavior is intended. */
	        if (uid && setuid(uid) < 0) {
	            crm_perror(LOG_ERR, "Could not set user to %d (%s)", uid, child->uid);
	        }

	        /* Close all open file descriptors.
	         *
	         * Only trust the soft limit if getrlimit() succeeded and the limit
	         * is finite; otherwise fall back to sysconf(), then to a fixed
	         * bound.
	         */
	        if (getrlimit(RLIMIT_NOFILE, &oflimits) == 0
	            && oflimits.rlim_cur != RLIM_INFINITY) {
	            max_fd = oflimits.rlim_cur;

	        } else {
	            const rlim_t fallback_max_fd = 1024;
	            long open_max = sysconf(_SC_OPEN_MAX);

	            max_fd = (open_max > 0) ? (rlim_t) open_max : fallback_max_fd;
	        }

	        for (fd = 0; fd < max_fd; fd++) {
	            int candidate = (int) fd;

	            /* Descriptors are ints: stop once the counter no longer
	             * survives the round trip through int */
	            if (candidate < 0 || (rlim_t) candidate != fd) {
	                break;
	            }
	            close(candidate);
	        }

	        (void)open(devnull, O_RDONLY); /* Stdin: fd 0 */
	        (void)open(devnull, O_WRONLY); /* Stdout: fd 1 */
	        (void)open(devnull, O_WRONLY); /* Stderr: fd 2 */

	        if (use_valgrind) {
	            (void)execvp(VALGRIND_BIN, opts_vgrind);
	        } else {
	            (void)execvp(child->command, opts_default);
	        }

	        /* Only reached when exec failed */
	        crm_perror(LOG_ERR, "FATAL: Cannot exec %s", child->command);
	        crm_exit(DAEMON_RESPAWN_STOP);
	    }
	    return TRUE; /* never reached */
	}

	/*!
	 * \internal
	 * \brief Timer callback that forcibly terminates a child that is still running
	 *
	 * \param[in] data  The pcmk_child_t being shut down
	 *
	 * \return Always FALSE
	 */
	static gboolean
	escalate_shutdown(gpointer data)
	{
	    pcmk_child_t *straggler = data;

	    if (straggler->pid == 0) {
	        /* Already gone, nothing to escalate */
	        return FALSE;
	    }

	    /* Use SIGSEGV instead of SIGKILL to create a core so we can see what it was up to */
	    crm_err("Child %s not terminating in a timely manner, forcing", straggler->name);
	    stop_child(straggler, SIGSEGV);
	    return FALSE;
	}

	/*
	 * Shutdown trigger callback: stops the children one start_seq "phase" at a
	 * time, highest start_seq first.
	 *
	 * The current phase is kept in a static variable so the function can return
	 * while a child is still running and resume from the same phase on the next
	 * invocation. Once every phase is done it logs completion, optionally
	 * sleeps for the "shutdown_delay" daemon option, quits the mainloop and,
	 * after a fatal error, exits with DAEMON_RESPAWN_STOP.
	 *
	 * user_data is unused. Returns TRUE on every path that returns.
	 */
	static gboolean
	pcmk_shutdown_worker(gpointer user_data)
	{
	static int phase = 0;
	static time_t next_log = 0;
	static int max = SIZEOF(pcmk_children);

	int lpc = 0;

	/* First invocation: start from the highest possible phase */
	if (phase == 0) {
	crm_notice("Shuting down Pacemaker");
	phase = max;

	/* Add a second, more frequent, check to speed up shutdown */
	g_timeout_add_seconds(5, check_active_before_startup_processes, NULL);
	}

	for (; phase > 0; phase--) {
	/* dont stop anything with start_seq < 1 */

	for (lpc = max - 1; lpc >= 0; lpc--) {
	pcmk_child_t *child = &(pcmk_children[lpc]);

	if (phase != child->start_seq) {
	continue;
	}

	if (child->pid) {
	time_t now = time(NULL);

	/* respawn still set means this child has not been asked to stop yet */
	if (child->respawn) {
	next_log = now + 30;
	child->respawn = FALSE;
	stop_child(child, SIGTERM);
	/* Only children in a phase below the crmd's get the forced-stop timer */
	if (phase < pcmk_children[pcmk_child_crmd].start_seq) {
	g_timeout_add(180000 /* 3m */ , escalate_shutdown, child);
	}

	} else if (now >= next_log) {
	next_log = now + 30;
	crm_notice("Still waiting for %s (pid=%d, seq=%d) to terminate...",
	child->name, child->pid, child->start_seq);
	}
	/* Child still running: stop here, keeping phase for the next invocation */
	return TRUE;
	}

	/* cleanup */
	crm_debug("%s confirmed stopped", child->name);
	child->pid = 0;
	}
	}

	/* send_cluster_id(); */
	crm_notice("Shutdown complete");

	{
	const char *delay = daemon_option("shutdown_delay");
	if(delay) {
	sync();
	/* NOTE(review): assumes crm_get_msec() yields a non-negative value here; confirm */
	sleep(crm_get_msec(delay) / 1000);
	}
	}

	g_main_loop_quit(mainloop);

	if (fatal_error) {
	crm_notice("Attempting to inhibit respawning after fatal error");
	crm_exit(DAEMON_RESPAWN_STOP);
	}

	return TRUE;
	}

	/*!
	 * \internal
	 * \brief Signal handler that only logs the signal it received
	 *
	 * \param[in] nsig  Signal number
	 */
	static void
	pcmk_ignore(int nsig)
	{
	    crm_info("Ignoring signal %s (%d)",
	             strsignal(nsig), nsig);
	}

	/*!
	 * \internal
	 * \brief Signal handler that calls pcmk_panic() with this function's name
	 *
	 * \param[in] nsig  Signal number (unused)
	 */
	static void
	pcmk_sigquit(int nsig)
	{
	    (void) nsig;
	    pcmk_panic(__FUNCTION__);
	}

	/*!
	 * \brief Request a shutdown by firing the shutdown trigger
	 *
	 * The trigger (which runs pcmk_shutdown_worker()) is created on first use.
	 *
	 * \param[in] nsig  Signal number (unused)
	 */
	void
	pcmk_shutdown(int nsig)
	{
	    if (!shutdown_trigger) {
	        shutdown_trigger = mainloop_add_trigger(G_PRIORITY_HIGH,
	                                                pcmk_shutdown_worker, NULL);
	    }

	    mainloop_set_trigger(shutdown_trigger);
	}

	/*!
	 * \internal
	 * \brief IPC callback: register a client record for a new connection
	 *
	 * \param[in] c    New connection
	 * \param[in] uid  User ID passed through to crm_client_new()
	 * \param[in] gid  Group ID passed through to crm_client_new()
	 *
	 * \return 0 on success, -EIO if no client record could be created
	 */
	static int32_t
	pcmk_ipc_accept(qb_ipcs_connection_t * c, uid_t uid, gid_t gid)
	{
	    crm_trace("Connection %p", c);

	    return (crm_client_new(c, uid, gid) == NULL) ? -EIO : 0;
	}

	/*!
	 * \internal
	 * \brief IPC callback for a created connection; only traces it
	 *
	 * \param[in] c  Connection that was created
	 */
	static void
	pcmk_ipc_created(qb_ipcs_connection_t * c)
	{
	    crm_trace("Connection %p",
	              c);
	}

	/*!
	 * \internal
	 * \brief IPC callback: handle one request from a client
	 *
	 * Every request is acknowledged first. CRM_OP_QUIT starts a shutdown,
	 * CRM_OP_RM_NODE_CACHE is relayed to all peers via CPG, and any other task
	 * is answered with the current process list.
	 *
	 * \param[in] qbc   Connection the request arrived on
	 * \param[in] data  Raw request data
	 * \param[in] size  Size of data in bytes
	 *
	 * \return Always 0
	 */
	static int32_t
	pcmk_ipc_dispatch(qb_ipcs_connection_t * qbc, void *data, size_t size)
	{
	    uint32_t id = 0;
	    uint32_t flags = 0;
	    const char *task = NULL;
	    crm_client_t *c = crm_client_get(qbc);
	    xmlNode *msg = crm_ipcs_recv(c, data, size, &id, &flags);

	    crm_ipcs_send_ack(c, id, flags, "ack", __FUNCTION__, __LINE__);
	    if (msg == NULL) {
	        return 0;
	    }

	    task = crm_element_value(msg, F_CRM_TASK);
	    if (crm_str_eq(task, CRM_OP_QUIT, TRUE)) {
	        /* Time to quit */
	        crm_notice("Shutting down in response to ticket %s (%s)",
	                   crm_element_value(msg, F_CRM_REFERENCE), crm_element_value(msg, F_CRM_ORIGIN));
	        pcmk_shutdown(SIGTERM);

	    } else if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
	        /* Send to everyone */
	        struct iovec *iov = NULL;
	        int node_id = 0;    /* distinct name: must not shadow the request id above */
	        const char *name = NULL;

	        crm_element_value_int(msg, XML_ATTR_ID, &node_id);
	        name = crm_element_value(msg, XML_ATTR_UNAME);
	        crm_notice("Instructing peers to remove references to node %s/%u",
	                   name, (unsigned int) node_id);

	        iov = calloc(1, sizeof(struct iovec));
	        if (iov != NULL) {
	            iov->iov_base = dump_xml_unformatted(msg);
	        }

	        if (iov == NULL || iov->iov_base == NULL) {
	            crm_err("Could not build the peer message to remove node %s/%u",
	                    name, (unsigned int) node_id);
	            free(iov);

	        } else {
	            iov->iov_len = 1 + strlen(iov->iov_base);
	            /* NOTE(review): presumably send_cpg_iov() takes ownership of iov; confirm */
	            send_cpg_iov(iov);
	        }

	    } else {
	        update_process_clients(c);
	    }

	    free_xml(msg);
	    return 0;
	}

	/*!
	 * \internal
	 * \brief IPC callback: destroy the client record of a closed connection
	 *
	 * \param[in] c  Connection that was closed
	 *
	 * \return Always 0
	 */
	static int32_t
	pcmk_ipc_closed(qb_ipcs_connection_t * c)
	{
	    crm_client_t *client = crm_client_get(c);

	    if (client != NULL) {
	        crm_trace("Connection %p", c);
	        crm_client_destroy(client);
	    }
	    return 0;
	}

	/*!
	 * \internal
	 * \brief IPC callback for a destroyed connection; reuses the close handling
	 *
	 * \param[in] c  Connection being destroyed
	 */
	static void
	pcmk_ipc_destroy(qb_ipcs_connection_t * c)
	{
	    crm_trace("Connection %p", c);

	    /* The return value of the close handler is not needed here */
	    (void) pcmk_ipc_closed(c);
	}

	/* IPC server callback table; main() passes it to mainloop_add_ipc_server() */
	struct qb_ipcs_service_handlers mcp_ipc_callbacks = {
	.connection_accept = pcmk_ipc_accept,
	.connection_created = pcmk_ipc_created,
	.msg_process = pcmk_ipc_dispatch,
	.connection_closed = pcmk_ipc_closed,
	.connection_destroyed = pcmk_ipc_destroy
	};

	/*!
	 * \internal
	 * \brief Send an XML message with process list of all known peers to client(s)
	 *
	 * The message is a "nodes" element with one "node" child (id, uname, state,
	 * processes) per peer cache entry; on corosync clusters it also carries a
	 * "quorate" attribute.
	 *
	 * \param[in] client  Send message to this client, or all clients if NULL
	 */
	void
	update_process_clients(crm_client_t *client)
	{
	    GHashTableIter iter;
	    crm_node_t *peer = NULL;
	    xmlNode *update = create_xml_node(NULL, "nodes");

	    if (is_corosync_cluster()) {
	        crm_xml_add_int(update, "quorate", pcmk_quorate);
	    }

	    /* One <node> entry per known peer */
	    g_hash_table_iter_init(&iter, crm_peer_cache);
	    while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & peer)) {
	        xmlNode *peer_xml = create_xml_node(update, "node");

	        crm_xml_add_int(peer_xml, "id", peer->id);
	        crm_xml_add(peer_xml, "uname", peer->uname);
	        crm_xml_add(peer_xml, "state", peer->state);
	        crm_xml_add_int(peer_xml, "processes", peer->processes);
	    }

	    if (client != NULL) {
	        crm_trace("Sending process list to client %s", client->id);
	        crm_ipcs_send(client, 0, update, crm_ipc_server_event);

	    } else {
	        crm_client_t *recipient = NULL;

	        crm_trace("Sending process list to %d clients", crm_hash_table_size(client_connections));
	        g_hash_table_iter_init(&iter, client_connections);
	        while (g_hash_table_iter_next(&iter, NULL, (gpointer *) & recipient)) {
	            crm_ipcs_send(recipient, 0, update, crm_ipc_server_event);
	        }
	    }

	    free_xml(update);
	}

	/*!
	 * \internal
	 * \brief Send a CPG message with local node's process list to all peers
	 *
	 * The message is a single <node> element carrying the local process list
	 * and, when the local node name is known, its uname. Nothing is sent if the
	 * message could not be formatted completely or memory could not be
	 * allocated.
	 */
	void
	update_process_peers(void)
	{
	    char buffer[1024];
	    struct iovec *iov = NULL;
	    const size_t limit = SIZEOF(buffer) - 1;
	    int rc = 0;

	    memset(buffer, 0, SIZEOF(buffer));

	    if (local_name) {
	        rc = snprintf(buffer, limit, "<node uname=\"%s\" proclist=\"%u\"/>",
	                      local_name, get_process_list());
	    } else {
	        rc = snprintf(buffer, limit, "<node proclist=\"%u\"/>", get_process_list());
	    }

	    /* snprintf() returns the length it wanted to write (or a negative value
	     * on error), which is not necessarily what ended up in the buffer */
	    if (rc < 0 || (size_t) rc >= limit) {
	        crm_err("Could not format the local process list for peers");
	        return;
	    }

	    crm_trace("Sending %s", buffer);
	    iov = calloc(1, sizeof(struct iovec));
	    if (iov != NULL) {
	        iov->iov_base = strdup(buffer);
	    }
	    if (iov == NULL || iov->iov_base == NULL) {
	        crm_err("Could not allocate the process list message for peers");
	        free(iov);
	        return;
	    }

	    /* Length of the copy actually made, including the terminating NUL */
	    iov->iov_len = strlen(iov->iov_base) + 1;
	    /* NOTE(review): presumably send_cpg_iov() takes ownership of iov; confirm */
	    send_cpg_iov(iov);
	}

	/*!
	 * \internal
	 * \brief Update a node's process list, notifying clients and peers if needed
	 *
	 * A process list of 0 is ignored. Clients and peers are only notified from
	 * here when the local node's list changed.
	 *
	 * \param[in] id     Node ID of affected node
	 * \param[in] uname  Uname of affected node
	 * \param[in] procs  Affected node's process list mask
	 *
	 * \return TRUE if the process list changed, FALSE otherwise
	 */
	gboolean
	update_node_processes(uint32_t id, const char *uname, uint32_t procs)
	{
	    /* Looked up before anything else, exactly as callers have come to expect */
	    crm_node_t *node = crm_get_peer(id, uname);

	    if (procs == 0) {
	        return FALSE;
	    }

	    if (procs == node->processes) {
	        crm_trace("Node %s still has process list: %.32x", node->uname, procs);
	        return FALSE;
	    }

	    crm_debug("Node %s now has process list: %.32x (was %.32x)",
	              node->uname, procs, node->processes);
	    node->processes = procs;

	    /* If local node's processes have changed, notify clients/peers */
	    if (id == local_nodeid) {
	        update_process_clients(NULL);
	        update_process_peers();
	    }

	    return TRUE;
	}


	/* INDENT-OFF */
	/* Command-line option table, registered through crm_set_options() in main().
	 * The fourth field of each entry is the flag value that main()'s option
	 * loop switches on. */
	static struct crm_option long_options[] = {
	/* Top-level Options */
	{"help", 0, 0, '?', "\tThis text"},
	{"version", 0, 0, '$', "\tVersion information" },
	{"verbose", 0, 0, 'V', "\tIncrease debug output"},
	{"shutdown", 0, 0, 'S', "\tInstruct Pacemaker to shutdown on this machine"},
	{"features", 0, 0, 'F', "\tDisplay the full version and list of features Pacemaker was built with"},

	{"-spacer-", 1, 0, '-', "\nAdditional Options:"},
	{"foreground", 0, 0, 'f', "\t(Ignored) Pacemaker always runs in the foreground"},
	/* main() still stores the argument of 'p' in pid_file */
	{"pid-file", 1, 0, 'p', "\t(Ignored) Daemon pid file location"},

	/* End-of-table marker */
	{NULL, 0, 0, 0}
	};
	/* INDENT-ON */

	/*!
	 * \internal
	 * \brief Change the ownership of a path, logging a warning on failure
	 *
	 * \param[in] path  Path whose ownership should change
	 * \param[in] uid   New owner
	 * \param[in] gid   New group
	 */
	static void
	mcp_chown(const char *path, uid_t uid, gid_t gid)
	{
	    if (chown(path, uid, gid) < 0) {
	        /* Failure is not fatal; it is only reported */
	        crm_warn("Cannot change the ownership of %s to user %s and gid %d: %s",
	                 path, CRM_DAEMON_USER, gid, pcmk_strerror(errno));
	    }
	}

	/*!
	 * \internal
	 * \brief Timer callback that polls the processes found running at startup
	 *
	 * Children flagged active_before_startup are visited in start_seq order.
	 * Those whose pid is no longer active are passed to pcmk_process_exit().
	 *
	 * \param[in] user_data  Unused
	 *
	 * \return TRUE while at least one such process is still active, otherwise
	 *         FALSE
	 */
	static gboolean
	check_active_before_startup_processes(gpointer user_data)
	{
	    static int max = SIZEOF(pcmk_children);
	    gboolean keep_tracking = FALSE;
	    int seq = 1;
	    int idx = 0;

	    for (seq = 1; seq < max; seq++) {
	        for (idx = 0; idx < max; idx++) {
	            pcmk_child_t *child = &(pcmk_children[idx]);

	            /* Skip children we track as real child processes, and those
	             * belonging to a different start sequence */
	            if (child->active_before_startup == FALSE || child->start_seq != seq) {
	                continue;
	            }

	            if (crm_pid_active(child->pid) == 1) {
	                /* at least one of the processes found at startup
	                 * is still going, so keep this recurring timer around */
	                keep_tracking = TRUE;

	            } else {
	                crm_notice("Process %s terminated (pid=%d)",
	                           child->name, child->pid);
	                pcmk_process_exit(child);
	            }
	        }
	    }

	    return keep_tracking;
	}

	/*
	 * Scan /proc for Pacemaker daemons that are already running and record
	 * their pids in pcmk_children[], flagging them active_before_startup.
	 * If any were found, a recurring check_active_before_startup_processes()
	 * timer is added. Returns start_tracker, which is non-zero when at least
	 * one existing process was found, and FALSE when /proc cannot be opened.
	 *
	 * NOTE(review): this function is shown as a diff hunk. Lines marked "-"
	 * are the old inline parsing of /proc/<pid>/status and lines marked "+"
	 * are the replacement that calls crm_procfs_process_info().
	 */
	static bool
	find_and_track_existing_processes(void)
	{
	DIR *dp;
	struct dirent *entry;
	- struct stat statbuf;
	int start_tracker = 0;
	+ char entry_name[64];

	dp = opendir("/proc");
	if (!dp) {
	/* no proc directory to search through */
	crm_notice("Can not read /proc directory to track existing components");
	return FALSE;
	}

	while ((entry = readdir(dp)) != NULL) {
	- char procpath[128];
	- char value[64];
	- char key[16];
	- FILE *file;
	int pid;
	int max = SIZEOF(pcmk_children);
	int i;

	- strcpy(procpath, "/proc/");
	- /* strlen("/proc/") + strlen("/status") + 1 = 14
	- * 128 - 14 = 114 */
	- strncat(procpath, entry->d_name, 114);
	-
	- if (lstat(procpath, &statbuf)) {
	- continue;
	- }
	- if (!S_ISDIR(statbuf.st_mode) \|\| !isdigit(entry->d_name[0])) {
	- continue;
	- }
	-
	- strcat(procpath, "/status");
	-
	- file = fopen(procpath, "r");
	- if (!file) {
	+ if (crm_procfs_process_info(entry, entry_name, &pid) < 0) {
	continue;
	}
	- if (fscanf(file, "%15s%63s", key, value) != 2) {
	- fclose(file);
	- continue;
	- }
	- fclose(file);
	-
	- pid = atoi(entry->d_name);
	- if (pid <= 0) {
	- continue;
	- }
	-
	for (i = 0; i < max; i++) {
	const char *name = pcmk_children[i].name;

	if (pcmk_children[i].start_seq == 0) {
	continue;
	}
	if (pcmk_children[i].flag == crm_proc_stonith_ng) {
	name = "stonithd";
	}
	- if (safe_str_eq(name, value)) {
	- if (crm_pid_active(pid) != 1) {
	- continue;
	- }
	- crm_notice("Tracking existing %s process (pid=%d)", value, pid);
	+ if (safe_str_eq(entry_name, name) && (crm_pid_active(pid) == 1)) {
	+ crm_notice("Tracking existing %s process (pid=%d)", name, pid);
	pcmk_children[i].pid = pid;
	pcmk_children[i].active_before_startup = TRUE;
	start_tracker = 1;
	+ break;
	}
	}
	}

	if (start_tracker) {
	g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, check_active_before_startup_processes,
	NULL);
	}
	closedir(dp);

	return start_tracker;
	}

	static void
	init_children_processes(void)
	{
	int start_seq = 1, lpc = 0;
	static int max = SIZEOF(pcmk_children);

	/* start any children that have not been detected */
	for (start_seq = 1; start_seq < max; start_seq++) {
	/* dont start anything with start_seq < 1 */
	for (lpc = 0; lpc < max; lpc++) {
	if (pcmk_children[lpc].pid) {
	/* we are already tracking it */
	continue;
	}

	if (start_seq == pcmk_children[lpc].start_seq) {
	start_child(&(pcmk_children[lpc]));
	}
	}
	}

	/* From this point on, any daemons being started will be due to
	* respawning rather than node start.
	*
	* This may be useful for the daemons to know
	*/
	setenv("PCMK_respawned", "true", 1);
	}

	/*!
	 * \internal
	 * \brief Cluster "destroy" callback: log the lost connection and exit
	 *
	 * \param[in] user_data  Unused
	 */
	static void
	mcp_cpg_destroy(gpointer user_data)
	{
	    (void) user_data;

	    crm_err("Connection destroyed");
	    crm_exit(ENOTCONN);
	}

	/*!
	 * \internal
	 * \brief Process a CPG message (process list or manual peer cache removal)
	 *
	 * A message without a task attribute is treated as a peer's process list.
	 * One with task CRM_OP_RM_NODE_CACHE removes the named node from the peer
	 * cache. Any other task is ignored.
	 *
	 * \param[in] handle     CPG connection (ignored)
	 * \param[in] groupName  CPG group name (ignored)
	 * \param[in] nodeid     ID of affected node
	 * \param[in] pid        Process ID (ignored)
	 * \param[in] msg        CPG XML message
	 * \param[in] msg_len    Length of msg in bytes (ignored)
	 */
	static void
	mcp_cpg_deliver(cpg_handle_t handle,
	                const struct cpg_name *groupName,
	                uint32_t nodeid, uint32_t pid, void *msg, size_t msg_len)
	{
	    xmlNode *xml = string2xml(msg);
	    const char *task = crm_element_value(xml, F_CRM_TASK);

	    crm_trace("Received CPG message (%s): %.200s",
	              (task? task : "process list"), msg);

	    if (task != NULL) {
	        if (crm_str_eq(task, CRM_OP_RM_NODE_CACHE, TRUE)) {
	            int id = 0;
	            const char *name = NULL;

	            crm_element_value_int(xml, XML_ATTR_ID, &id);
	            name = crm_element_value(xml, XML_ATTR_UNAME);
	            reap_crm_member(id, name);
	        }

	    } else if (nodeid == local_nodeid) {
	        crm_info("Ignoring process list sent by peer for local node");

	    } else {
	        uint32_t procs = 0;
	        const char *uname = crm_element_value(xml, "uname");

	        crm_element_value_int(xml, "proclist", (int *)&procs);
	        if (update_node_processes(nodeid, uname, procs)) {
	            /* A remote node's list changed: tell our clients */
	            update_process_clients(NULL);
	        }
	    }

	    if (xml != NULL) {
	        free_xml(xml);
	    }
	}

	/*!
	 * \internal
	 * \brief CPG membership callback
	 *
	 * Forwards the change to pcmk_cpg_membership() and then broadcasts the local
	 * process list to all peers.
	 *
	 * All parameters are passed through unchanged to pcmk_cpg_membership().
	 */
	static void
	mcp_cpg_membership(cpg_handle_t handle,
	                   const struct cpg_name *groupName,
	                   const struct cpg_address *member_list, size_t member_list_entries,
	                   const struct cpg_address *left_list, size_t left_list_entries,
	                   const struct cpg_address *joined_list, size_t joined_list_entries)
	{
	    /* Update peer cache if needed */
	    pcmk_cpg_membership(handle, groupName,
	                        member_list, member_list_entries,
	                        left_list, left_list_entries,
	                        joined_list, joined_list_entries);

	    /* Always broadcast our own presence after any membership change */
	    update_process_peers();
	}

	/*!
	 * \internal
	 * \brief Quorum callback: remember the latest quorum state
	 *
	 * \param[in] seq      Sequence number (unused)
	 * \param[in] quorate  New quorum state, stored in pcmk_quorate
	 *
	 * \return Always TRUE
	 */
	static gboolean
	mcp_quorum_callback(unsigned long long seq, gboolean quorate)
	{
	    (void) seq;

	    pcmk_quorate = quorate;
	    return TRUE;
	}

	/*!
	 * \internal
	 * \brief Quorum "destroy" callback; only logs the event
	 *
	 * \param[in] user_data  Unused
	 */
	static void
	mcp_quorum_destroy(gpointer user_data)
	{
	    (void) user_data;

	    crm_info("connection lost");
	}

	#if SUPPORT_CMAN
	/*!
	 * \internal
	 * \brief CMAN dispatch callback: remember the latest quorum state
	 *
	 * \param[in] seq      Sequence number (unused)
	 * \param[in] quorate  New quorum state, stored in pcmk_quorate
	 *
	 * \return Always TRUE
	 */
	static gboolean
	mcp_cman_dispatch(unsigned long long seq, gboolean quorate)
	{
	    (void) seq;

	    pcmk_quorate = quorate;
	    return TRUE;
	}

	/*!
	 * \internal
	 * \brief CMAN "destroy" callback; only logs the event
	 *
	 * \param[in] user_data  Unused
	 */
	static void
	mcp_cman_destroy(gpointer user_data)
	{
	    (void) user_data;

	    crm_info("connection closed");
	}
	#endif

	/*
	 * Entry point of the Pacemaker master control process.
	 *
	 * In order: parses the command line, checks over IPC for an already running
	 * instance (asking it to quit when --shutdown was given), reads the cluster
	 * configuration, raises the core file limit, prepares the state directories,
	 * starts the IPC server, connects to the cluster services, starts the child
	 * daemons and runs the mainloop. After the mainloop returns, it tears down
	 * the IPC server and cluster connections and exits through crm_exit(rc).
	 */
	int
	main(int argc, char **argv)
	{
	int rc;
	int flag;
	int argerr = 0;

	int option_index = 0;
	gboolean shutdown = FALSE;

	uid_t pcmk_uid = 0;
	gid_t pcmk_gid = 0;
	struct rlimit cores;
	crm_ipc_t *old_instance = NULL;
	qb_ipcs_service_t *ipcs = NULL;
	/* Captured here so it can be restored after crm_log_init() below */
	const char *facility = daemon_option("logfacility");
	static crm_cluster_t cluster;

	crm_log_preinit(NULL, argc, argv);
	crm_set_options(NULL, "mode [options]", long_options, "Start/Stop Pacemaker\n");
	mainloop_add_signal(SIGHUP, pcmk_ignore);
	mainloop_add_signal(SIGQUIT, pcmk_sigquit);

	/* Command-line parsing; the flag values come from long_options[] */
	while (1) {
	flag = crm_get_option(argc, argv, &option_index);
	if (flag == -1)
	break;

	switch (flag) {
	case 'V':
	crm_bump_log_level(argc, argv);
	break;
	case 'f':
	/* Legacy */
	break;
	case 'p':
	pid_file = optarg;
	break;
	case '$':
	case '?':
	crm_help(flag, EX_OK);
	break;
	case 'S':
	shutdown = TRUE;
	break;
	case 'F':
	printf("Pacemaker %s (Build: %s)\n Supporting v%s: %s\n", PACEMAKER_VERSION, BUILD_VERSION,
	CRM_FEATURE_SET, CRM_FEATURES);
	/* NOTE(review): no break follows; presumably crm_exit() does not return - confirm */
	crm_exit(pcmk_ok);
	default:
	printf("Argument code 0%o (%c) is not (?yet?) supported\n", flag, flag);
	++argerr;
	break;
	}
	}

	if (optind < argc) {
	printf("non-option ARGV-elements: ");
	while (optind < argc)
	printf("%s ", argv[optind++]);
	printf("\n");
	}
	if (argerr) {
	crm_help('?', EX_USAGE);
	}


	setenv("LC_ALL", "C", 1);
	setenv("HA_LOGD", "no", 1);

	set_daemon_option("mcp", "true");
	set_daemon_option("use_logd", "off");

	crm_log_init(NULL, LOG_INFO, TRUE, FALSE, argc, argv, FALSE);

	/* Restore the original facility so that mcp_read_config() does the right thing */
	set_daemon_option("logfacility", facility);

	/* An already running instance is detected by connecting to its IPC endpoint */
	crm_debug("Checking for old instances of %s", CRM_SYSTEM_MCP);
	old_instance = crm_ipc_new(CRM_SYSTEM_MCP, 0);
	crm_ipc_connect(old_instance);

	if (shutdown) {
	/* --shutdown: resend CRM_OP_QUIT every 2 seconds until the connection drops */
	crm_debug("Terminating previous instance");
	while (crm_ipc_connected(old_instance)) {
	xmlNode *cmd =
	create_request(CRM_OP_QUIT, NULL, NULL, CRM_SYSTEM_MCP, CRM_SYSTEM_MCP, NULL);

	crm_debug(".");
	crm_ipc_send(old_instance, cmd, 0, 0, NULL);
	free_xml(cmd);

	sleep(2);
	}
	crm_ipc_close(old_instance);
	crm_ipc_destroy(old_instance);
	crm_exit(pcmk_ok);

	} else if (crm_ipc_connected(old_instance)) {
	crm_ipc_close(old_instance);
	crm_ipc_destroy(old_instance);
	crm_err("Pacemaker is already active, aborting startup");
	crm_exit(DAEMON_RESPAWN_STOP);
	}

	crm_ipc_close(old_instance);
	crm_ipc_destroy(old_instance);

	if (mcp_read_config() == FALSE) {
	crm_notice("Could not obtain corosync config data, exiting");
	crm_exit(ENODATA);
	}

	crm_notice("Starting Pacemaker %s (Build: %s): %s", PACEMAKER_VERSION, BUILD_VERSION, CRM_FEATURES);
	mainloop = g_main_new(FALSE);
	sysrq_init();

	/* Raise the soft core file size limit to the hard limit */
	rc = getrlimit(RLIMIT_CORE, &cores);
	if (rc < 0) {
	crm_perror(LOG_ERR, "Cannot determine current maximum core size.");
	} else {
	/* With a hard limit of 0 and an effective uid of 0, lift the hard limit too */
	if (cores.rlim_max == 0 && geteuid() == 0) {
	cores.rlim_max = RLIM_INFINITY;
	} else {
	crm_info("Maximum core file size is: %lu", (unsigned long)cores.rlim_max);
	}
	cores.rlim_cur = cores.rlim_max;

	rc = setrlimit(RLIMIT_CORE, &cores);
	if (rc < 0) {
	crm_perror(LOG_ERR,
	"Core file generation will remain disabled."
	" Core files are an important diagnositic tool,"
	" please consider enabling them by default.");
	}
	#if 0
	/* system() is not thread-safe, can't call from here
	* Actually, its a pretty hacky way to try and achieve this anyway
	*/
	if (system("echo 1 > /proc/sys/kernel/core_uses_pid") != 0) {
	crm_perror(LOG_ERR, "Could not enable /proc/sys/kernel/core_uses_pid");
	}
	#endif
	}
	/* rc was reused for the rlimit calls above; reset it before it tracks startup status */
	rc = pcmk_ok;

	if (crm_user_lookup(CRM_DAEMON_USER, &pcmk_uid, &pcmk_gid) < 0) {
	crm_err("Cluster user %s does not exist, aborting Pacemaker startup", CRM_DAEMON_USER);
	crm_exit(ENOKEY);
	}

	/* The return value of mkdir() is not checked */
	mkdir(CRM_STATE_DIR, 0750);
	mcp_chown(CRM_STATE_DIR, pcmk_uid, pcmk_gid);

	/* Used to store core files in */
	crm_build_path(CRM_CORE_DIR, 0775);
	mcp_chown(CRM_CORE_DIR, pcmk_uid, pcmk_gid);

	/* Used to store blackbox dumps in */
	crm_build_path(CRM_BLACKBOX_DIR, 0755);
	mcp_chown(CRM_BLACKBOX_DIR, pcmk_uid, pcmk_gid);

	/* Used to store policy engine inputs in */
	crm_build_path(PE_STATE_DIR, 0755);
	mcp_chown(PE_STATE_DIR, pcmk_uid, pcmk_gid);

	/* Used to store the cluster configuration */
	crm_build_path(CRM_CONFIG_DIR, 0755);
	mcp_chown(CRM_CONFIG_DIR, pcmk_uid, pcmk_gid);

	/* Resource agent paths are constructed by the lrmd */

	ipcs = mainloop_add_ipc_server(CRM_SYSTEM_MCP, QB_IPC_NATIVE, &mcp_ipc_callbacks);
	if (ipcs == NULL) {
	crm_err("Couldn't start IPC server");
	crm_exit(EIO);
	}

	/* Allows us to block shutdown */
	if (cluster_connect_cfg(&local_nodeid) == FALSE) {
	crm_err("Couldn't connect to Corosync's CFG service");
	crm_exit(ENOPROTOOPT);
	}

	/* Export whether pcmk_locate_sbd() reported a positive result */
	if(pcmk_locate_sbd() > 0) {
	setenv("PCMK_watchdog", "true", 1);
	} else {
	setenv("PCMK_watchdog", "false", 1);
	}

	/* The return value is not used here */
	find_and_track_existing_processes();

	cluster.destroy = mcp_cpg_destroy;
	cluster.cpg.cpg_deliver_fn = mcp_cpg_deliver;
	cluster.cpg.cpg_confchg_fn = mcp_cpg_membership;

	crm_set_autoreap(FALSE);

	/* From here on a failure is recorded in rc (as a negative errno value) rather than exiting immediately */
	if(cluster_connect_cpg(&cluster) == FALSE) {
	crm_err("Couldn't connect to Corosync's CPG service");
	rc = -ENOPROTOOPT;
	}

	if (rc == pcmk_ok && is_corosync_cluster()) {
	/* Keep the membership list up-to-date for crm_node to query */
	if(cluster_connect_quorum(mcp_quorum_callback, mcp_quorum_destroy) == FALSE) {
	rc = -ENOTCONN;
	}
	}

	#if SUPPORT_CMAN
	if (rc == pcmk_ok && is_cman_cluster()) {
	init_cman_connection(mcp_cman_dispatch, mcp_cman_destroy);
	}
	#endif

	if(rc == pcmk_ok) {
	local_name = get_local_node_name();
	update_node_processes(local_nodeid, local_name, get_process_list());

	mainloop_add_signal(SIGTERM, pcmk_shutdown);
	mainloop_add_signal(SIGINT, pcmk_shutdown);

	init_children_processes();

	crm_info("Starting mainloop");

	g_main_run(mainloop);
	}

	/* Teardown: reached after the mainloop returns, or directly when a cluster connection failed */
	if (ipcs) {
	crm_trace("Closing IPC server");
	mainloop_del_ipc_server(ipcs);
	ipcs = NULL;
	}

	g_main_destroy(mainloop);

	cluster_disconnect_cpg(&cluster);
	cluster_disconnect_cfg();

	crm_info("Exiting %s", crm_system_name);

	return crm_exit(rc);
	}

File Metadata

Mime Type: text/x-diff
Expires: Sat, Jan 25, 12:17 PM (19 h, 57 m)
Storage Engine: blob
Storage Format: Raw Data
Storage Handle: 1322529
Default Alt Text: (65 KB)

No OneTemporaryActions

View Options

File Metadata

Event Timeline

No OneTemporary
Actions