diff --git a/lib/common/watchdog.c b/lib/common/watchdog.c index 022884f37f..70c22c6d54 100644 --- a/lib/common/watchdog.c +++ b/lib/common/watchdog.c @@ -1,250 +1,256 @@ /* * Copyright (C) 2013 Lars Marowsky-Bree * 2014 Andrew Beekhof * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This software is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include #include #include #include #include #include #include #include #include #ifdef _POSIX_MEMLOCK # include #endif static int sbd_pid = 0; enum pcmk_panic_flags { pcmk_panic_none = 0x00, pcmk_panic_delay = 0x01, pcmk_panic_kdump = 0x02, pcmk_panic_shutdown = 0x04, }; #define SYSRQ "/proc/sys/kernel/sysrq" void sysrq_init(void) { static bool need_init = true; FILE* procf; int c; if(need_init) { need_init = false; } else { return; } procf = fopen(SYSRQ, "r"); if (!procf) { crm_perror(LOG_ERR, "Cannot open "SYSRQ" for read"); return; } if (fscanf(procf, "%d", &c) != 1) { crm_perror(LOG_ERR, "Parsing "SYSRQ" failed"); c = 0; } fclose(procf); if (c == 1) return; /* 8 for debugging dumps of processes, 128 for reboot/poweroff */ c |= 136; procf = fopen(SYSRQ, "w"); if (!procf) { crm_perror(LOG_ERR, "Cannot write to "SYSRQ); return; } fprintf(procf, "%d", c); fclose(procf); return; } static void sysrq_trigger(char t) { FILE *procf; sysrq_init(); procf = fopen("/proc/sysrq-trigger", "a"); if (!procf) { crm_perror(LOG_ERR, "Opening sysrq-trigger 
failed"); return; } crm_info("sysrq-trigger: %c\n", t); fprintf(procf, "%c\n", t); fclose(procf); return; } static void pcmk_panic_local(void) { int rc = pcmk_ok; uid_t uid = geteuid(); pid_t ppid = getppid(); + const char *panic_action = getenv("PCMK_panic_action"); if(uid != 0 && ppid > 1) { /* We're a non-root pacemaker daemon (cib, crmd, pengine, * attrd, etc) with the original pacemakerd parent * * Of these, only crmd is likely to be initiating resets */ do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid); crm_exit(pcmk_err_panic); return; } else if (uid != 0) { /* * No permissions and no pacemakerd parent to escalate to * Track down the new pacakerd process and send a signal instead */ union sigval signal_value; memset(&signal_value, 0, sizeof(signal_value)); ppid = crm_procfs_pid_of("pacemakerd"); do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid); if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) { crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid); } /* The best we can do now is die */ crm_exit(pcmk_err_panic); return; } /* We're either pacemakerd, or a pacemaker daemon running as root */ - sysrq_trigger('b'); + /* 'crash' selects sysrq 'c' instead of 'b'; getenv() returns NULL when the variable is unset, so test it before strcmp() */ + if (panic_action != NULL && strcmp("crash", panic_action) == 0) { + sysrq_trigger('c'); + } else { + sysrq_trigger('b'); + } /* reboot(RB_HALT_SYSTEM); rc = errno; */ reboot(RB_AUTOBOOT); rc = errno; do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc); if(ppid > 1) { /* child daemon */ exit(pcmk_err_panic); } else { /* pacemakerd or orphan child */ exit(DAEMON_RESPAWN_STOP); } } static void pcmk_panic_sbd(void) { union sigval signal_value; pid_t ppid = getppid(); do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid); memset(&signal_value, 0, sizeof(signal_value)); /* TODO: Arrange for a slightly less brutal option? 
*/ if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) { crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid); pcmk_panic_local(); } if(ppid > 1) { /* child daemon */ exit(pcmk_err_panic); } else { /* pacemakerd or orphan child */ exit(DAEMON_RESPAWN_STOP); } } void pcmk_panic(const char *origin) { static struct qb_log_callsite *panic_cs = NULL; if (panic_cs == NULL) { panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog); } /* Ensure sbd_pid is set */ (void)pcmk_locate_sbd(); if (panic_cs && panic_cs->targets) { /* getppid() == 1 means our original parent no longer exists */ do_crm_log_always(LOG_EMERG, "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d", origin, sbd_pid, getppid()); crm_exit(DAEMON_RESPAWN_STOP); return; } if(sbd_pid > 1) { do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin); pcmk_panic_sbd(); } else { do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin); pcmk_panic_local(); } } pid_t pcmk_locate_sbd(void) { char *pidfile = NULL; char *sbd_path = NULL; if(sbd_pid > 1) { return sbd_pid; } /* Look for the pid file */ pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR); sbd_path = crm_strdup_printf("%s/sbd", SBINDIR); /* Read the pid file */ CRM_ASSERT(pidfile); sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path); if(sbd_pid > 0) { crm_trace("SBD detected at pid=%d (file)", sbd_pid); } else { /* Fall back to /proc for systems that support it */ sbd_pid = crm_procfs_pid_of("sbd"); crm_trace("SBD detected at pid=%d (proc)", sbd_pid); } if(sbd_pid < 0) { sbd_pid = 0; crm_trace("SBD not detected"); } free(pidfile); free(sbd_path); return sbd_pid; } diff --git a/mcp/pacemaker.sysconfig b/mcp/pacemaker.sysconfig index d76ccb2a23..78293fd762 100644 --- a/mcp/pacemaker.sysconfig +++ b/mcp/pacemaker.sysconfig @@ -1,94 +1,100 @@ # Turn on special handling for CMAN clusters in the init script # Without this, 
fenced (and by inference, cman) cannot reliably be made to shut down # PCMK_STACK=cman #==#==# Variables that control logging # Enable debug logging globally or per-subsystem # Multiple subsystems may me listed separated by commas # eg. PCMK_debug=crmd,pengine # PCMK_debug=yes|no|crmd|pengine|cib|stonith-ng|attrd|pacemakerd # Send INFO (and higher) messages to the named log file # Additional messages may also appear here depending on any configured debug and trace settings # By default Pacemaker will inherit the logfile specified in corosync.conf # PCMK_logfile=/var/log/pacemaker.log # Specify an alternate syslog target for NOTICE (and higher) messages # Use 'none' to disable - not recommended # The default value is 'daemon' # PCMK_logfacility=none|daemon|user|local0|local1|local2|local3|local4|local5|local6|local7 # Send all messages up-to-and-including the configured priority to syslog # A value of 'info' will be far too verbose for most installations and 'debug' is almost certain to send you blind # The default value is 'notice' # PCMK_logpriority=emerg|alert|crit|error|warning|notice|info|debug # Log all messages from a comma-separated list of functions # PCMK_trace_functions=function1,function2,function3 # Log all messages from a comma-separated list of files (no path) # Supports wildcards eg. PCMK_trace_files=prefix*.c # PCMK_trace_files=file.c,other.h # Log all messages matching comma-separated list of formats # PCMK_trace_formats="Sent delete %d" # Log all messages from a comma-separated list of tags # PCMK_trace_tags=tag1,tag2 # Dump the blackbox whenever the message at function and line is printed # eg. PCMK_trace_blackbox=te_graph_trigger:223,unpack_clone:81 # PCMK_trace_blackbox=fn:line,fn2:line2,... 
# Enable blackbox logging globally or per-subsystem # The blackbox contains a rolling buffer of all logs (including info+debug+trace) # and is written after a crash, assertion failure and/or when SIGTRAP is received # # The blackbox recorder can also be enabled for Pacemaker daemons at runtime by # sending SIGUSR1 (or SIGTRAP), and disabled by sending SIGUSR2 # # Multiple subsystems may me listed separated by commas # eg. PCMK_blackbox=crmd,pengine # PCMK_blackbox=yes|no|crmd|pengine|cib|stonith-ng|attrd|pacemakerd #==#==# Advanced use only # Enable this for compatibility with older corosync (prior to 2.0) # based clusters which used the nodes uname as its uuid also # PCMK_uname_is_uuid=no # Specify an alternate location for RNG schemas and XSL transforms # Mostly only useful for developer testing # PCMK_schema_directory=/some/path # Enable this for rebooting this machine at the time of process (subsystem) failure # PCMK_fail_fast=no +# Select the action taken at the time of the system reboot triggered by fail_fast +# When set to 'crash', the kernel is made to panic +# Set this to 'crash' if you want to capture a kdump +# The default action is 'reboot' +# PCMK_panic_action=crash + #==#==# Pacemaker Remote # Use a custom directory for finding the authkey. 
# PCMK_authkey_location=/etc/pacemaker/authkey # # Specify a custom port for Pacemaker Remote connections # PCMK_remote_port=3121 #==#==# IPC # Force use of a particular class of IPC connection # PCMK_ipc_type=shared-mem|socket|posix|sysv # Specify an IPC buffer size in bytes # Useful when connecting to really big clusters that exceed the default 128k buffer # PCMK_ipc_buffer=131072 #==#==# Profiling and memory leak testing # Variables for running child daemons under valgrind and/or checking for memory problems # G_SLICE=always-malloc # MALLOC_PERTURB_=221 # or 0 # MALLOC_CHECK_=3 # or 0,1,2 # PCMK_valgrind_enabled=yes # PCMK_valgrind_enabled=cib,crmd # PCMK_callgrind_enabled=yes # PCMK_callgrind_enabled=cib,crmd # VALGRIND_OPTS="--leak-check=full --trace-children=no --num-callers=25 --log-file=/var/lib/pacemaker/valgrind-%p --suppressions=/usr/share/pacemaker/tests/valgrind-pcmk.suppressions --gen-suppressions=all"