diff --git a/tools/hb_report.in b/tools/hb_report.in index eab91f92ed..c0bfb8ea35 100755 --- a/tools/hb_report.in +++ b/tools/hb_report.in @@ -1,672 +1,733 @@ #!/bin/sh # Copyright (C) 2007 Dejan Muhamedagic # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # . @sysconfdir@/ha.d/shellfuncs . $HA_NOARCHBIN/utillib.sh PROG=`basename $0` # FIXME: once this is part of the package! PROGDIR=`dirname $0` echo "$PROGDIR" | grep -qs '^/' || { test -f @sbindir@/$PROG && PROGDIR=@sbindir@ test -f $HA_NOARCHBIN/$PROG && PROGDIR=$HA_NOARCHBIN } LOGD_CF=`findlogdcf @sysconfdir@ $HA_DIR` export LOGD_CF : ${SSH_OPTS="-T"} LOG_PATTERNS="CRIT: ERROR:" PEINPUTS_PATT="peng.*PEngine Input stored" # # the instance where user runs hb_report is the master # the others are slaves # if [ x"$1" = x__slave ]; then SLAVE=1 fi # # if this is the master, allow ha.cf and logd.cf in the current dir # (because often the master is the log host) # if [ "$SLAVE" = "" ]; then [ -f ha.cf ] && HA_CF=ha.cf [ -f logd.cf ] && LOGD_CF=logd.cf fi usage() { cat< $DESTDIR/.env" done } start_remote_collectors() { for node in `getnodes`; do [ "$node" = "$WE" ] && continue ssh $ssh_opts $node "$PROGDIR/hb_report __slave $DESTDIR" | (cd $DESTDIR && tar xf -) & SLAVEPIDS="$SLAVEPIDS $!" done } # # does ssh work? 
# testsshuser() { if [ "$2" ]; then ssh -T -o Batchmode=yes $2@$1 true 2>/dev/null else ssh -T -o Batchmode=yes $1 true 2>/dev/null fi } findsshuser() { for u in "" $TRY_SSH; do rc=0 for n in `getnodes`; do [ "$node" = "$WE" ] && continue testsshuser $n $u || { rc=1 break } done if [ $rc -eq 0 ]; then echo $u return 0 fi done return 1 } # # the usual stuff # getbacktraces() { flist=`find_files $HA_VARLIB/cores $1 $2` [ "$flist" ] && getbt $flist > $3 } getpeinputs() { if [ "$HA_LOGFACILITY" ]; then n=`basename $3` patt=" $n $PEINPUTS_PATT" else patt="$PEINPUTS_PATT" fi flist=$( if [ -f $3/ha-log ]; then grep "$patt" $3/ha-log | awk '{print $NF}' elif [ -f $3/../ha-log ]; then # central log grep "$patt" $3/../ha-log | awk '{print $NF}' else find_files $HA_VARLIB/pengine $1 $2 fi | sed "s,$HA_VARLIB/,,g" ) [ "$flist" ] && (cd $HA_VARLIB && tar cf - $flist) | (cd $3 && tar xf -) } touch_DC_if_dc() { dc=`crmadmin -D 2>/dev/null | awk '{print $NF}'` if [ "$WE" = "$dc" ]; then touch $1/DC fi } # # some basic system info and stats # sys_info() { echo "Heartbeat version: `hb_ver`" crm_info echo "Platform: `uname`" echo "Kernel release: `uname -r`" echo "Architecture: `arch`" [ `uname` = Linux ] && echo "Distribution: `distro`" } sys_stats() { set -x uptime ps axf ps auxw top -b -n 1 netstat -i arp -an set +x } # # replace sensitive info with '****' # sanitize() { for f in $1/ha.cf $1/cib.xml $1/pengine/*; do [ -f "$f" ] && sanitize_one $f done } # # remove duplicates if files are same, make links instead # consolidate() { for n in `getnodes`; do if [ -f $1/$2 ]; then rm $1/$n/$2 else mv $1/$n/$2 $1 fi ln -s ../$2 $1/$n done } # # some basic analysis of the report # checkcrmvfy() { for n in `getnodes`; do if [ -s $1/$n/crm_verify.txt ]; then echo "WARN: crm_verify reported warnings at $n:" cat $1/$n/crm_verify.txt fi done } checkbacktraces() { for n in `getnodes`; do [ -s $1/$n/backtraces.txt ] && { echo "WARN: coredumps found at $n:" egrep 'Core was generated|Program 
terminated' \ $1/$n/backtraces.txt | sed 's/^/ /' } done } checklogs() { logs=`find $1 -name ha-log` [ "$logs" ] || return pattfile=`maketempfile` || fatal "cannot create temporary files" for p in $LOG_PATTERNS; do echo "$p" done > $pattfile echo "" echo "Log patterns:" for n in `getnodes`; do cat $logs | grep -f $pattfile done rm -f $pattfile } # # check if files have same content in the cluster # cibdiff() { d1=`dirname $1` d2=`dirname $2` if [ -f $d1/RUNNING -a -f $d2/RUNNING ] || [ -f $d1/STOPPED -a -f $d2/STOPPED ]; then crm_diff -c -n $1 -o $2 else echo "can't compare cibs from running and stopped systems" fi } txtdiff() { diff $1 $2 } diffcheck() { [ -f "$1" ] || { echo "$1 does not exist" return 1 } [ -f "$2" ] || { echo "$2 does not exist" return 1 } case `basename $1` in ccm_tool.txt) txtdiff $1 $2;; # worddiff? cib.xml) cibdiff $1 $2;; ha.cf) txtdiff $1 $2;; # confdiff? crm_mon.txt|sysinfo.txt) txtdiff $1 $2;; esac } analyze_one() { rc=0 node0="" for n in `getnodes`; do if [ "$node0" ]; then diffcheck $1/$node0/$2 $1/$n/$2 rc=$((rc+$?)) else node0=$n fi done return $rc } analyze() { # flist="ccm_tool.txt cib.xml crm_mon.txt ha.cf logd.cf sysinfo.txt" flist="ccm_tool.txt crm_mon.txt ha.cf logd.cf sysinfo.txt" for f in $flist; do perl -e "printf \"Diff $f... \"" ls $1/*/$f >/dev/null 2>&1 || continue if analyze_one $1 $f; then echo "OK" consolidate $1 $f else echo "varies" fi done checkcrmvfy $1 checkbacktraces $1 checklogs $1 } # # description template, editing, and other notes # mktemplate() { cat<=100{exit 1}' || cat < $DESTDIR/$WE/ha-log else cat > $DESTDIR/ha-log # we are log server, probably fi fi else - warning "could not find the log file on $WE" + [ "$MASTER_IS_HOSTLOG" ] || + warning "could not find the log file on $WE" +fi + +# +# part 5: start this program on other nodes +# +if [ ! 
"$SLAVE" ]; then + if [ "$ssh_good" ]; then + send_config + start_remote_collectors + else + [ `getnodes | wc -w` -gt 1 ] && + warning "ssh does not work to all nodes" + fi fi # # part 6: get all other info (config, stats, etc) # if [ "$THIS_IS_NODE" ]; then getconfig $DESTDIR/$WE getpeinputs $FROM_TIME $TO_TIME $DESTDIR/$WE getbacktraces $FROM_TIME $TO_TIME $DESTDIR/$WE/backtraces.txt touch_DC_if_dc $DESTDIR/$WE sanitize $DESTDIR/$WE sys_info > $DESTDIR/$WE/sysinfo.txt sys_stats > $DESTDIR/$WE/sysstats.txt 2>&1 fi # # part 7: endgame: # slaves tar their results to stdout, the master waits # for them, analyses results, asks the user to edit the # problem description template, and prints final notes # if [ "$SLAVE" ]; then (cd $DESTDIR && tar cf - $WE) else wait $SLAVEPIDS analyze $DESTDIR > $DESTDIR/analysis.txt mktemplate > $DESTDIR/description.txt [ "$NO_DESCRIPTION" ] || { echo press enter to edit the problem description... read junk edittemplate $DESTDIR/description.txt } cd $DESTDIR/.. tar czf $DESTDIR.tar.gz `basename $DESTDIR` finalword checksize fi [ "$REMOVE_DEST" ] && rm -r $DESTDIR diff --git a/tools/utillib.sh b/tools/utillib.sh index 2187624d9d..0c08aa9fab 100644 --- a/tools/utillib.sh +++ b/tools/utillib.sh @@ -1,354 +1,354 @@ # Copyright (C) 2007 Dejan Muhamedagic # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. 
#
# You should have received a copy of the GNU General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#

# NOTE: this library is sourced by /bin/sh scripts, so it avoids bashisms.
# It reads the variables HA_CF, LOGD_CF, HA_BIN, HA_VARLIB, HA_LOGTAG,
# SANITIZE and BT_OPTS, and calls maketempfile, which is not defined in
# this file; all of these must be supplied by the sourcing script.

#
# ha.cf/logd.cf parsing
#

# getcfvar NAME
# Print the value(s) of directive NAME found in $HA_CF, one per line,
# after stripping '#' comments. Returns non-zero when $HA_CF is not a
# regular file (the variable is quoted so that an empty value does not
# make the test succeed).
getcfvar() {
  [ -f "$HA_CF" ] || return
  sed 's/#.*//' < "$HA_CF" |
    grep -w -- "^$1" |
    sed 's/^[^[:space:]]*[[:space:]]*//'
}

# iscfvarset NAME
# Succeed if directive NAME has a non-empty value in $HA_CF.
iscfvarset() {
  test "$(getcfvar "$1")"
}

# iscfvartrue NAME
# Succeed if a value of directive NAME starts with true/y/yes/on/1
# (case-insensitive).
iscfvartrue() {
  getcfvar "$1" |
    grep -Eqsi '^(true|y|yes|on|1)'
}

# getnodes
# Print the values of all "node" directives in $HA_CF.
getnodes() {
  getcfvar node
}

#
# logging
#

# syslogmsg SEVERITY MESSAGE...
# Send MESSAGE to syslog through logger(1) using facility
# $HA_LOGFACILITY (default: daemon) and, if set, tag $HA_LOGTAG.
syslogmsg() {
  severity=$1
  shift 1
  logtag=""
  [ "$HA_LOGTAG" ] && logtag="-t $HA_LOGTAG"
  # $logtag must be split into "-t" and the tag; the message is passed
  # as one quoted word so that glob characters in it are not expanded.
  logger -p "${HA_LOGFACILITY:-daemon}.$severity" $logtag "$*"
}

#
# find log destination
#

# uselogd
# Succeed if logging goes through logd: either use_logd is true or none
# of logfacility/logfile/debugfile is set in $HA_CF.
uselogd() {
  iscfvartrue use_logd &&
    return 0  # if use_logd true
  iscfvarset logfacility ||
    iscfvarset logfile ||
    iscfvarset debugfile ||
    return 0  # or none of the log options set
  false
}

# findlogdcf DIR...
# Print the first existing logd configuration file and return 0.
# Candidates: any "logd.cf" string found in the $HA_BIN/ha_logd binary
# (only if strings(1) is available), then DIR/logd.cf and DIR/ha_logd.cf
# for every DIR given. Returns 1 if none exists.
findlogdcf() {
  for f in \
    $(command -v strings > /dev/null 2>&1 &&
      strings "$HA_BIN/ha_logd" | grep 'logd\.cf') \
    $(for d; do echo "$d/logd.cf" "$d/ha_logd.cf"; done)
  do
    if [ -f "$f" ]; then
      echo "$f"
      return 0
    fi
  done
  return 1
}

# getlogvars
# Set HA_LOGFACILITY, HA_LOGFILE, HA_DEBUGFILE and HA_SYSLOGMSGFMT from
# the effective configuration: $LOGD_CF if logd is in use, else $HA_CF.
# Exits through fatal if logd is in use but $LOGD_CF is not a file.
getlogvars() {
  savecf=$HA_CF
  if uselogd; then
    [ -f "$LOGD_CF" ] ||
      fatal "could not find logd.cf or ha_logd.cf"
    # getcfvar always reads $HA_CF, so point it at logd.cf temporarily
    HA_CF=$LOGD_CF
  fi
  HA_LOGFACILITY=$(getcfvar logfacility)
  HA_LOGFILE=$(getcfvar logfile)
  HA_DEBUGFILE=$(getcfvar debugfile)
  HA_SYSLOGMSGFMT=""
  iscfvartrue syslogmsgfmt &&
    HA_SYSLOGMSGFMT=1
  HA_CF=$savecf
}

# findmsg MARK
# Print the log file(s) containing the fixed string MARK. In each
# candidate directory, files named ha-* are tried before everything else.
findmsg() {
  # this is tricky, we try a few directories
  syslogdir="/var/log /var/logs /var/syslog /var/adm /var/log/ha /var/log/cluster"
  favourites="ha-*"
  mark=$1
  log=""
  for d in $syslogdir; do
    [ -d "$d" ] || continue
    # the globs are deliberately left unquoted
    log=$(grep -F -l -- "$mark" $d/$favourites) && break
    log=$(grep -F -l -- "$mark" $d/*) && break
  done 2>/dev/null
  echo $log
}

#
# print a segment of a log file
#

# str2time DATE...
# Print DATE converted to seconds since the epoch, using Date::Parse or,
# failing that, Date::Manip. Prints nothing if neither module loads.
# The date is handed to perl as an argument rather than pasted into the
# program text, so quotes in it cannot alter the perl code.
str2time() {
  perl -e '
    $time = "@ARGV";
    eval "use Date::Parse";
    if (!$@) {
      print str2time($time);
    } else {
      eval "use Date::Manip";
      if (!$@) {
        print UnixDate(ParseDateString($time), "%s");
      }
    }
  ' -- "$*"
}

# getstamp
# Filter: reduce each log line on stdin to its timestamp. Lines are
# taken to be in syslog layout (first three fields) when
# $HA_SYSLOGMSGFMT or $HA_LOGFACILITY is set; otherwise the second
# field is used with its first '_' turned into a space.
getstamp() {
  if [ "$HA_SYSLOGMSGFMT" ] || [ "$HA_LOGFACILITY" ]; then
    awk '{print $1,$2,$3}'
  else
    awk '{print $2}' | sed 's/_/ /'
  fi
}

# linetime FILE LINENO
# Print the epoch time of line LINENO of FILE.
linetime() {
  l=$(tail -n +"$2" "$1" | head -n 1 | getstamp)
  str2time "$l"
}

# findln_by_time FILE TIME
# Binary-search FILE (assumed to be ordered by time) and print the
# number of the last line probed: the line stamped TIME if one exists,
# otherwise a neighbouring line. Prints nothing and warns if a probed
# line has no parseable time stamp; prints an empty line for an empty
# file.
findln_by_time() {
  logf=$1
  tm=$2
  first=1
  last=$(wc -l < "$logf")
  # variables are global in sh: reset mid so that an empty log does not
  # report the result of a previous call
  mid=""
  while [ "$first" -le "$last" ]; do
    mid=$(((last+first)/2))
    tmid=$(linetime "$logf" "$mid")
    if [ -z "$tmid" ]; then
      warning "cannot extract time: $logf:$mid"
      return
    fi
    if [ "$tmid" -gt "$tm" ]; then
      last=$((mid-1))
    elif [ "$tmid" -lt "$tm" ]; then
      first=$((mid+1))
    else
      break
    fi
  done
  echo "$mid"
}

# dumplog FILE FROM_LINE [TO_LINE]
# Print lines FROM_LINE..TO_LINE of FILE (to the end of the file if
# TO_LINE is empty). Prints nothing and returns non-zero if FROM_LINE is
# empty.
dumplog() {
  logf=$1
  from_line=$2
  to_line=$3
  [ "$from_line" ] ||
    return
  tail -n +"$from_line" "$logf" |
    if [ "$to_line" ]; then
      head -n $((to_line-from_line+1))
    else
      cat
    fi
}

#
# find files newer than a and older than b
#

# isnumber STRING
# Succeed if STRING is a non-empty sequence of decimal digits.
isnumber() {
  case "$1" in
    ''|*[!0-9]*) return 1 ;;
  esac
  return 0
}

# touchfile EPOCH
# Create a temporary file whose access/modification times are EPOCH and
# print its name. Fails if the temporary file cannot be created.
touchfile() {
  t=$(maketempfile) &&
    perl -e 'utime $ARGV[1], $ARGV[1], $ARGV[0];' -- "$t" "$1" &&
    echo "$t"
}

# find_files DIR FROM_TIME [TO_TIME]
# Print the regular files below DIR modified after FROM_TIME and, if
# TO_TIME is a positive number, not after TO_TIME (both epoch seconds).
# Warns and prints nothing unless FROM_TIME is a positive number.
find_files() {
  dir=$1
  from_time=$2
  to_time=$3
  to_stamp=""  # global: forget the stamp file of a previous call
  if ! isnumber "$from_time" || [ "$from_time" -le 0 ]; then
    warning "sorry, can't find files based on time if you don't supply time"
    return
  fi
  from_stamp=$(touchfile "$from_time") || {
    warning "cannot create temporary files"
    return 1
  }
  findexp="-newer $from_stamp"
  if isnumber "$to_time" && [ "$to_time" -gt 0 ]; then
    to_stamp=$(touchfile "$to_time") &&
      findexp="$findexp ! -newer $to_stamp"
  fi
  # $findexp holds several find(1) arguments and must be word-split
  find "$dir" -type f $findexp
  rm -f $from_stamp $to_stamp
}

#
# coredumps
#

# findbinary COREFILE
# Print the path of the program which produced COREFILE: the name gdb
# reports in its "Core was generated by" line, looked up in $PATH and
# then in $HA_BIN. Prints nothing if it cannot be determined.
findbinary() {
  # gdb wants some executable to be able to read the core at all
  random_binary=$(which cat 2>/dev/null) # suppose we are lucky
  binary=$(gdb $random_binary "$1" < /dev/null 2>/dev/null |
    awk '/Core was generated/ {print $5}' |
    sed "s/^.//;s/[.']*$//")
  [ x = x"$binary" ] && return
  fullpath=$(which "$binary" 2>/dev/null)
  if [ x = x"$fullpath" ]; then
    [ -x "$HA_BIN/$binary" ] && echo "$HA_BIN/$binary"
  else
    echo "$fullpath"
  fi
}

# getbt COREFILE...
# Print a gdb backtrace for every COREFILE. The gdb command is taken
# from $BT_OPTS (default: "thread apply all bt full") and is passed to
# -ex as a single word. Stops with status 1 at the first core whose
# binary cannot be found.
getbt() {
  command -v gdb > /dev/null 2>&1 || {
    warning "please install gdb to get backtraces"
    return
  }
  for corefile; do
    absbinpath=$(findbinary "$corefile")
    [ x = x"$absbinpath" ] && return 1
    echo "====================== start backtrace ======================"
    ls -l "$corefile"
    gdb -batch -n -quiet -ex "${BT_OPTS:-thread apply all bt full}" -ex quit \
      "$absbinpath" "$corefile" 2>/dev/null
    echo "======================= end backtrace ======================="
  done
}

#
# heartbeat configuration/status
#

# iscrmrunning
# Succeed if "crmadmin -D" succeeds.
iscrmrunning() {
  crmadmin -D >/dev/null 2>&1
}

# dumpstate DIR
# Save crm_mon, cibadmin and ccm_tool output into DIR.
dumpstate() {
  crm_mon -1 | grep -v '^Last upd' > "$1/crm_mon.txt"
  cibadmin -Ql > "$1/cib.xml"
  ccm_tool -p > "$1/ccm_tool.txt" 2>&1
}

# getconfig DIR
# Copy $HA_CF and $LOGD_CF into DIR, save the CIB (from the running
# cluster, or else from $HA_VARLIB/crm/cib.xml) and mark DIR with a
# RUNNING or STOPPED file. If a cib.xml was obtained, store the
# crm_verify output next to it.
getconfig() {
  [ -f "$HA_CF" ] && cp -p "$HA_CF" "$1/"
  [ -f "$LOGD_CF" ] && cp -p "$LOGD_CF" "$1/"
  if iscrmrunning; then
    dumpstate "$1"
    touch "$1/RUNNING"
  else
    cp -p "$HA_VARLIB/crm/cib.xml" "$1/" 2>/dev/null
    touch "$1/STOPPED"
  fi
  [ -f "$1/cib.xml" ] &&
    crm_verify -V -x "$1/cib.xml" > "$1/crm_verify.txt" 2>&1
}

#
# remove values of sensitive attributes
#
# this is not proper xml parsing, but it will work under the
# circumstances

# sanitize_xml_attrs
# Filter: on every line containing name="PATT", for each regular
# expression PATT in $SANITIZE, replace the first value="..." with
# value="****". Copies the input unchanged when $SANITIZE is empty
# (sed without a script would fail and produce no output at all).
sanitize_xml_attrs() {
  # collect the sed arguments in "$@" so that the '*' and '[' they
  # contain are never subject to pathname expansion
  set --
  for patt in $SANITIZE; do
    set -- "$@" -e "/name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/"
  done
  if [ $# -eq 0 ]; then
    cat
    return
  fi
  sed "$@"
}

# sanitize_hacf
# Filter: in stonith_host lines, replace field 5 and all following
# fields with "****".
sanitize_hacf() {
  awk '
  $1=="stonith_host"{
    for( i=5; i<=NF; i++ )
      $i="****";
  }
  {print}
  '
}

# sanitize_one FILE
# Sanitize FILE in place while keeping its modification time. A file
# named ha.cf goes through sanitize_hacf, anything else through
# sanitize_xml_attrs; names ending in gz or bz2 are decompressed first
# and recompressed afterwards.
sanitize_one() {
  file=$1
  compress=""
  case "$file" in
    *gz) compress=gzip ;;
    *bz2) compress=bzip2 ;;
  esac
  if [ "$compress" ]; then
    decompress="$compress -dc"
  else
    compress=cat
    decompress=cat
  fi
  tmp=$(maketempfile) && ref=$(maketempfile) ||
    fatal "cannot create temporary files"
  touch -r "$file" "$ref"  # save the mtime
  if [ "$(basename "$file")" = ha.cf ]; then
    sanitize_hacf
  else
    # $decompress may be "gzip -dc": it has to be word-split
    $decompress | sanitize_xml_attrs | $compress
  fi < "$file" > "$tmp"
  mv "$tmp" "$file"
  touch -r "$ref" "$file"
  rm -f "$ref"
}

#
# keep the user posted
#

# fatal MESSAGE...
# Print an ERROR message prefixed with the node name on stderr and exit
# with status 1.
fatal() {
  echo "$(uname -n): ERROR: $*" >&2
  exit 1
}

# warning MESSAGE...
# Print a WARN message prefixed with the node name on stderr.
warning() {
  echo "$(uname -n): WARN: $*" >&2
}

# info MESSAGE...
# Print an INFO message prefixed with the node name on stderr.
info() {
  echo "$(uname -n): INFO: $*" >&2
}

# pickfirst COMMAND...
# Print the first COMMAND which is available and return 0; return 1 if
# none is.
pickfirst() {
  for x; do
    command -v "$x" >/dev/null 2>&1 && {
      echo "$x"
      return 0
    }
  done
  return 1
}

#
# get some system info
#

# distro
# Print a description of the distribution: "lsb_release -d" output if
# that command exists, otherwise the name and content of the first
# release file found in /etc. Warns if nothing is found.
distro() {
  command -v lsb_release >/dev/null 2>&1 && {
    lsb_release -d
    return
  }
  relf=$(ls /etc/debian_version 2>/dev/null) ||
    relf=$(ls /etc/slackware-version 2>/dev/null) ||
    relf=$(ls -d /etc/*-release 2>/dev/null) && {
      for f in $relf; do
        test -f "$f" && {
          echo "$(ls "$f") $(cat "$f")"
          return
        }
      done
    }
  warning "no lsb_release no /etc/*-release no /etc/debian_version"
}

# hb_ver
# Print the version of the installed heartbeat package as reported by
# dpkg or rpm, whichever is found first.
hb_ver() {
  command -v dpkg > /dev/null 2>&1 && {
    dpkg-query -f '${Version}' -W heartbeat 2>/dev/null ||
      dpkg-query -f '${Version}' -W heartbeat-2
    return
  }
  command -v rpm > /dev/null 2>&1 && {
    rpm -q --qf '%{version}' heartbeat
    return
  }
  # more packagers?
}

# crm_info
# Print the output of "$HA_BIN/crmd version".
crm_info() {
  "$HA_BIN/crmd" version 2>&1
}