diff --git a/tools/crm_report.in b/tools/crm_report.in index cd6e63fd1d..38a45cec21 100755 --- a/tools/crm_report.in +++ b/tools/crm_report.in @@ -1,400 +1,411 @@ #!/bin/sh # Copyright (C) 2010 Andrew Beekhof # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # # Note the quotes around `$TEMP': they are essential! TEMP=`getopt \ -o hv?xl:f:t:n:T:Lpc:dSACHu:MV \ --long help,cts:,node:,nodes:,from:,to:logfile:,as-directory,single-node,cluster:,user:,version,features \ -n 'pcmk_report' -- "$@"` eval set -- "$TEMP" times="" tests="" nodes="" compress=1 cluster="any" ssh_user="root" search_logs=1 report_data=`dirname $0` extra_logs="" sanitize_patterns="passw.*" log_patterns="CRIT: ERROR:" usage() { cat< "$l_base/$HALOG_F" fi cat<$l_base/.env LABEL="$label" REPORT_HOME="$r_base" REPORT_MASTER="$host" LOG_START=$start LOG_END=$end REMOVE=1 SANITIZE="$sanitize_patterns" CLUSTER=$cluster LOG_PATTERNS="$log_patterns" EXTRA_LOGS="$extra_logs" SEARCH_LOGS=$search_logs verbose=$verbose EOF for node in $nodes; do if [ `uname -n` = $node ]; then cat $l_base/.env $report_data/report.common $report_data/report.collector > $r_base/collector bash $r_base/collector else cat $l_base/.env $report_data/report.common $report_data/report.collector \ | ssh -l $ssh_user -T $node -- "mkdir -p $r_base; cat > $r_base/collector; bash $r_base/collector" | (cd $l_base && tar xf -) fi done analyze $l_base > $l_base/$ANALYSIS_F if [ -f $l_base/$HALOG_F ]; then node_events $l_base/$HALOG_F > $l_base/$EVENTS_F fi for node in $nodes; do cat $l_base/$node/$ANALYSIS_F >> $l_base/$ANALYSIS_F if [ -s $l_base/$node/$EVENTS_F ]; then cat $l_base/$node/$EVENTS_F >> $l_base/$EVENTS_F elif [ -s $l_base/$HALOG_F ]; then awk "\$4==\"$nodes\"" $l_base/$EVENTS_F >> $l_base/$n/$EVENTS_F fi done log " " if [ $compress = 1 ]; then fname=`shrink $l_base` rm -rf $l_base log "Collected results are available in $fname" log " " log "Please create a bug entry at" log " http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker" log "Include a description of your problem and attach this tarball" log " " log "Thank you for taking time to create this report." else log "Collected results are available in $l_base" fi log " " } # # check if files have same content in the cluster # cibdiff() { d1=`dirname $1` d2=`dirname $2` if [ -f $d1/RUNNING -a -f $d2/RUNNING ] || [ -f $d1/STOPPED -a -f $d2/STOPPED ]; then if which crm_diff > /dev/null 2>&1; then crm_diff -c -n $1 -o $2 else info "crm_diff(8) not found, cannot diff CIBs" fi else echo "can't compare cibs from running and stopped systems" fi } diffcheck() { [ -f "$1" ] || { echo "$1 does not exist" return 1 } [ -f "$2" ] || { echo "$2 does not exist" return 1 } case `basename $1` in $CIB_F) cibdiff $1 $2;; $B_CONF) diff -u $1 $2;; # confdiff? *) diff -u $1 $2;; esac } # # remove duplicates if files are same, make links instead # consolidate() { for n in $NODES; do if [ -f $1/$2 ]; then rm $1/$n/$2 else mv $1/$n/$2 $1 fi ln -s ../$2 $1/$n done } analyze_one() { rc=0 node0="" for n in $NODES; do if [ "$node0" ]; then diffcheck $1/$node0/$2 $1/$n/$2 rc=$(($rc+$?)) else node0=$n fi done return $rc } analyze() { flist="$HOSTCACHE $MEMBERSHIP_F $CIB_F $CRM_MON_F $B_CONF logd.cf $SYSINFO_F" for f in $flist; do printf "Diff $f... " ls $1/*/$f >/dev/null 2>&1 || { echo "no $1/*/$f :/" continue } if analyze_one $1 $f; then echo "OK" [ "$f" != $CIB_F ] && consolidate $1 $f else echo "" fi done } do_cts() { - if [ x$ctslog = x ]; then - ctslog=`findmsg 1 "CTS: Stack:"` - fi - if [ x$ctslog = x ]; then - fatal "No CTS control file detected" - fi - - if [ -z "$nodes" ]; then - debug "Using CTS control file: $ctslog" - nodes=`grep CTS: $ctslog | grep -v debug: | grep " \* " | sed s:.*\\\*::g | sort -u | tr '\\n' ' '` - fi - test_sets=`echo $tests | tr ',' ' '` for test_set in $test_sets; do + + start_time=0 start_test=`echo $test_set | tr '-' ' ' | awk '{print $1}'` + + end_time=0 end_test=`echo $test_set | tr '-' ' ' | awk '{print $2}'` if [ x$end_test = x ]; then msg="Extracting test $start_test" label="CTS-`date +"%a-%d-%b-%Y"`-$start_test" end_test=`expr $start_test + 1` else msg="Extracting set $start_test to $end_test..." label="CTS-`date +"%a-%d-%b-%Y"`-$start_test-$end_test" end_test=`expr $end_test + 1` fi if [ $start_test = 0 ]; then start_pat="BEGINNING [0-9].* TESTS" else start_pat="Running test.*\[ *$start_test\]" fi - ctslog=`findmsg 1 "$start_pat"` + + if [ x$ctslog = x ]; then + ctslog=`findmsg 1 "$start_pat"` + fi + if [ x$ctslog = x ]; then + fatal "No CTS control file detected" + fi + line=`grep -n "$start_pat" $ctslog | tail -1 | sed 's/:.*//'` - start_time=`linetime $ctslog $line` + if [ ! -z "$line" ]; then + start_time=`linetime $ctslog $line` + fi - ctslog=`findmsg 1 "Running test.*\[ *$end_test\]"` line=`grep -n "Running test.*\[ *$end_test\]" $ctslog | tail -1 | sed 's/:.*//'` - end_time=`linetime $ctslog $line` + if [ ! -z "$line" ]; then + end_time=`linetime $ctslog $line` + fi + if [ -z "$nodes" ]; then + debug "Using CTS control file: $ctslog" + nodes=`grep CTS: $ctslog | grep -v debug: | grep " \* " | sed s:.*\\\*::g | sort -u | tr '\\n' ' '` + fi + if [ $end_time -lt $start_time ]; then debug "Test didn't complete, grabbing everything up to now" end_time=`date +%s` fi - log "$msg (`time2str $start_time` to `time2str $end_time`)" - collect_data $label $start_time $end_time $ctslog + if [ $start_time != 0 ];then + log "$msg (`time2str $start_time` to `time2str $end_time`)" + collect_data $label $start_time $end_time $ctslog + else + fatal "$msg failed: not found" + fi done } getnodes() { if [ -z $1 ]; then cluster=`get_cluster_type` else cluster=$1 fi cluster_cf=`find_cluster_cf $cluster` # 1. Live if ps -ef | egrep -qs [c]rmd then debug "Querying CRM for nodes" cibadmin -Ql -o nodes | awk ' /type="normal"/ { for( i=1; i<=NF; i++ ) if( $i~/^uname=/ ) { sub("uname=.","",$i); sub("\".*","",$i); print $i; next; } } ' # 2. hostcache elif [ -f $HA_STATE_DIR/hostcache ]; then debug "Reading nodes from $HA_STATE_DIR/hostcache" awk '{print $1}' $HA_STATE_DIR/hostcache # 3. ha.cf elif [ "x$cluster" = "xheartbeat" ]; then debug "Reading nodes from $cluster_cf" getcfvar $cluster node $cluster_cf else # Look in the logs... logfile=`findmsg 1 "crm_update_peer"` debug "Reading nodes from $logfile" if [ ! -z "$logfile" ]; then grep crm_update_peer: $logfile | sed s/.*crm_update_peer// | sed s/://g | awk '{print $2}' | grep -v "(null)" | sort -u | tr '\n' ' ' fi fi } if [ "x$tests" != "x" ]; then do_cts elif [ "x$start_time" != "x" ]; then masterlog="" if [ -z "$nodes" ]; then nodes=`getnodes $cluster` log "Calculated node list: $nodes" fi if [ -z "$nodes" ]; then fatal "Cannot determine node list, please specify manually with --nodes" fi if echo $nodes | grep -qs $host then debug "We are a cluster node" else debug "We are a log master" masterlog=`findmsg 1 "crmd\\|CTS"` fi if [ -z $end_time ]; then end_time=`perl -e 'print time()'` fi label="pcmk-`date +"%a-%d-%b-%Y"`" log "Collecting data from $nodes (`time2str $start_time` to `time2str $end_time`)" collect_data $label $start_time $end_time $masterlog else fatal "Not sure what to do, no tests or times to extract" fi diff --git a/tools/report.collector b/tools/report.collector index c708723372..39f351b660 100644 --- a/tools/report.collector +++ b/tools/report.collector @@ -1,651 +1,651 @@ # Copyright (C) 2007 Dejan Muhamedagic # Almost everything as part of hb_report # Copyright (C) 2010 Andrew Beekhof # Cleanups, refactoring, extensions # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # if echo $REPORT_HOME | grep -qs '^/' then - debug "Using full path to working directory" + debug "Using full path to working directory: $REPORT_HOME" else - debug "Canonicalizing working directory path" REPORT_HOME="$HOME/$REPORT_HOME" + debug "Canonicalizing working directory path: $REPORT_HOME" fi findlogdcf() { for f in \ `test -x $CRM_DAEMON_DIR/ha_logd && which strings > /dev/null 2>&1 && strings $CRM_DAEMON_DIR/ha_logd | grep 'logd\.cf'` \ `for d; do echo $d/logd.cf $d/ha_logd.cf; done` do if [ -f "$f" ]; then echo $f debug "Located logd.cf at: $f" return 0 fi done debug "Could not determine logd.cf location" return 1 } # # find files newer than a and older than b # isnumber() { echo "$*" | grep -qs '^[0-9][0-9]*$' } touchfile() { t=`mktemp` && perl -e "\$file=\"$t\"; \$tm=$1;" -e 'utime $tm, $tm, $file;' && echo $t } find_files_clean() { [ -z "$from_stamp" ] || rm -f "$from_stamp" [ -z "$to_stamp" ] || rm -f "$to_stamp" from_stamp="" to_stamp="" } find_files() { dirs=$1 from_time=$2 to_time=$3 isnumber "$from_time" && [ "$from_time" -gt 0 ] || { warning "sorry, can't find files in [ $1 ] based on time if you don't supply time" return } trap find_files_clean 0 if ! from_stamp=`touchfile $from_time`; then warning "sorry, can't create temporary file for find_files" return fi findexp="-newer $from_stamp" if isnumber "$to_time" && [ "$to_time" -gt 0 ]; then if ! to_stamp=`touchfile $to_time`; then warning "sorry, can't create temporary file for find_files" find_files_clean return fi findexp="$findexp ! -newer $to_stamp" fi find $dirs -type f $findexp find_files_clean trap "" 0 } # # check permissions of files/dirs # pl_checkperms() { perl -e ' # check permissions and ownership # uid and gid are numeric # everything must match exactly # no error checking! (file should exist, etc) ($filename, $perms, $in_uid, $in_gid) = @ARGV; ($mode,$uid,$gid) = (stat($filename))[2,4,5]; $p=sprintf("%04o", $mode & 07777); $p ne $perms and exit(1); $uid ne $in_uid and exit(1); $gid ne $in_gid and exit(1); ' $* } num_id() { getent $1 $2 | awk -F: '{print $3}' } chk_id() { [ "$2" ] && return 0 echo "$1: id not found" return 1 } check_perms() { while read type f p uid gid; do [ -$type $f ] || { echo "$f wrong type or doesn't exist" continue } n_uid=`num_id passwd $uid` chk_id "$uid" "$n_uid" || continue n_gid=`num_id group $gid` chk_id "$gid" "$n_gid" || continue pl_checkperms $f $p $n_uid $n_gid || { echo "wrong permissions or ownership for $f:" ls -ld $f } done } # # coredumps # findbinary() { random_binary=`which cat 2>/dev/null` # suppose we are lucky binary=`gdb $random_binary $1 < /dev/null 2>/dev/null | grep 'Core was generated' | awk '{print $5}' | sed "s/^.//;s/[.':]*$//"` if [ x = x"$binary" ]; then debug "Could not detect the program name for core $1 from the gdb output; will try with file(1)" binary=$(file $1 | awk '/from/{ for( i=1; i<=NF; i++ ) if( $i == "from" ) { print $(i+1) break } }') binary=`echo $binary | tr -d "'"` binary=$(echo $binary | tr -d '`') if [ "$binary" ]; then binary=`which $binary 2>/dev/null` fi fi if [ x = x"$binary" ]; then warning "Could not find the program path for core $1" return fi fullpath=`which $binary 2>/dev/null` if [ x = x"$fullpath" ]; then if [ -x $CRM_DAEMON_DIR/$binary ]; then echo $CRM_DAEMON_DIR/$binary debug "Found the program at $CRM_DAEMON_DIR/$binary for core $1" else warning "Could not find the program path for core $1" fi else echo $fullpath debug "Found the program at $fullpath for core $1" fi } getbt() { which gdb > /dev/null 2>&1 || { warning "Please install gdb to get backtraces" return } for corefile; do absbinpath=`findbinary $corefile` [ x = x"$absbinpath" ] && continue echo "====================== start backtrace ======================" ls -l $corefile gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt full"} -ex quit \ $absbinpath $corefile 2>/dev/null echo "======================= end backtrace =======================" done } getconfig() { target=$1; shift; for cf in $*; do if [ -f "$cf" ]; then cp -p "$cf" $target/ fi done crm_uuid -r > $target/$HB_UUID_F 2>&1 if ps -ef | egrep -qs [c]rmd then crm_mon -1 2>&1 | grep -v '^Last upd' > $target/$CRM_MON_F - cibadmin -Ql > $target/${CIB_F}.live + cibadmin -Ql 2>/dev/null > $target/${CIB_F}.live crm_node -p > $target/$MEMBERSHIP_F 2>&1 echo "$host" > $target/RUNNING else echo "$host" > $target/STOPPED fi if [ -f "$target/$CIB_F" ]; then crm_verify -V -x $target/$CIB_F >$target/$CRM_VERIFY_F 2>&1 CIB_file=$target/$CIB_F crm configure show >$target/$CIB_TXT_F 2>&1 fi } # # remove values of sensitive attributes # # this is not proper xml parsing, but it will work under the # circumstances sanitize_xml_attrs() { sed $( for patt in $SANITIZE; do echo "-e /name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/" done ) } sanitize_hacf() { awk ' $1=="stonith_host"{ for( i=5; i<=NF; i++ ) $i="****"; } {print} ' } sanitize_one_clean() { [ -z "$tmp" ] || rm -f "$tmp" tmp="" [ -z "$ref" ] || rm -f "$ref" ref="" } sanitize() { file=$1 compress="" if [ -z $SANITIZE ]; then return fi echo $file | grep -qs 'gz$' && compress=gzip echo $file | grep -qs 'bz2$' && compress=bzip2 if [ "$compress" ]; then decompress="$compress -dc" else compress=cat decompress=cat fi trap sanitize_one_clean 0 tmp=`mktemp` ref=`mktemp` if [ -z "$tmp" -o -z "$ref" ]; then sanitize_one_clean fatal "cannot create temporary files" fi touch -r $file $ref # save the mtime if [ "`basename $file`" = ha.cf ]; then sanitize_hacf else $decompress | sanitize_xml_attrs | $compress fi < $file > $tmp mv $tmp $file # note: cleaning $tmp up is still needed even after it's renamed # because its temp directory is still there. touch -r $ref $file sanitize_one_clean trap "" 0 } pickfirst() { for x; do which $x >/dev/null 2>&1 && { echo $x return 0 } done return 1 } # # get some system info # distro() { if which lsb_release >/dev/null 2>&1 then lsb_release -d debug "Using lsb_release for distribution info" return fi relf=`ls /etc/debian_version 2>/dev/null` || relf=`ls /etc/slackware-version 2>/dev/null` || relf=`ls -d /etc/*-release 2>/dev/null` && { for f in $relf; do test -f $f && { echo "`ls $f` `cat $f`" debug "Found `echo $relf | tr '\n' ' '` distribution release file(s)" return } done } warning "No lsb_release, no /etc/*-release, no /etc/debian_version: no distro information" } pkg_ver() { if which dpkg >/dev/null 2>&1 ; then pkg_mgr="deb" elif which rpm >/dev/null 2>&1 ; then pkg_mgr="rpm" elif which pkg_info >/dev/null 2>&1 ; then pkg_mgr="pkg_info" elif which pkginfo >/dev/null 2>&1 ; then pkg_mgr="pkginfo" else warning "Unknown package manager" return fi debug "The package manager is: $pkg_mgr" echo "The package manager is: $pkg_mgr" # for Linux .deb based systems case $pkg_mgr in deb) dpkg-query -f '${Package} ${Version} ${Architecture}\n' -W for pkg in $*; do if dpkg-query -W $pkg 2>/dev/null ; then debug "Verifying installation of: $pkg" echo "Verifying installation of: $pkg" debsums -s $pkg 2>/dev/null fi done ;; rpm) rpm -qa --qf '%{name} %{version}-%{release} - %{distribution} %{arch}\n' for pkg in $*; do if rpm -q $pkg >/dev/null 2>&1 ; then debug "Verifying installation of: $pkg" echo "Verifying installation of: $pkg" rpm --verify $pkg fi done ;; pkg_info) pkg_info ;; pkginfo) pkginfo | awk '{print $3}' # format? ;; esac } getbacktraces() { debug "Looking for backtraces: $*" flist=$( for f in `find_files "$CRM_CORE_DIRS" $1 $2`; do bf=`basename $f` test `expr match $bf core` -gt 0 && echo $f done) if [ "$flist" ]; then log "Found core files: `echo $flist | tr '\n' ' '`" getbt "$flist" fi } getpeinputs() { flist=$( find_files $PE_STATE_DIR $1 $2 | sed "s,`dirname $PE_STATE_DIR`/,,g" ) if [ "$flist" ]; then (cd `dirname $PE_STATE_DIR` && tar cf - $flist) | (cd $3 && tar xf -) debug "found `echo $flist | wc -w` pengine input files in $PE_STATE_DIR" fi } # # some basic system info and stats # sys_info() { cluster=$1; shift echo "Platform: `uname`" echo "Kernel release: `uname -r`" echo "Architecture: `uname -m`" if [ `uname` = Linux ]; then echo "Distribution: `distro`" fi cibadmin --version 2>&1 cibadmin -! 2>&1 case $1 in openais) : echo "openais version: how?" ;; corosync) /usr/sbin/corosync -v 2>&1 ;; heartbeat) heartbeat version: `$CRM_DAEMON_DIR/heartbeat -V` 2>&1 ;; esac # TODO: Get cluster-glue build version echo "resource-agents: `grep 'Build version:' /usr/lib/ocf/resource.d/heartbeat/.ocf-shellfuncs`" pkg_ver $* } sys_stats() { set -x uname -n uptime ps axf ps auxw top -b -n 1 ifconfig -a ip addr list netstat -i arp -an test -d /proc && { cat /proc/cpuinfo } lsscsi lspci mount df set +x } dlm_dump() { if which dlm_tool >/dev/null 2>&1 ; then echo NOTICE - Lockspace overview: dlm_tool ls dlm_tool ls | grep name | while read X N ; do echo NOTICE - Lockspace $N: dlm_tool lockdump $N done echo NOTICE - Lockspace history: dlm_tool dump fi } iscfvarset() { test "`getcfvar $1 $2`" } iscfvartrue() { getcfvar $1 $2 $3 | egrep -qsi "^(true|y|yes|on|1)" } uselogd() { cf_file=$2 case $1 in heartbeat) iscfvartrue $1 use_logd $cf_file && return 0 # if use_logd true iscfvarset $1 logfacility $cf_file || iscfvarset $1 logfile $cf_file || iscfvarset $1 debugfile $cf_file || return 0 # or none of the log options set false ;; *) iscfvartrue $1 use_logd $cf_file ;; esac } get_logfile() { cf_type=$1 cf_file="$2" cf_logd="$3" facility_var="logfacility" if [ -f "$cf_logd" ]; then if uselogd; then cf_file="$cf_logd" cf_type="logd" fi fi debug "Reading $cf_type log settings" case $cf_type in openais|corosync) debug "Reading log settings from $cf_file" if iscfvartrue $cf_type to_syslog $cf_file; then facility_var=syslog_facility elif iscfvartrue $cf_type to_file $cf_file; then logfile=`getcfvar $cf_type logfile $cf_file syslog_facility` fi ;; heartbeat|logd) debug "Reading log settings from $cf_file" if iscfvartrue $cf_type debug $cf_file then logfile=`getcfvar $cf_type debugfile $cf_file` else logfile=`getcfvar $cf_type logfile $cf_file` fi ;; *) debug "Unknown cluster type: $cf_type" echo "/var/log/messages" ;; esac if [ "x$logfile" != "x" -a -f "$logfile" ]; then echo $logfile else facility=`getcfvar $cf_type $facility_var $cf_file` [ "" = "$facility" ] && facility="daemon" if [ "none" = "$facility" ]; then fatal "No logging is configured" fi msg="Mark:pcmk:`perl -e 'print time()'`" logger -p $facility.info $msg >/dev/null 2>&1 findmsg 1 "$msg" fi } essential_files() { cat< $SYSINFO_F essential_files $cluster | check_perms > $PERMISSIONS_F 2>&1 getconfig "$REPORT_HOME/$host" "$cluster_cf" "$logd_cf" "$HA_STATE_DIR/crm/$CIB_F" "$HA_STATE_DIR/hostcache" -dlm_dump > $DLM_DUMP_F 2>&1 -sys_stats > $SYSSTATS_F 2>&1 - getpeinputs $LOG_START $LOG_END $REPORT_HOME/$host getbacktraces $LOG_START $LOG_END > $REPORT_HOME/$host/$BT_F dc=`crm_mon -1 2>/dev/null | awk '/Current DC/ {print $3}'` if [ "$host" = "$dc" ]; then echo "$host" > DC fi +dlm_dump > $DLM_DUMP_F 2>&1 +sys_stats > $SYSSTATS_F 2>&1 + debug "Sanitizing files" # # replace sensitive info with '****' # for f in `basename $cluster_cf` $CIB_F $CIB_TXT_F $CIB_F.live pengine/*; do if [ -f "$f" ]; then sanitize $f fi done # Grab logs #debug "Gathering logs: $logfile $EXTRA_LOGS" trap '[ -z "$pattfile" ] || rm -f "$pattfile"' 0 pattfile=`mktemp` || fatal "cannot create temporary files" for p in $LOG_PATTERNS; do echo "$p" done > $pattfile for l in $logfile $EXTRA_LOGS; do b=`basename $l` if [ ! -f "$l" ]; then # Not a file continue elif [ -f "$b" ]; then # We already have it continue fi dumplogset "$l" $LOG_START $LOG_END > "$b" echo "Log patterns $host:" > $ANALYSIS_F cat $b | grep -f $pattfile >> $ANALYSIS_F done rm -f $pattfile trap "" 0 # Purge files containing no information for f in `ls -1`; do if [ -d "$f" ]; then continue elif [ ! -s "$f" ]; then debug "Removing empty file: $f" rm -f $f fi done # Parse for events for l in $logfile $EXTRA_LOGS; do node_events `basename $logfile` > $EVENTS_F # Link the first logfile to a standard name if it doesn't yet exist if [ ! -e $HALOG_F ]; then ln -s `basename $l` $HALOG_F fi done if [ "$REPORT_MASTER" != "$host" ]; then debug "Streaming report back to $REPORT_MASTER" (cd $REPORT_HOME && tar cf - $host) if [ "$REMOVE" = "1" ]; then cd rm -rf $REPORT_HOME fi fi diff --git a/tools/report.common b/tools/report.common index b700a46df0..0504afe2ae 100644 --- a/tools/report.common +++ b/tools/report.common @@ -1,629 +1,633 @@ # Copyright (C) 2007 Dejan Muhamedagic # Almost everything as part of hb_report # Copyright (C) 2010 Andrew Beekhof # Cleanups, refactoring, extensions # # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # host=`uname -n` shorthost=`echo $host | sed s:\\\\..*::` if [ -z $verbose ]; then verbose=0 fi # Target Files EVENTS_F=events.txt ANALYSIS_F=analysis.txt DESCRIPTION_F=description.txt HALOG_F=cluster-log.txt BT_F=backtraces.txt SYSINFO_F=sysinfo.txt SYSSTATS_F=sysstats.txt DLM_DUMP_F=dlm_dump.txt CRM_MON_F=crm_mon.txt MEMBERSHIP_F=members.txt HB_UUID_F=hb_uuid.txt HOSTCACHE=hostcache CRM_VERIFY_F=crm_verify.txt PERMISSIONS_F=permissions.txt CIB_F=cib.xml CIB_TXT_F=cib.txt EVENT_PATTERNS=" state do_state_transition membership pcmk_peer_update.*(lost|memb): quorum crmd.*ais.disp.*quorum.(lost|ac?quir) pause Process.pause.detected resources lrmd.*rsc:(start|stop) stonith stonithd.*(requests|(Succeeded|Failed).to.STONITH|result=) start_stop Executive.Service.RELEASE|crm_shutdown:.Requesting.shutdown|pcmk_shutdown:.Shutdown.complete " PACKAGES="pacemaker pacemaker-libs libpacemaker3 pacemaker-pygui pacemaker-pymgmt pymgmt-client openais libopenais2 libopenais3 corosync libcorosync4 resource-agents cluster-glue-libs cluster-glue libglue2 ldirectord heartbeat heartbeat-common heartbeat-resources libheartbeat2 ocfs2-tools ocfs2-tools-o2cb ocfs2console ocfs2-kmp-default ocfs2-kmp-pae ocfs2-kmp-xen ocfs2-kmp-debug ocfs2-kmp-trace drbd drbd-kmp-xen drbd-kmp-pae drbd-kmp-default drbd-kmp-debug drbd-kmp-trace drbd-heartbeat drbd-pacemaker drbd-utils drbd-bash-completion drbd-xen lvm2 lvm2-clvm cmirrord libdlm libdlm2 libdlm3 hawk ruby lighttpd kernel-default kernel-pae kernel-xen glibc " # # keep the user posted # log() { printf "%-10s $*\n" "$shorthost:" 1>&2 } debug() { if [ $verbose -gt 0 ]; then log "Debug: $*" fi } info() { log "$*" } warning() { log "WARN: $*" } fatal() { log "ERROR: $*" exit 1 } detect_host() { local_state_dir=/var if [ -d $local_state_dir/run/crm ]; then CRM_STATE_DIR=$local_state_dir/run/crm else for d in `find / -type d -name run`; do if [ -d $d/crm ]; then CRM_STATE_DIR=$d/crm local_state_dir=`dirname $d` fi done fi if [ ! -d $CRM_STATE_DIR ]; then fatal "Non-standard Pacemaker installation: State directory not found" fi debug "Machine state directory: $local_state_dir" debug "State files located in: $CRM_STATE_DIR" if [ -d $local_state_dir/lib/pengine ]; then PE_STATE_DIR=$local_state_dir/lib/pengine else for d in `find / -type d -name pengine`; do PE_STATE_DIR=$d break done fi if [ -z $PE_STATE_DIR ]; then fatal "Non-standard Pacemaker installation: Policy Engine directory not found" fi debug "PE files located in: $PE_STATE_DIR" HA_STATE_DIR=$local_state_dir/lib/heartbeat if [ ! -d $HA_STATE_DIR ]; then # TODO: Go looking fatal "Non-standard Heartbeat installation: Heartbeat state directory not found" fi debug "Heartbeat state files located in: $HA_STATE_DIR" CRM_CORE_DIRS="" for d in $HA_STATE_DIR/cores $local_state_dir/lib/corosync $local_state_dir/lib/openais; do if [ -d $d ]; then CRM_CORE_DIRS="$CRM_CORE_DIRS $d" fi done debug "Core files located under: $CRM_CORE_DIRS" for d in /usr/lib/heartbeat /usr/lib64/heartbeat; do if [ -f $d/crmd ]; then CRM_DAEMON_DIR=$d break fi done if [ ! -d $CRM_DAEMON_DIR ]; then for d in `find / -type d -name heartbeat`; do if [ -f $d/crmd ]; then CRM_DAEMON_DIR=$d break fi done fi if [ ! -d $CRM_DAEMON_DIR ]; then for f in `find / -type f -name crmd`; do if [ -f $f ]; then CRM_DAEMON_DIR=`basename $f` fi done fi if [ ! -d $CRM_DAEMON_DIR ]; then fatal "Non-standard Pacemaker installation: daemons not found" fi debug "Pacemaker daemons located under: $CRM_DAEMON_DIR" } time2str() { perl -e "use POSIX; print strftime('%x %X',localtime($1));" } get_time() { perl -e "\$time='$*';" -e ' eval "use Date::Parse"; if (!$@) { print str2time($time); } else { eval "use Date::Manip"; if (!$@) { print UnixDate(ParseDateString($time), "%s"); } } ' } get_time_() { warning "No time format specified for: $*" } get_time_syslog() { awk '{print $1,$2,$3}' } get_time_legacy() { awk '{print $2}' | sed 's/_/ /' } get_time_format() { t=0 l="" func="" trycnt=10 while [ $trycnt -gt 0 ] && read l; do t=$(get_time `echo $l | get_time_syslog`) if [ "$t" ]; then func="syslog" break fi t=$(get_time `echo $l | get_time_legacy`) if [ "$t" ]; then func="legacy" break fi trycnt=$(($trycnt-1)) done #debug "Logfile uses the $func time format" echo $func } linetime() { format=`get_time_format < $1` l=`tail -n +$2 $1 | grep ":[0-5][0-9]:" | head -1 | get_time_$format` get_time "$l" } # Find pattern in a logfile somewhere # Return $max ordered results by age (newest first) findmsg() { max=$1 pattern=$2 logfiles="" syslogdirs="/var/log /var/logs /var/syslog /var/adm /var/log/ha /var/log/cluster" for d in $syslogdirs; do if [ -d $d ]; then logfiles=`grep -l -e "$pattern" $d/*` && break fi done 2>/dev/null if [ "x$logfiles" != "x" ]; then list=`ls -t $logfiles | head -n $max | tr '\n' ' '` echo $list debug "Pattern \'$pattern\' found in: [ $list ]" else debug "Pattern \'$pattern\' not found anywhere" fi } node_events() { Epatt=`echo "$EVENT_PATTERNS" | while read title p; do [ -n "$p" ] && echo -n "|$p"; done | sed 's/.//' ` grep -E "$Epatt" $1 } pickfirst() { for x; do which $x >/dev/null 2>&1 && { echo $x return 0 } done return 1 } shrink() { src=$* target=$1.tar tar_options=cf variant=`pickfirst bzip2 gzip false` case $variant in bz*) tar_options="jcf" target="$target.bz2" ;; gz*) tar_options="zcf" target="$target.gz" ;; *) warning "Could not find a compression program, the resulting tarball may be huge" ;; esac tar $tar_options $target $src >/dev/null 2>&1 echo $target } findln_by_time() { local logf=$1 local tm=$2 local first=1 local last=`wc -l < $logf` while [ $first -le $last ]; do mid=$((($last+$first)/2)) trycnt=10 while [ $trycnt -gt 0 ]; do tmid=`linetime $logf $mid` [ "$tmid" ] && break warning "cannot extract time: $logf:$mid; will try the next one" trycnt=$(($trycnt-1)) # shift the whole first-last segment first=$(($first-1)) last=$(($last-1)) mid=$((($last+$first)/2)) done if [ -z "$tmid" ]; then warning "giving up on log..." return fi if [ $tmid -gt $tm ]; then last=$(($mid-1)) elif [ $tmid -lt $tm ]; then first=$(($mid+1)) else break fi done echo $mid } dumplog() { local logf=$1 local from_line=$2 local to_line=$3 [ "$from_line" ] || return tail -n +$from_line $logf | if [ "$to_line" ]; then head -$(($to_line-$from_line+1)) else cat fi } # # find log/set of logs which are interesting for us # # # find log slices # find_decompressor() { if echo $1 | grep -qs 'bz2$'; then echo "bzip2 -dc" elif echo $1 | grep -qs 'gz$'; then echo "gzip -dc" else echo "cat" fi } # # check if the log contains a piece of our segment # is_our_log() { local logf=$1 local from_time=$2 local to_time=$3 local cat=`find_decompressor $logf` local format=`$cat $logf | get_time_format` local first_time=$(get_time "`$cat $logf | head -1 | get_time_$format`") local last_time=$(get_time "`$cat $logf | tail -1 | get_time_$format`") if [ x = "x$first_time" -o x = "x$last_time" ]; then return 0 # skip (empty log?) fi if [ $from_time -gt $last_time ]; then # we shouldn't get here anyway if the logs are in order return 2 # we're past good logs; exit fi if [ $from_time -ge $first_time ]; then return 3 # this is the last good log fi # have to go further back if [ x = "x$to_time" -o $to_time -ge $first_time ]; then return 1 # include this log else return 0 # don't include this log fi } # # go through archived logs (timewise backwards) and see if there # are lines belonging to us # (we rely on untouched log files, i.e. that modify time # hasn't been changed) # arch_logs() { local logf=$1 local from_time=$2 local to_time=$3 # look for files such as: ha-log-20090308 or # ha-log-20090308.gz (.bz2) or ha-log.0, etc ls -t $logf $logf*[0-9z] 2>/dev/null | while read next_log; do is_our_log $next_log $from_time $to_time case $? in 0) ;; # noop, continue 1) echo $next_log # include log and continue debug "Found log $next_log" ;; 2) break;; # don't go through older logs! 3) echo $next_log # include log and continue debug "Found log $next_log" break ;; # don't go through older logs! esac done } # # print part of the log # drop_tmp_file() { [ -z "$tmp" ] || rm -f "$tmp" } print_logseg() { local logf=$1 local from_time=$2 local to_time=$3 # uncompress to a temp file (if necessary) local cat=`find_decompressor $logf` if [ "$cat" != "cat" ]; then tmp=`mktemp` $cat $logf > $tmp trap drop_tmp_file 0 sourcef=$tmp else sourcef=$logf tmp="" fi if [ "$from_time" = 0 ]; then FROM_LINE=1 else FROM_LINE=`findln_by_time $sourcef $from_time` fi if [ -z "$FROM_LINE" ]; then warning "couldn't find line for time $from_time; corrupt log file?" return fi TO_LINE="" if [ "$to_time" != 0 ]; then TO_LINE=`findln_by_time $sourcef $to_time` if [ -z "$TO_LINE" ]; then warning "couldn't find line for time $to_time; corrupt log file?" return fi fi - dumplog $sourcef $FROM_LINE $TO_LINE - log "Including segment [$FROM_LINE-$TO_LINE] from $logf" + if [ $FROM_LINE -lt $TO_LINE ]; then + dumplog $sourcef $FROM_LINE $TO_LINE + log "Including segment [$FROM_LINE-$TO_LINE] from $logf" + else + log "Segment from $logf finished before it started, line: $FROM_LINE to $TO_LINE" + fi drop_tmp_file trap "" 0 } # # find log/set of logs which are interesting for us # dumplogset() { local logf=$1 local from_time=$2 local to_time=$3 local logf_set=`arch_logs $logf $from_time $to_time` if [ x = "x$logf_set" ]; then return fi local num_logs=`echo "$logf_set" | wc -l` local oldest=`echo $logf_set | awk '{print $NF}'` local newest=`echo $logf_set | awk '{print $1}'` local mid_logfiles=`echo $logf_set | awk '{for(i=NF-1; i>1; i--) print $i}'` # the first logfile: from $from_time to $to_time (or end) # logfiles in the middle: all # the last logfile: from beginning to $to_time (or end) case $num_logs in 1) print_logseg $newest $from_time $to_time;; *) print_logseg $oldest $from_time 0 for f in $mid_logfiles; do `find_decompressor $f` $f debug "including complete $f logfile" done print_logseg $newest 0 $to_time ;; esac } # cut out a stanza getstanza() { awk -v name="$1" ' !in_stanza && NF==2 && /^[a-z][a-z]*[[:space:]]*{/ { # stanza start if ($1 == name) in_stanza = 1 } in_stanza { print } in_stanza && NF==1 && $1 == "}" { exit } ' } # supply stanza in $1 and variable name in $2 # (stanza is optional) getcfvar() { cf_type=$1; shift; cf_var=$1; shift; cf_file=$* [ -f "$cf_file" ] || return case $cf_type in corosync|openais) sed 's/#.*//' < $cf_file | if [ $# -eq 2 ]; then getstanza "$cf_var" shift 1 else cat fi | awk -v varname="$cf_var" ' NF==2 && match($1,varname":$")==1 { print $2; exit; } ' ;; heartbeat) sed 's/#.*//' < $cf_file | grep -w "^$cf_var" | sed 's/^[^[:space:]]*[[:space:]]*//' ;; logd) sed 's/#.*//' < $cf_file | grep -w "^$cf_var" | sed 's/^[^[:space:]]*[[:space:]]*//' ;; esac } # # figure out the cluster type, depending on the process list # and existence of configuration files # get_cluster_type() { if ps -ef | egrep -qs '[c]orosync' then stack="corosync" elif ps -ef | egrep -qs '[a]isexec' then stack="openais" elif - ps -ef | egrep -qs '[h]eartbeat' + ps -ef | grep -v -e grep -e "eartbeat/[clasp]" | egrep -qs '[h]eartbeat' then stack="heartbeat" # Now we're guessing... # TODO: Technically these could be anywhere :-/ elif [ -f /etc/corosync/corosync.conf ]; then stack="corosync" elif [ -f /etc/ais/openais.conf ]; then stack="openais" else stack="heartbeat" fi - debug "Detected the $stack cluster stack" + debug "Detected the '$stack' cluster stack" echo $stack } find_cluster_cf() { case $1 in corosync) best_size=0 best_file="" # TODO: Technically these could be anywhere :-/ for cf in /etc/ais/openais.conf /etc/corosync/corosync.conf; do if [ -f $cf ]; then size=`wc -l $cf | awk '{print $1}'` if [ $size -gt $best_size ]; then best_size=$size best_file=$cf fi fi done echo "$best_file" ;; openais) # TODO: Technically it could be anywhere :-/ cf="/etc/ais/openais.conf" if [ -f $cf ]; then echo "$cf" fi ;; heartbeat) cf="/etc/ha.d/ha.cf" if [ -f $cf ]; then echo "$cf" fi ;; *) warning "Unknown cluster type: $1" ;; esac } # # check for the major prereq for a) parameter parsing and b) # parsing logs # t=`get_time "12:00"` if [ "$t" = "" ]; then fatal "please install the perl Date::Parse module" fi detect_host