diff --git a/tools/crm_report.in b/tools/crm_report.in
index 52e2713bec..c2a1b5e585 100755
--- a/tools/crm_report.in
+++ b/tools/crm_report.in
@@ -1,481 +1,513 @@
 #!/bin/sh
 # Copyright (C) 2010 Andrew Beekhof
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 #
 # This software is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 #

 # Note the quotes around `$TEMP': they are essential!
 TEMP=`getopt \
     -o hv?xl:f:t:n:T:L:p:c:dSACHu:D:MVse: \
     --long help,cts:,cts-log:,dest:,node:,nodes:,from:,to:,sos-mode,logfile:,as-directory,single-node,cluster:,user:,max-depth:,version,features,rsh: \
     -n 'crm_report' -- "$@"`
 eval set -- "$TEMP"

+progname=$(basename "$0")
 rsh="ssh -T"
 times=""
 tests=""
 nodes=""
 compress=1
 cluster="any"
 ssh_user="root"
 search_logs=1
 report_data=`dirname $0`
+maxdepth=5
 extra_logs=""
 sanitize_patterns="passw.*"
 log_patterns="CRIT: ERROR:"

 usage() {
-    cat<<EOF
 > "$l_base/$HALOG_F"
     fi

     for node in $nodes; do
-	cat<<EOF >$l_base/.env
+	cat <<EOF >$l_base/.env
 LABEL="$label"
 REPORT_HOME="$r_base"
 REPORT_MASTER="$host"
 REPORT_TARGET="$node"
 LOG_START=$start
 LOG_END=$end
 REMOVE=1
 SANITIZE="$sanitize_patterns"
 CLUSTER=$cluster
 LOG_PATTERNS="$log_patterns"
 EXTRA_LOGS="$extra_logs"
 SEARCH_LOGS=$search_logs
 verbose=$verbose
 maxdepth=$maxdepth
 EOF

 	if [ $host = $node ]; then
-	    cat<<EOF >>$l_base/.env
+	    cat <<EOF >>$l_base/.env
 REPORT_HOME="$l_base"
 EOF
 	    cat $l_base/.env $report_data/report.common $report_data/report.collector > $l_base/collector
 	    bash $l_base/collector
 	else
 	    cat $l_base/.env $report_data/report.common $report_data/report.collector \
 		| $rsh -l $ssh_user $node -- "mkdir -p $r_base; cat > $r_base/collector; bash $r_base/collector" | (cd $l_base && tar mxf -)
 	fi
     done

     analyze $l_base > $l_base/$ANALYSIS_F
     if [ -f $l_base/$HALOG_F ]; then
 	node_events $l_base/$HALOG_F > $l_base/$EVENTS_F
     fi

     for node in $nodes; do
 	cat $l_base/$node/$ANALYSIS_F >> $l_base/$ANALYSIS_F
 	if [ -s $l_base/$node/$EVENTS_F ]; then
 	    cat $l_base/$node/$EVENTS_F >> $l_base/$EVENTS_F
 	elif [ -s $l_base/$HALOG_F ]; then
 	    awk "\$4==\"$nodes\"" $l_base/$EVENTS_F >> $l_base/$n/$EVENTS_F
 	fi
     done

     log " "
     if [ $compress = 1 ]; then
 	fname=`shrink $l_base`
 	rm -rf $l_base
 	log "Collected results are available in $fname"
 	log " "
 	log "Please create a bug entry at"
 	log "    http://developerbugs.linux-foundation.org/enter_bug.cgi?product=Pacemaker"
 	log "Include a description of your problem and attach this tarball"
 	log " "
 	log "Thank you for taking time to create this report."
else log "Collected results are available in $l_base" fi log " " } # # check if files have same content in the cluster # cibdiff() { d1=`dirname $1` d2=`dirname $2` if [ -f $d1/RUNNING -a -f $d2/RUNNING ] || [ -f $d1/STOPPED -a -f $d2/STOPPED ]; then if which crm_diff > /dev/null 2>&1; then crm_diff -c -n $1 -o $2 else info "crm_diff(8) not found, cannot diff CIBs" fi else echo "can't compare cibs from running and stopped systems" fi } diffcheck() { [ -f "$1" ] || { echo "$1 does not exist" return 1 } [ -f "$2" ] || { echo "$2 does not exist" return 1 } case `basename $1` in $CIB_F) cibdiff $1 $2;; $B_CONF) diff -u $1 $2;; # confdiff? *) diff -u $1 $2;; esac } # # remove duplicates if files are same, make links instead # consolidate() { for n in $NODES; do if [ -f $1/$2 ]; then rm $1/$n/$2 else mv $1/$n/$2 $1 fi ln -s ../$2 $1/$n done } analyze_one() { rc=0 node0="" for n in $NODES; do if [ "$node0" ]; then diffcheck $1/$node0/$2 $1/$n/$2 rc=$(($rc+$?)) else node0=$n fi done return $rc } analyze() { flist="$HOSTCACHE $MEMBERSHIP_F $CIB_F $CRM_MON_F $B_CONF logd.cf $SYSINFO_F" for f in $flist; do printf "Diff $f... " ls $1/*/$f >/dev/null 2>&1 || { echo "no $1/*/$f :/" continue } if analyze_one $1 $f; then echo "OK" [ "$f" != $CIB_F ] && consolidate $1 $f else echo "" fi done } do_cts() { test_sets=`echo $tests | tr ',' ' '` for test_set in $test_sets; do start_time=0 start_test=`echo $test_set | tr '-' ' ' | awk '{print $1}'` end_time=0 end_test=`echo $test_set | tr '-' ' ' | awk '{print $2}'` if [ x$end_test = x ]; then msg="Extracting test $start_test" label="CTS-$start_test-`date +"%b-%d-%Y"`" end_test=`expr $start_test + 1` else msg="Extracting tests $start_test to $end_test" label="CTS-$start_test-$end_test-`date +"%b-%d-%Y"`" end_test=`expr $end_test + 1` fi if [ $start_test = 0 ]; then start_pat="BEGINNING [0-9].* TESTS" else start_pat="Running test.*\[ *$start_test\]" fi if [ x$ctslog = x ]; then ctslog=`findmsg 1 "$start_pat"` if [ x$ctslog = x ]; then fatal "No CTS control file detected" else log "Using CTS control file: $ctslog" fi fi line=`grep -n "$start_pat" $ctslog | tail -1 | sed 's/:.*//'` if [ ! -z "$line" ]; then start_time=`linetime $ctslog $line` fi line=`grep -n "Running test.*\[ *$end_test\]" $ctslog | tail -1 | sed 's/:.*//'` if [ ! -z "$line" ]; then end_time=`linetime $ctslog $line` fi if [ -z "$nodes" ]; then nodes=`grep CTS: $ctslog | grep -v debug: | grep " \* " | sed s:.*\\\*::g | sort -u | tr '\\n' ' '` log "Calculated node list: $nodes" fi if [ $end_time -lt $start_time ]; then debug "Test didn't complete, grabbing everything up to now" end_time=`date +%s` fi if [ $start_time != 0 ];then log "$msg (`time2str $start_time` to `time2str $end_time`)" collect_data $label $start_time $end_time $ctslog else fatal "$msg failed: not found" fi done } -getnodes() { - if [ -z $1 ]; then - cluster=`get_cluster_type` - elif [ $1 = any ]; then - cluster=`get_cluster_type` - else - cluster=$1 - fi +node_names_from_xml() { + awk ' + /uname/ { + for( i=1; i<=NF; i++ ) + if( $i~/^uname=/ ) { + sub("uname=.","",$i); + sub("\".*","",$i); + print $i; + next; + } + } + ' | tr '\n' ' ' +} - if [ -z $HA_STATE_DIR ]; then - HA_STATE_DIR=/var/lib/heartbeat +getnodes() { + cluster="$1" + + # 1. Live (cluster nodes or Pacemaker Remote nodes) + # TODO: This will not detect Pacemaker Remote nodes unless they + # have ever had a permanent node attribute set, because it only + # searches the nodes section. 
-    if [ -z $HA_STATE_DIR ]; then
-	HA_STATE_DIR=/var/lib/heartbeat
+getnodes() {
+    cluster="$1"
+
+    # 1. Live (cluster nodes or Pacemaker Remote nodes)
+    # TODO: This will not detect Pacemaker Remote nodes unless they
+    # have ever had a permanent node attribute set, because it only
+    # searches the nodes section. It should also search the config
+    # for resources that create Pacemaker Remote nodes.
+    cib_nodes=$(cibadmin -Ql -o nodes 2>/dev/null)
+    if [ $? -eq 0 ]; then
+        debug "Querying CIB for nodes"
+        echo "$cib_nodes" | node_names_from_xml
+        return
     fi
-    cluster_cf=`find_cluster_cf $cluster`
-    # 1. Live
-    if
-	ps -ef | egrep -qs "[c]ib"
-    then
-	debug "Querying CIB for nodes"
-	cibadmin -Ql -o nodes | awk '
-	/uname/ {
-            for( i=1; i<=NF; i++ )
-                if( $i~/^uname=/ ) {
-		    sub("uname=.","",$i);
-		    sub("\".*","",$i);
-		    print $i;
-		    next;
-		}
-	}
-	' | tr '\n' ' '

     # 2. Saved
-    elif [ -f @CRM_CONFIG_DIR@/cib.xml ]; then
+    if [ -f "@CRM_CONFIG_DIR@/cib.xml" ]; then
         debug "Querying on-disk CIB for nodes"
-	grep "node " @CRM_CONFIG_DIR@/cib.xml | awk '
-	/uname/ {
-            for( i=1; i<=NF; i++ )
-                if( $i~/^uname=/ ) {
-		    sub("uname=.","",$i);
-		    sub("\".*","",$i);
-		    print $i;
-		    next;
-		}
-	}
-	' | tr '\n' ' '
+        grep "node " "@CRM_CONFIG_DIR@/cib.xml" | node_names_from_xml
+        return
+    fi

     # 3. hostcache
-    elif [ -f $HA_STATE_DIR/hostcache ]; then
+    if [ -z "$HA_STATE_DIR" ]; then
+        HA_STATE_DIR=/var/lib/heartbeat
+    fi
+    if [ -f "$HA_STATE_DIR/hostcache" ]; then
         debug "Reading nodes from $HA_STATE_DIR/hostcache"
-	awk '{print $1}' $HA_STATE_DIR/hostcache
+        awk '{print $1}' "$HA_STATE_DIR/hostcache"
+        return
+    fi

     # 4. ha.cf
-    elif [ "x$cluster" = "xheartbeat" ]; then
+    if [ "x$cluster" = "xheartbeat" ]; then
+        cluster_cf=$(find_cluster_cf $cluster)
         debug "Reading nodes from $cluster_cf"
-	getcfvar $cluster node $cluster_cf
+        getcfvar $cluster node "$cluster_cf"
+        return
+    fi

     # 5. logs
-    else
-	# Look in the logs...
-	logfile=`findmsg 1 "crm_update_peer"`
-	debug "Reading nodes from $logfile"
-	if [ ! -z "$logfile" ]; then
-	    grep crm_update_peer: $logfile | sed s/.*crm_update_peer// | sed s/://g | awk '{print $2}' | grep -v "(null)" | sort -u | tr '\n' ' '
-	fi
+    # TODO: This has multiple issues:
+    # * It looks for messages from crm_update_peer(), which is used only by
+    #   heartbeat and legacy plugin clusters; it should work with CMAN and
+    #   corosync2 clusters as well.
+    # * It does a findmsg for "crm_update_peer" (which will hit
+    #   "crm_update_peer_proc" etc.), but then greps for "crm_update_peer:".
+    # * It always uses grep, even though $logfile might be compressed.
+    #   For this reason and for efficiency, it would be nice if findmsg could
+    #   optionally print the matches instead of the file names.
+    # * It would be nice to skip this step for Pacemaker Remote nodes, since
+    #   their logs will not have node names, but it is nontrivial to know that.
+    #   Cluster nodes generally won't get here, but stopped Pacemaker Remote
+    #   nodes will.
+    logfile=$(findmsg 1 "crm_update_peer")
+    debug "Looking for nodes in $logfile"
+    if [ ! -z "$logfile" ]; then
+        grep crm_update_peer: "$logfile" \
+            | sed s/.*crm_update_peer// \
+            | sed s/://g \
+            | awk '{print $2}' \
+            | grep -v "(null)" \
+            | sort -u \
+            | tr '\n' ' '
     fi
 }
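As a worked example of fallback 5, assuming a plugin-era log line such as the hypothetical one below, the pipeline strips everything through the function name, deletes colons, and keeps the second remaining field:

    # input:  Jan  1 00:00:00 node-a crmd: [1234]: info: crm_update_peer: Node node-b: id=2 state=member
    # after the two seds:  " Node node-b id=2 state=member"
    # awk '{print $2}'  -> node-b
    # sort -u | tr '\n' ' '  -> "node-b "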
log "" log "IT IS YOUR RESPONSIBILITY TO PROTECT SENSITIVE DATA FROM EXPOSURE" log "" fi - if [ -z "$nodes" ]; then - nodes=`getnodes $cluster` - log "Calculated node list: $nodes" + # If user didn't specify a cluster stack, make a best guess if possible. + if [ -z "$cluster" ] || [ "$cluster" = "any" ]; then + cluster=$(get_cluster_type) fi + # If user didn't specify node(s), make a best guess if possible. if [ -z "$nodes" ]; then - fatal "Cannot determine node list, please specify manually with --nodes" + nodes=`getnodes $cluster` + if [ -n "$nodes" ]; then + log "Calculated node list: $nodes" + else + fatal "Cannot determine nodes; specify --nodes or --single-node" + fi fi if echo $nodes | grep -qs $host then debug "We are a cluster node" else debug "We are a log master" masterlog=`findmsg 1 "crmd\\|CTS"` fi if [ -z $end_time ]; then end_time=`perl -e 'print time()'` fi label="pcmk-`date +"%a-%d-%b-%Y"`" log "Collecting data from $nodes (`time2str $start_time` to `time2str $end_time`)" collect_data $label $start_time $end_time $masterlog else fatal "Not sure what to do, no tests or time ranges to extract" fi + +# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80: diff --git a/tools/report.collector b/tools/report.collector index ecd1546b34..8386f6bdac 100644 --- a/tools/report.collector +++ b/tools/report.collector @@ -1,794 +1,821 @@ # Copyright (C) 2007 Dejan Muhamedagic # Almost everything as part of hb_report # Copyright (C) 2010 Andrew Beekhof # Cleanups, refactoring, extensions # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public # License as published by the Free Software Foundation; either # version 2.1 of the License, or (at your option) any later version. # # This software is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public License for more details. # # You should have received a copy of the GNU General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # if echo $REPORT_HOME | grep -qs '^/' then debug "Using full path to working directory: $REPORT_HOME" else REPORT_HOME="$HOME/$REPORT_HOME" debug "Canonicalizing working directory path: $REPORT_HOME" fi detect_host findlogdcf() { for f in \ `test -x $CRM_DAEMON_DIR/ha_logd && which strings > /dev/null 2>&1 && strings $CRM_DAEMON_DIR/ha_logd | grep 'logd\.cf'` \ `for d; do echo $d/logd.cf $d/ha_logd.cf; done` do if [ -f "$f" ]; then echo $f debug "Located logd.cf at: $f" return 0 fi done debug "Could not determine logd.cf location" return 1 } # # find files newer than a and older than b # isnumber() { echo "$*" | grep -qs '^[0-9][0-9]*$' } touchfile() { t=`mktemp` && perl -e "\$file=\"$t\"; \$tm=$1;" -e 'utime $tm, $tm, $file;' && echo $t } find_files_clean() { [ -z "$from_stamp" ] || rm -f "$from_stamp" [ -z "$to_stamp" ] || rm -f "$to_stamp" from_stamp="" to_stamp="" } find_files() { dirs= from_time=$2 to_time=$3 for d in $1; do if [ -d $d ]; then dirs="$dirs $d" fi done if [ x"$dirs" = x ]; then return fi isnumber "$from_time" && [ "$from_time" -gt 0 ] || { warning "sorry, can't find files in [ $1 ] based on time if you don't supply time" return } trap find_files_clean 0 if ! 
     if ! from_stamp=`touchfile $from_time`; then
 	warning "sorry, can't create temporary file for find_files"
 	return
     fi
     findexp="-newer $from_stamp"
     if isnumber "$to_time" && [ "$to_time" -gt 0 ]; then
 	if ! to_stamp=`touchfile $to_time`; then
 	    warning "sorry, can't create temporary file for find_files"
 	    find_files_clean
 	    return
 	fi
 	findexp="$findexp ! -newer $to_stamp"
     fi
     find $dirs -type f $findexp
     find_files_clean
     trap "" 0
 }

 #
 # check permissions of files/dirs
 #
 pl_checkperms() {
     perl -e '
 	# check permissions and ownership
 	# uid and gid are numeric
 	# everything must match exactly
 	# no error checking! (file should exist, etc)
 	($filename, $perms, $in_uid, $in_gid) = @ARGV;
 	($mode,$uid,$gid) = (stat($filename))[2,4,5];
 	$p=sprintf("%04o", $mode & 07777);
 	$p ne $perms and exit(1);
 	$uid ne $in_uid and exit(1);
 	$gid ne $in_gid and exit(1);
     ' $*
 }

 num_id() {
     getent $1 $2 | awk -F: '{print $3}'
 }

 chk_id() {
     [ "$2" ] && return 0
     echo "$1: id not found"
     return 1
 }

 check_perms() {
     while read type f p uid gid; do
-	[ -$type $f ] || {
-	    echo "$f wrong type or doesn't exist"
-	    continue
-	}
+        if [ ! -e "$f" ]; then
+            echo "$f doesn't exist"
+            continue
+        elif [ ! -$type "$f" ]; then
+            echo "$f has wrong type"
+            continue
+        fi
 	n_uid=`num_id passwd $uid`
 	chk_id "$uid" "$n_uid" || continue
 	n_gid=`num_id group $gid`
 	chk_id "$gid" "$n_gid" || continue
 	pl_checkperms $f $p $n_uid $n_gid || {
 	    echo "wrong permissions or ownership for $f:"
 	    ls -ld $f
 	}
     done
 }

 #
 # coredumps
 #
 findbinary() {
     random_binary=`which cat 2>/dev/null` # suppose we are lucky
     binary=`gdb $random_binary $1 < /dev/null 2>/dev/null |
 	grep 'Core was generated' | awk '{print $5}' |
 	sed "s/^.//;s/[.':]*$//"`
     if [ x = x"$binary" ]; then
 	debug "Could not detect the program name for core $1 from the gdb output; will try with file(1)"
 	binary=$(file $1 | awk '/from/{
 	    for( i=1; i<=NF; i++ )
 		if( $i == "from" ) {
 		    print $(i+1)
 		    break
 		}
 	    }')
 	binary=`echo $binary | tr -d "'"`
 	binary=$(echo $binary | tr -d '`')
 	if [ "$binary" ]; then
 	    binary=`which $binary 2>/dev/null`
 	fi
     fi
     if [ x = x"$binary" ]; then
 	warning "Could not find the program path for core $1"
 	return
     fi
     fullpath=`which $binary 2>/dev/null`
     if [ x = x"$fullpath" ]; then
 	if [ -x $CRM_DAEMON_DIR/$binary ]; then
 	    echo $CRM_DAEMON_DIR/$binary
 	    debug "Found the program at $CRM_DAEMON_DIR/$binary for core $1"
 	else
 	    warning "Could not find the program path for core $1"
 	fi
     else
 	echo $fullpath
 	debug "Found the program at $fullpath for core $1"
     fi
 }
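check_perms() expects one specification per line on stdin: a file-test letter (as used by [ -d ], [ -f ], ...), then path, octal mode, owner, and group; essential_files() supplies that list via its here-document. A minimal sketch of a direct call (the paths and modes below are illustrative):

    check_perms <<EOF
    d /var/lib/pacemaker 0750 hacluster haclient
    f /etc/corosync/corosync.conf 0644 root root
    EOF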
 getbt() {
     which gdb > /dev/null 2>&1 || {
 	warning "Please install gdb to get backtraces"
 	return
     }
     for corefile; do
 	absbinpath=`findbinary $corefile`
 	[ x = x"$absbinpath" ] && continue
 	echo "====================== start backtrace ======================"
 	ls -l $corefile
 	# Summary first...
 	gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt"} -ex quit \
 	    $absbinpath $corefile 2>/dev/null
 	echo "====================== start detail ======================"
 	# Now the unreadable details...
 	gdb -batch -n -quiet -ex ${BT_OPTS:-"thread apply all bt full"} -ex quit \
 	    $absbinpath $corefile 2>/dev/null
 	echo "======================= end backtrace ======================="
     done
 }

+dump_status_and_config() {
+    crm_mon -1 2>&1 | grep -v '^Last upd' > $target/$CRM_MON_F
+    cibadmin -Ql 2>/dev/null > $target/${CIB_F}.live
+}
+
 getconfig() {
     cluster=$1; shift;
     target=$1; shift;

     for cf in $*; do
 	if [ -e "$cf" ]; then
 	    cp -a "$cf" $target/
 	fi
     done

-    crm_uuid -r > $target/$HB_UUID_F 2>&1
+    if which crm_uuid >/dev/null 2>&1; then
+        crm_uuid -r > $target/$HB_UUID_F 2>&1
+    fi

-    if
-	ps -ef | egrep -qs [c]rmd
-    then
-	crm_mon -1 2>&1 | grep -v '^Last upd' > $target/$CRM_MON_F
-	cibadmin -Ql 2>/dev/null > $target/${CIB_F}.live
+    if is_running crmd; then
+        dump_status_and_config
 	case $cluster in
 	    cman)      crm_node -p --cman > $target/$MEMBERSHIP_F 2>&1;;
 	    corosync|openais) crm_node -p --openais > $target/$MEMBERSHIP_F 2>&1;;
 	    heartbeat) crm_node -p --heartbeat > $target/$MEMBERSHIP_F 2>&1;;
 	    *)         crm_node -p > $target/$MEMBERSHIP_F 2>&1;;
 	esac
 	echo "$host" > $target/RUNNING
+
+    elif is_running pacemaker_remoted; then
+        dump_status_and_config
+        echo "$host" > $target/RUNNING
+
     else
 	echo "$host" > $target/STOPPED
     fi

     if [ -f "$target/$CIB_F" ]; then
 	crm_verify -V -x $target/$CIB_F >$target/$CRM_VERIFY_F 2>&1
-	CIB_file=$target/$CIB_F crm configure show >$target/$CIB_TXT_F 2>&1
+        if which crm >/dev/null 2>&1 ; then
+            CIB_file=$target/$CIB_F crm configure show >$target/$CIB_TXT_F 2>&1
+        elif which pcs >/dev/null 2>&1 ; then
+            pcs config -f $target/$CIB_F >$target/$CIB_TXT_F 2>&1
+        fi
     fi
 }

 #
 # remove values of sensitive attributes
 #
 # this is not proper xml parsing, but it will work under the
 # circumstances
 sanitize_xml_attrs() {
     sed $(
 	for patt in $SANITIZE; do
 	    echo "-e /name=\"$patt\"/s/value=\"[^\"]*\"/value=\"****\"/"
 	done
     )
 }

 sanitize_hacf() {
     awk '
 	$1=="stonith_host"{ for( i=5; i<=NF; i++ ) $i="****"; }
 	{print}
     '
 }
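For illustration, with SANITIZE left at the default pattern passw.*, the sed program generated by sanitize_xml_attrs() masks the value of any matching nvpair:

    $ SANITIZE="passw.*"
    $ echo '<nvpair id="x" name="password" value="hunter2"/>' | sanitize_xml_attrs
    <nvpair id="x" name="password" value="****"/>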
 sanitize_one_clean() {
     [ -z "$tmp" ] || rm -f "$tmp"
     tmp=""
     [ -z "$ref" ] || rm -f "$ref"
     ref=""
 }

 sanitize() {
     file=$1
     compress=""
     if [ -z "$SANITIZE" ]; then
 	return
     fi
     echo $file | grep -qs 'gz$' && compress=gzip
     echo $file | grep -qs 'bz2$' && compress=bzip2
     if [ "$compress" ]; then
 	decompress="$compress -dc"
     else
 	compress=cat
 	decompress=cat
     fi
     trap sanitize_one_clean 0
     tmp=`mktemp`
     ref=`mktemp`
     if [ -z "$tmp" -o -z "$ref" ]; then
 	sanitize_one_clean
 	fatal "cannot create temporary files"
     fi
     touch -r $file $ref  # save the mtime
     if [ "`basename $file`" = ha.cf ]; then
 	sanitize_hacf
     else
 	$decompress | sanitize_xml_attrs | $compress
     fi < $file > $tmp
     mv $tmp $file
     # note: cleaning $tmp up is still needed even after it's renamed
     # because its temp directory is still there.
     touch -r $ref $file
     sanitize_one_clean
     trap "" 0
 }

 #
 # get some system info
 #
 distro() {
     if
 	which lsb_release >/dev/null 2>&1
     then
-	lsb_release -d
+        lsb_release -d | sed -e 's/^Description:\s*//'
 	debug "Using lsb_release for distribution info"
 	return
     fi

     relf=`ls /etc/debian_version 2>/dev/null` ||
     relf=`ls /etc/slackware-version 2>/dev/null` ||
     relf=`ls -d /etc/*-release 2>/dev/null` && {
 	for f in $relf; do
 	    test -f $f && {
 		echo "`ls $f` `cat $f`"
 		debug "Found `echo $relf | tr '\n' ' '` distribution release file(s)"
 		return
 	    }
 	done
     }
     warning "No lsb_release, no /etc/*-release, no /etc/debian_version: no distro information"
 }

 pkg_ver() {
     if which dpkg >/dev/null 2>&1 ; then
 	pkg_mgr="deb"
     elif which rpm >/dev/null 2>&1 ; then
 	pkg_mgr="rpm"
     elif which pkg_info >/dev/null 2>&1 ; then
 	pkg_mgr="pkg_info"
     elif which pkginfo >/dev/null 2>&1 ; then
 	pkg_mgr="pkginfo"
     else
 	warning "Unknown package manager"
 	return
     fi
     debug "The package manager is: $pkg_mgr"
     echo "The package manager is: $pkg_mgr"

-    # for Linux .deb based systems
+    echo "Installed packages:"
     case $pkg_mgr in
 	deb)
 	    dpkg-query -f '${Package} ${Version} ${Architecture}\n' -W | sort
+            echo
 	    for pkg in $*; do
 		if dpkg-query -W $pkg 2>/dev/null ; then
 		    debug "Verifying installation of: $pkg"
 		    echo "Verifying installation of: $pkg"
 		    debsums -s $pkg 2>/dev/null
 		fi
 	    done
 	    ;;
 	rpm)
 	    rpm -qa --qf '%{name} %{version}-%{release} - %{distribution} %{arch}\n' | sort
+            echo
 	    for pkg in $*; do
 		if rpm -q $pkg >/dev/null 2>&1 ; then
 		    debug "Verifying installation of: $pkg"
 		    echo "Verifying installation of: $pkg"
 		    rpm --verify $pkg 2>&1
 		fi
 	    done
 	    ;;
 	pkg_info)
 	    pkg_info
 	    ;;
 	pkginfo)
 	    pkginfo | awk '{print $3}'  # format?
 	    ;;
     esac
 }

 getbacktraces() {
     debug "Looking for backtraces: $*"
     flist=$(
 	for f in `find_files "$CRM_CORE_DIRS" $1 $2`; do
 	    bf=`basename $f`
 	    test `expr match $bf core` -gt 0 &&
 	    echo $f
 	done)
     if [ "$flist" ]; then
 	for core in $flist; do
 	    log "Found core file: `ls -al $core`"
 	done

 	# Make a copy of them in case we need more data later
 	# Luckily they compress well
-	mkdir cores &> /dev/null
+	mkdir cores >/dev/null 2>&1
 	cp -a $flist cores/
 	shrink cores
 	rm -rf cores

 	# Now get as much as we can from them automagically
 	for f in $flist; do
 	    getbt $f
 	done
     fi
 }

 getpeinputs() {
-    flist=$(
-	find_files $PE_STATE_DIR $1 $2 | sed "s,`dirname $PE_STATE_DIR`/,,g"
-    )
-    if [ "$flist" ]; then
-	(cd `dirname $PE_STATE_DIR` && tar cf - $flist) | (cd $3 && tar xf -)
-	debug "found `echo $flist | wc -w` pengine input files in $PE_STATE_DIR"
+    if [ -n "$PE_STATE_DIR" ]; then
+        flist=$(
+            find_files "$PE_STATE_DIR" "$1" "$2" | sed "s,`dirname $PE_STATE_DIR`/,,g"
+        )
+        if [ "$flist" ]; then
+            (cd $(dirname "$PE_STATE_DIR") && tar cf - $flist) | (cd "$3" && tar xf -)
+            debug "found `echo $flist | wc -w` pengine input files in $PE_STATE_DIR"
+        fi
     fi
 }

 getblackboxes() {
     flist=$(
 	find_files $BLACKBOX_DIR $1 $2
     )

     for bb in $flist; do
 	bb_short=`basename $bb`
-	qb-blackbox $bb &> $3/${bb_short}.blackbox
+	qb-blackbox $bb > $3/${bb_short}.blackbox 2>&1
 	info "Extracting contents of blackbox: $bb_short"
     done
 }

 #
 # some basic system info and stats
 #
 sys_info() {
     cluster=$1; shift
     echo "Platform: `uname`"
     echo "Kernel release: `uname -r`"
     echo "Architecture: `uname -m`"
     if [ `uname` = Linux ]; then
 	echo "Distribution: `distro`"
     fi

-    cibadmin --version 2>&1
+    echo
+    cibadmin --version 2>&1 | head -1
     cibadmin -! 2>&1
-    case $1 in
+    case $cluster in
 	openais)
-	    : echo "openais version: how?"
+	    echo openais # version: how?
 	    ;;
 	cman)
-	    cman_tool -V
-	    /usr/sbin/corosync -v 2>&1
+	    cman_tool -V 2>&1 | head -1
+	    /usr/sbin/corosync -v 2>&1 | head -1
 	    ;;
 	corosync)
-	    /usr/sbin/corosync -v 2>&1
+	    /usr/sbin/corosync -v 2>&1 | head -1
 	    ;;
 	heartbeat)
-	    heartbeat version: `$CRM_DAEMON_DIR/heartbeat -V` 2>&1
+	    echo heartbeat $($CRM_DAEMON_DIR/heartbeat -V 2>&1)
 	    ;;
     esac

     # Cluster glue version hash (if available)
     stonith -V 2>/dev/null

     # Resource agents version hash
     echo "resource-agents: `grep 'Build version:' /usr/lib/ocf/resource.d/heartbeat/.ocf-shellfuncs`"

+    echo
     pkg_ver $*
 }

 sys_stats() {
     set -x
     uname -n
     uptime
     ps axf
     ps auxw
     top -b -n 1
     ifconfig -a
     ip addr list
     netstat -i
     arp -an
     test -d /proc && {
 	cat /proc/cpuinfo
     }
     lsscsi
     lspci
     mount
     df
     set +x
 }

 dlm_dump() {
     if which dlm_tool >/dev/null 2>&1 ; then
-	if
-	    ps -ef | egrep -qs '[d]lm_controld'
-	then
+        if is_running dlm_controld; then
 	    echo "--- Lockspace overview:"
 	    dlm_tool ls -n

 	    echo "---Lockspace history:"
 	    dlm_tool dump

 	    echo "---Lockspace status:"
 	    dlm_tool status
 	    dlm_tool status -v

 	    echo "---Lockspace config:"
 	    dlm_tool dump_config

 	    dlm_tool log_plock

 	    dlm_tool ls | grep name |
 	    while read X N ; do
 		echo "--- Lockspace $N:"
 		dlm_tool lockdump "$N"
 		dlm_tool lockdebug -svw "$N"
 	    done
 	fi
     fi
 }

 iscfvarset() {
     test "`getcfvar $1 $2`"
 }

 iscfvartrue() {
     getcfvar $1 $2 $3 | egrep -qsi "^(true|y|yes|on|1)"
 }

 uselogd() {
     cf_file=$2
     case $1 in
 	heartbeat)
 	    iscfvartrue $1 use_logd $cf_file && return 0  # if use_logd true
 	    iscfvarset $1 logfacility $cf_file ||
 	    iscfvarset $1 logfile $cf_file ||
 	    iscfvarset $1 debugfile $cf_file ||
 	    return 0  # or none of the log options set
 	    false
 	    ;;
 	*)
 	    iscfvartrue $1 use_logd $cf_file
 	    ;;
     esac
 }
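uselogd() takes the cluster type in $1 and a configuration file in $2; as the TODO in get_logfiles() below notes, the bare `uselogd` call there passes neither. One plausible intended call (an assumption, not something this patch establishes):

    if uselogd "$cf_type" "$cf_file"; then
        cf_file="$cf_logd"
        cf_type="logd"
    fi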
 get_logfiles() {
     cf_type=$1
     cf_file="$2"
     cf_logd="$3"
     facility_var="logfacility"

     if [ -f "$cf_logd" ]; then
+        # TODO: this call is broken, it expects args
 	if uselogd; then
 	    cf_file="$cf_logd"
 	    cf_type="logd"
 	fi
     fi

-    debug "Reading $cf_type log settings"
     case $cf_type in
 	cman|openais|corosync)
-	    debug "Reading log settings from $cf_file"
-	    if iscfvartrue $cf_type to_syslog $cf_file; then
-		facility_var=syslog_facility
-	    fi
-	    if iscfvartrue $cf_type to_logfile $cf_file; then
-		logfile=`getcfvar $cf_type logfile $cf_file`
-	    fi
+            if [ -f "$cf_file" ]; then
+                debug "Reading $cf_type log settings from $cf_file"
+                if iscfvartrue $cf_type to_syslog "$cf_file"; then
+                    facility_var=syslog_facility
+                fi
+                if iscfvartrue $cf_type to_logfile "$cf_file"; then
+                    logfile=$(getcfvar $cf_type logfile "$cf_file")
+                fi
+            fi
 	    ;;
 	heartbeat|logd)
-	    debug "Reading log settings from $cf_file"
-	    if
-		iscfvartrue $cf_type debug $cf_file
-	    then
-		logfile=`getcfvar $cf_type debugfile $cf_file`
-	    else
-		logfile=`getcfvar $cf_type logfile $cf_file`
-	    fi
+            if [ -f "$cf_file" ]; then
+                debug "Reading $cf_type log settings from $cf_file"
+                if iscfvartrue $cf_type debug "$cf_file"; then
+                    logfile=$(getcfvar $cf_type debugfile "$cf_file")
+                else
+                    logfile=$(getcfvar $cf_type logfile "$cf_file")
+                fi
+            fi
 	    ;;
-	*) debug "Unknown cluster type: $cf_type"
-	    echo "/var/log/pacemaker.log"
-	    ;;
     esac

-    if [ "x$logfile" != "x" -a -f "$logfile" ]; then
+    if [ -z "$logfile" ]; then
+        logfile="/var/log/pacemaker.log"
+        debug "Log settings not found for cluster type $cf_type, assuming $logfile"
+    fi
+    if [ -f "$logfile" ]; then
 	echo $logfile
     fi
+
     if [ "x$facility" = x ]; then
 	facility=`getcfvar $cf_type $facility_var $cf_file`
 	[ "" = "$facility" ] && facility="daemon"
     fi
-    if [ "x$facility" = x ]; then
-	facility="daemon"
-    fi

     # Always include system logs (if we can find them)
     msg="Mark:pcmk:`perl -e 'print time()'`"
     logger -p $facility.info $msg >/dev/null 2>&1
     sleep 2 # Give syslog time to catch up in case it's busy
     findmsg 1 "$msg"

-    # Initial pacemakerd logs and tracing might also go to a file (other than the syslog log file)
-    findmsg 3 "Starting Pacemaker"
+    # Look for detail logs:

+    # - initial pacemakerd logs and tracing might go to a different file
+    pattern="Starting Pacemaker"

-    # Make sure we get something from the Policy Engine
-    findmsg 3 "Calculated Transition"
+    # - make sure we get something from the Policy Engine
+    pattern="$pattern\\|Calculated Transition"

-    # These patterns look for cib and lrmd updates
-    # Helpful on non-DC nodes or when the cluster has been up for a long time
-    findmsg 3 cib_perform_op
-    findmsg 3 process_lrm_event
+    # - cib and lrmd updates (helpful on non-DC nodes or when the cluster
+    #   has been up for a long time)
+    pattern="$pattern\\|cib_perform_op\\|process_lrm_event"

+    # - pacemaker_remote might use a different file
+    pattern="$pattern\\|pacemaker_remoted:"
+
+    findmsg 3 "$pattern"
 }

 essential_files() {
 	cat<<EOF
 > $SYSINFO_F
 essential_files $cluster | check_perms  > $PERMISSIONS_F 2>&1
 getconfig $cluster "$REPORT_HOME/$REPORT_TARGET" "$cluster_cf" "$logd_cf" "$CRM_CONFIG_DIR/$CIB_F" "$HA_STATE_DIR/hostcache" "/etc/drbd.conf" "/etc/drbd.d" "/etc/booth"

 getpeinputs    $LOG_START $LOG_END $REPORT_HOME/$REPORT_TARGET
 getbacktraces  $LOG_START $LOG_END > $REPORT_HOME/$REPORT_TARGET/$BT_F
 getblackboxes  $LOG_START $LOG_END $REPORT_HOME/$REPORT_TARGET

 case $cluster in
     cman|corosync)
-	if
-	    ps -ef | egrep -qs '[c]orosync'
-	then
-	    corosync-blackbox &> corosync-blackbox-live.txt
+        if is_running corosync; then
+            corosync-blackbox >corosync-blackbox-live.txt 2>&1
+#           corosync-fplay > corosync-blackbox.txt
+            tool=`pickfirst corosync-objctl corosync-cmapctl`
+            case $tool in
+                *objctl)  $tool -a > corosync.dump  2>/dev/null;;
+                *cmapctl) $tool    > corosync.dump  2>/dev/null;;
+            esac
+            corosync-quorumtool -s -i > corosync.quorum 2>&1
 	fi
-#	corosync-fplay > corosync-blackbox.txt
-
-	tool=`pickfirst corosync-objctl corosync-cmapctl`
-	case $tool in
-	    *objctl)  $tool -a > corosync.dump  2>/dev/null;;
-	    *cmapctl) $tool    > corosync.dump  2>/dev/null;;
-	esac
-	corosync-quorumtool -s -i > corosync.quorum 2>&1
 	;;
 esac

 dc=`crm_mon -1 2>/dev/null | awk '/Current DC/ {print $3}'`
 if [ "$REPORT_TARGET" = "$dc" ]; then
     echo "$REPORT_TARGET" > DC
 fi

 dlm_dump  > $DLM_DUMP_F 2>&1
 sys_stats > $SYSSTATS_F 2>&1

 debug "Sanitizing files: $SANITIZE"
 #
 # replace sensitive info with '****'
 #
 cf=""
 if [ ! -z "$cluster_cf" ]; then
    cf=`basename $cluster_cf`
 fi
 for f in $cf $CIB_F $CIB_TXT_F $CIB_F.live pengine/*; do
     if [ -f "$f" ]; then
 	sanitize $f
     fi
 done

 # Grab logs
 start=`date -d @${LOG_START} +"%F %T"`
 end=`date -d @${LOG_END} +"%F %T"`

 debug "Gathering logs from $start to $end: $logfiles $EXTRA_LOGS"
 trap '[ -z "$pattfile" ] || rm -f "$pattfile"' 0
 pattfile=`mktemp` || fatal "cannot create temporary files"
 for p in $LOG_PATTERNS; do
     echo "$p"
 done > $pattfile

 for l in $logfiles $EXTRA_LOGS; do
     b="$(basename $l).extract.txt"

     if [ ! -f "$l" ]; then
 	# Not a file
 	continue

     elif [ -f "$b" ]; then
 	# We already have it
 	continue
     fi

     dumplogset "$l" $LOG_START $LOG_END > "$b"
     sanitize "$b"

     echo "Log patterns $REPORT_TARGET:"  > $ANALYSIS_F
     grep -f "$pattfile" "$b" >> $ANALYSIS_F
 done
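For reference, $pattfile simply holds one pattern per line, so with the default LOG_PATTERNS of "CRIT: ERROR:" (set in crm_report), the analysis step above is equivalent to:

    grep -e "CRIT:" -e "ERROR:" "$b" >> $ANALYSIS_F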
 which journalctl > /dev/null 2>&1
 if [ $? = 0 ]; then
     log "Including segment [$LOG_START-$LOG_END] from journald"
     journalctl --since "$start" --until "$end" > journal.log
     cat journal.log | grep -f $pattfile >> $ANALYSIS_F
 fi

 rm -f $pattfile
 trap "" 0

 # Purge files containing no information
 for f in `ls -1`; do
     if [ -d "$f" ]; then
 	continue
     elif [ ! -s "$f" ]; then
 	case $f in
 	    *core*)
 		log "Detected empty core file: $f";;
 	    *)
 		debug "Removing empty file: `ls -al $f`"
 		rm -f $f
 		;;
 	esac
     fi
 done

 # Parse for events
 for l in $logfiles $EXTRA_LOGS; do
     b="$(basename $l).extract.txt"
     node_events "$b" > $EVENTS_F

     # Link the first logfile to a standard name if it doesn't yet exist
     if [ -e "$b" -a ! -e "$HALOG_F" ]; then
 	ln -s "$b" "$HALOG_F"
     fi
 done

 if [ -e $REPORT_HOME/.env ]; then
     debug "Localhost: $REPORT_MASTER $REPORT_TARGET"

 elif [ "$REPORT_MASTER" != "$REPORT_TARGET" ]; then
     debug "Streaming report back to $REPORT_MASTER"
     (cd $REPORT_HOME && tar cf - $REPORT_TARGET)
     if [ "$REMOVE" = "1" ]; then
 	cd
 	rm -rf $REPORT_HOME
     fi
 fi
+
+# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80:
diff --git a/tools/report.common.in b/tools/report.common.in
index 9dc57695d2..f9ed6f5355 100644
--- a/tools/report.common.in
+++ b/tools/report.common.in
@@ -1,805 +1,886 @@
 # Copyright (C) 2007 Dejan Muhamedagic
 #	Almost everything as part of hb_report
 # Copyright (C) 2010 Andrew Beekhof
 #	Cleanups, refactoring, extensions
 #
 #
 # This program is free software; you can redistribute it and/or
 # modify it under the terms of the GNU General Public
 # License as published by the Free Software Foundation; either
 # version 2.1 of the License, or (at your option) any later version.
 #
 # This software is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public
 # License along with this library; if not, write to the Free Software
 # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 #

 host=`uname -n`
 shorthost=`echo $host | sed s:\\\\..*::`
 if [ -z $verbose ]; then
     verbose=0
 fi

 # Target Files
 EVENTS_F=events.txt
 ANALYSIS_F=analysis.txt
 DESCRIPTION_F=description.txt
 HALOG_F=cluster-log.txt
 BT_F=backtraces.txt
 SYSINFO_F=sysinfo.txt
 SYSSTATS_F=sysstats.txt
 DLM_DUMP_F=dlm_dump.txt
 CRM_MON_F=crm_mon.txt
 MEMBERSHIP_F=members.txt
 HB_UUID_F=hb_uuid.txt
 HOSTCACHE=hostcache
 CRM_VERIFY_F=crm_verify.txt
 PERMISSIONS_F=permissions.txt
 CIB_F=cib.xml
 CIB_TXT_F=cib.txt

 EVENT_PATTERNS="
 state		do_state_transition
 membership	pcmk_peer_update.*(lost|memb):
 quorum		crmd.*crm_update_quorum|crmd.*ais.disp.*quorum.(lost|ac?quir)
 pause		Process.pause.detected
 resources	lrmd.*rsc:(start|stop)
 stonith		te_fence_node|stonith-ng.*log_oper.*report|stonithd.*(requests|(Succeeded|Failed).to.STONITH|result=)
 start_stop	sutdown.decision|Starting.heartbeat|Corosync.Cluster.Engine|corosync.*Initializing.transport|Executive.Service.RELEASE|crm_shutdown:.Requesting.shutdown|pcmk_shutdown:.Shutdown.complete
 "

-PACKAGES="pacemaker pacemaker-libs libpacemaker3
-pacemaker-pygui pacemaker-pymgmt pymgmt-client
+# superset of all packages of interest on all distros
+# (the package manager will be used to validate the installation
+# of any of these packages that are installed)
+PACKAGES="pacemaker pacemaker-libs pacemaker-cluster-libs libpacemaker3
+pacemaker-remote pacemaker-pygui pacemaker-pymgmt pymgmt-client
 openais libopenais2 libopenais3 corosync libcorosync4
 resource-agents cluster-glue-libs cluster-glue libglue2 ldirectord
 heartbeat heartbeat-common heartbeat-resources libheartbeat2
 ocfs2-tools ocfs2-tools-o2cb ocfs2console
 ocfs2-kmp-default ocfs2-kmp-pae ocfs2-kmp-xen ocfs2-kmp-debug ocfs2-kmp-trace
 drbd drbd-kmp-xen drbd-kmp-pae drbd-kmp-default drbd-kmp-debug drbd-kmp-trace
 drbd-heartbeat drbd-pacemaker drbd-utils drbd-bash-completion drbd-xen
 lvm2 lvm2-clvm cmirrord
 libdlm libdlm2 libdlm3
 hawk ruby lighttpd
 kernel-default kernel-pae kernel-xen
 glibc
 "

+# Potential locations of system log files
+SYSLOGS="
+    /var/log/*
+    /var/logs/*
+    /var/syslog/*
+    /var/adm/*
+    /var/log/ha/*
+    /var/log/cluster/*
+"
+
 #
 # keep the user posted
 #
 record() {
     if [ x != x"$REPORT_HOME" -a -d "${REPORT_HOME}/$shorthost" ]; then
 	rec="${REPORT_HOME}/$shorthost/report.out"

     elif [ x != x"${l_base}" -a -d "${l_base}" ]; then
 	rec="${l_base}/report.summary"

     else
 	rec="/dev/null"
     fi
     printf "%-10s  $*\n" "$shorthost:" 2>&1 >> "${rec}"
 }

 log() {
     printf "%-10s  $*\n" "$shorthost:" 1>&2
     record "$*"
 }

 debug() {
     if [ $verbose -gt 0 ]; then
 	log "Debug: $*"
+    else
+        record "Debug: $*"
     fi
-    record "Debug: $*"
 }

 info() {
     log "$*"
 }

 warning() {
     log "WARN: $*"
 }

 fatal() {
     log "ERROR: $*"
     exit 1
 }

-detect_host() {
-    if [ -z "$maxdepth" ]; then
-	depth="-maxdepth 5"
-    else
-	depth="-maxdepth $maxdepth"
+is_running() {
+    ps -ef | egrep -qs $(echo "$1" | sed -e 's/^\(.\)/[\1]/')
+}
+
+has_remoted() {
+    # TODO: the binary might be elsewhere
+    if which pacemaker_remoted >/dev/null 2>&1; then
+        return 0
+    elif [ -x "@sbindir@/pacemaker_remoted" ]; then
+        return 0
     fi
+    return 1
+}

-    local_state_dir=@localstatedir@
+# found_dir <description> <dirname>
+found_dir() {
+    echo "$2"
+    info "Pacemaker $1 found in: $2"
+}
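The sed in is_running() wraps the first character of the daemon name in brackets, the usual trick that keeps the egrep from matching its own entry in the ps output:

    $ echo "crmd" | sed -e 's/^\(.\)/[\1]/'
    [c]rmd
    # 'ps -ef | egrep -qs [c]rmd' matches a running crmd,
    # but never the egrep process itself.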
this may take a while" + + for d in \ + {/usr,/usr/local,/opt/local,@exec_prefix@}/{libexec,lib64,lib}/{pacemaker,heartbeat} + do + # pacemaker and pacemaker-cts packages can install to daemon directory, + # so check for a file from each + if [ -e $d/pengine ] || [ -e $d/lrmd_test ]; then + found_dir "daemons" "$d" + return + fi + done + + for f in $(find / -maxdepth $maxdepth -type f -name pengine -o -name lrmd_test); do + d=$(dirname "$f") + found_dir "daemons" "$d" + return + done + + # Pacemaker Remote nodes don't need to install daemons + if has_remoted; then + info "Not found (this appears to be a Pacemaker Remote node)" else - info "Searching for where Pacemaker keeps runtime data... this may take a while" - for d in `find / $depth -type d -name run`; do - local_state_dir=`dirname $d` - CRM_STATE_DIR=$d/crm - break - done - info "Found: $CRM_STATE_DIR" + fatal "Pacemaker daemons not found (nonstandard installation?)" fi - debug "Machine runtime directory: $local_state_dir" - debug "Pacemaker runtime data located in: $CRM_STATE_DIR" +} - CRM_DAEMON_DIR= - for p in /usr /usr/local /opt/local @exec_prefix@; do - for d in libexec lib64 lib; do - if [ -e $p/$d/pacemaker/pengine ]; then - CRM_DAEMON_DIR=$p/$d/pacemaker - break - elif [ -e $p/$d/heartbeat/pengine ]; then - CRM_DAEMON_DIR=$p/$d/heartbeat - break - fi - done +detect_cib_dir() { + for d in $local_state_dir/lib/{pacemaker/cib,heartbeat/crm}; do + if [ "-f $d/cib.xml" ]; then + found_dir "config files" "$d" + return + fi done - if [ ! -d $CRM_DAEMON_DIR ]; then - info "Searching for where Pacemaker daemons live... this may take a while" - for f in `find / $depth -type f -name pengine`; do - CRM_DAEMON_DIR=`dirname $f` - break - done - info "Found: $CRM_DAEMON_DIR" - fi + info "Searching for where Pacemaker keeps config information... this may take a while" + # TODO: What about false positives where someone copied the CIB? + for f in $(find / -maxdepth $maxdepth -type f -name cib.xml); do + d=$(dirname $f) + found_dir "config files" "$d" + return + done - if [ -z $CRM_DAEMON_DIR ]; then - fatal "Non-standard Pacemaker installation: daemons not found" + # Pacemaker Remote nodes don't need a CIB + if has_remoted; then + info "Not found (this appears to be a Pacemaker Remote node)" else - debug "Pacemaker daemons located under: $CRM_DAEMON_DIR" + warning "Pacemaker config not found (nonstandard installation?)" + fi +} + +detect_state_dir() { + if [ -n "$CRM_CONFIG_DIR" ]; then + # Assume new layout + # $local_state_dir/lib/pacemaker/(cib,pengine,blackbox,cores) + dirname "$CRM_CONFIG_DIR" + + # Pacemaker Remote nodes might not have a CRM_CONFIG_DIR + elif [ -d "$local_state_dir/lib/pacemaker" ]; then + echo $local_state_dir/lib/pacemaker fi +} - CRM_CONFIG_DIR= - for d in pacemaker/cib heartbeat/crm; do - if [ -f $local_state_dir/lib/$d/cib.xml ]; then - CRM_CONFIG_DIR=$local_state_dir/lib/$d - break - fi +detect_pe_dir() { + config_root="$1" + + d="$config_root/pengine" + if [ -d "$d" ]; then + found_dir "policy engine inputs" "$d" + return + fi + + info "Searching for where Pacemaker keeps Policy Engine inputs... this may take a while" + for d in $(find / -maxdepth $maxdepth -type d -name pengine); do + found_dir "policy engine inputs" "$d" + return done - if [ ! -d $CRM_CONFIG_DIR ]; then - info "Detecting where Pacemaker keeps config information... 
this may take a while" - for f in `find / $depth -type f -name cib.xml`; do - CRM_CONFIG_DIR=`dirname $f` - break - done - info "Found: $CRM_CONFIG_DIR" + if has_remoted; then + info "Not found (this appears to be a Pacemaker Remote node)" + else + fatal "Pacemaker policy engine inputs not found (nonstandard installation?)" fi - if [ -z $CRM_CONFIG_DIR ]; then - warning "Non-standard Pacemaker installation: config not found" +} + +detect_host() { + local_state_dir=@localstatedir@ + + if [ -d $local_state_dir/run ]; then + CRM_STATE_DIR=$local_state_dir/run/crm else - debug "Pacemaker config files located in: $CRM_CONFIG_DIR" + info "Searching for where Pacemaker keeps runtime data... this may take a while" + for d in `find / -maxdepth $maxdepth -type d -name run`; do + local_state_dir=`dirname $d` + CRM_STATE_DIR=$d/crm + break + done + info "Found: $CRM_STATE_DIR" fi + debug "Machine runtime directory: $local_state_dir" + debug "Pacemaker runtime data located in: $CRM_STATE_DIR" - # Assume new layout - # $local_state_dir/lib/pacemaker/(cib,pengine,blackbox,cores) - config_root=`dirname $CRM_CONFIG_DIR` + CRM_DAEMON_DIR=$(detect_daemon_dir) + CRM_CONFIG_DIR=$(detect_cib_dir) + config_root=$(detect_state_dir) # Older versions had none BLACKBOX_DIR=$config_root/blackbox debug "Pacemaker blackboxes (if any) located in: $BLACKBOX_DIR" - PE_STATE_DIR=$config_root/pengine - if [ ! -d $PE_STATE_DIR ]; then - info "Detecting where Pacemaker keeps Policy Engine inputs... this may take a while" - for d in `find / $depth -type d -name pengine`; do - PE_STATE_DIR=$d - break - done - info "Found: $PE_STATE_DIR" - fi - if [ -z $PE_STATE_DIR ]; then - fatal "Non-standard Pacemaker installation: Policy Engine directory not found" - else - debug "PE files located in: $PE_STATE_DIR" - fi + PE_STATE_DIR=$(detect_pe_dir "$config_root") HA_STATE_DIR=$local_state_dir/lib/heartbeat debug "Assuming Heartbeat state files, if any, are located in: $HA_STATE_DIR" CRM_CORE_DIRS="" for d in $config_root/cores $HA_STATE_DIR/cores $local_state_dir/lib/corosync $local_state_dir/lib/openais; do if [ -d $d ]; then CRM_CORE_DIRS="$CRM_CORE_DIRS $d" fi done debug "Core files located under: $CRM_CORE_DIRS" } time2str() { perl -e "use POSIX; print strftime('%x %X',localtime($1));" } get_time() { perl -e "\$time=\"$*\";" -e ' $unix_tm = 0; eval "use Date::Parse"; if (index($time, ":") < 0) { } elsif (!$@) { $unix_tm = str2time($time); } else { eval "use Date::Manip"; if (!$@) { $unix_tm = UnixDate(ParseDateString($time), "%s"); } } if ($unix_tm != "") { print int($unix_tm); } else { print ""; } ' } get_time_() { warning "Unknown time format used by: $*" } get_time_syslog() { awk '{print $1,$2,$3}' } get_time_legacy() { awk '{print $2}' | sed 's/_/ /' } get_time_iso8601() { awk '{print $1}' } get_time_format_for_string() { l="$*" t=$(get_time `echo $l | get_time_syslog`) if [ "x$t" != x ]; then echo syslog return fi t=$(get_time `echo $l | get_time_iso8601`) if [ "x$t" != x ]; then echo iso8601 return fi t=$(get_time `echo $l | get_time_legacy`) if [ "x$t" != x ]; then echo legacy return fi } get_time_format() { t=0 l="" func="" trycnt=10 while [ $trycnt -gt 0 ] && read l; do func=$(get_time_format_for_string $l) if [ "x$func" != x ]; then break fi trycnt=$(($trycnt-1)) done #debug "Logfile uses the $func time format" echo $func } get_first_time() { l="" format=$1 while read l; do t=$(echo $l | get_time_$format) ts=$(get_time $t) if [ "x$ts" != x ]; then echo "$ts" return fi done } get_last_time() { l="" best=`date +%s` # Now 
 get_last_time() {
 	l=""
 	best=`date +%s` # Now
 	format=$1
 	while read l; do
 		t=$(echo $l | get_time_$format)
 		ts=$(get_time $t)
 		if [ "x$ts" != x ]; then
 			best=$ts
 		fi
 	done
 	echo $best
 }

 linetime() {
 	l=`tail -n +$2 $1 | grep -a ":[0-5][0-9]:" | head -n 1`
 	format=`get_time_format_for_string $l`
 	t=`echo $l | get_time_$format`
 	get_time "$t"
 }

-# Find pattern in a logfile somewhere
-# Return $max ordered results by age (newest first)
+#
+# findmsg <max> <pattern>
+#
+# Print the names of up to <max> system logs that contain <pattern>,
+# ordered by most recently modified.
+#
 findmsg() {
-    max=$1
-    pattern=$2
-    logfiles=""
-    syslogdirs="/var/log /var/logs /var/syslog /var/adm /var/log/ha /var/log/cluster"
-
-    for d in $syslogdirs; do
-	if [ -d $d ]; then
-	    files=`find $d -type f -maxdepth 1`
-	    for f in $files; do
-		local cat=`find_decompressor $f`
-		$cat $f | grep -l -e "$pattern"
-		if [ $? = 0 ]; then
-		    logfiles="$logfiles $f"
-		fi
-	    done
-	fi
-    done 2>/dev/null
+    max=$1
+    pattern="$2"
+    found=0
+
+    # List all potential system logs ordered by most recently modified.
+    candidates=$(ls -1td $SYSLOGS 2>/dev/null)
+    if [ -z "$candidates" ]; then
+        debug "No system logs found to search for pattern \'$pattern\'"
+        return
+    fi

-    if [ "x$logfiles" != "x" ]; then
-	list=`ls -t $logfiles | head -n $max | tr '\n' ' '`
-	echo $list
-	debug "Pattern \'$pattern\' found in: [ $list ]"
-    else
-	debug "Pattern \'$pattern\' not found anywhere"
-    fi
+    # Portable way to handle files with spaces in their names.
+    SAVE_IFS=$IFS
+    IFS="
+"
+
+    # Check each log file for matches.
+    logfiles=""
+    for f in $candidates; do
+        local cat=$(find_decompressor "$f")
+        $cat "$f" 2>/dev/null | grep -q -e "$pattern"
+        if [ $? -eq 0 ]; then
+
+            # Add this file to the list of hits
+            # (using newline as separator to handle spaces in names).
+            if [ -z "$logfiles" ]; then
+                logfiles="$f"
+            else
+                logfiles="$logfiles
+$f"
+            fi
+
+            # If we have enough hits, print them and return.
+            found=$(($found+1))
+            if [ $found -ge $max ]; then
+                debug "Pattern \'$pattern\' found in: [ $logfiles ]"
+                IFS=$SAVE_IFS
+                echo "$logfiles"
+                return
+            fi
+        fi
+    done 2>/dev/null
+    IFS=$SAVE_IFS
+
+    debug "Pattern \'$pattern\' not found in any system logs"
 }

 node_events() {
   if [ -e $1 ]; then
     Epatt=`echo "$EVENT_PATTERNS" |
       while read title p; do [ -n "$p" ] && echo -n "|$p"; done |
       sed 's/.//'
       `
     grep -E "$Epatt" $1
   fi
 }

 pickfirst() {
     for x; do
 	which $x >/dev/null 2>&1 && {
 	    echo $x
 	    return 0
 	}
     done
     return 1
 }

 shrink() {
     olddir=$PWD
     dir=`dirname $1`
     base=`basename $1`

     target=$1.tar
     tar_options="cf"

     variant=`pickfirst bzip2 gzip xz false`
     case $variant in
 	bz*)
 	    tar_options="jcf"
 	    target="$target.bz2"
 	    ;;
 	gz*)
 	    tar_options="zcf"
 	    target="$target.gz"
 	    ;;
 	xz*)
 	    tar_options="Jcf"
 	    target="$target.xz"
 	    ;;
 	*)
 	    warning "Could not find a compression program, the resulting tarball may be huge"
 	    ;;
     esac

     if [ -e $target ]; then
 	fatal "Destination $target already exists, specify an alternate name with --dest"
     fi

     cd $dir  >/dev/null 2>&1
     tar $tar_options $target $base >/dev/null 2>&1
     cd $olddir  >/dev/null 2>&1

     echo $target
 }

 findln_by_time() {
     local logf=$1
     local tm=$2
     local first=1

     # Some logs can be massive (over 1,500,000,000 lines have been seen in the wild)
     # Even just 'wc -l' on these files can take 10+ minutes
     local fileSize=`ls -lh "$logf" | awk '{ print $5 }' | grep -ie G`
     if [ x$fileSize != x ]; then
 	warning "$logf is ${fileSize} in size and could take many hours to process. Skipping."
 	return
     fi

     local last=`wc -l < $logf`
     while [ $first -le $last ]; do
 	mid=$((($last+$first)/2))
 	trycnt=10
 	while [ $trycnt -gt 0 ]; do
 	    tmid=`linetime $logf $mid`
 	    [ "$tmid" ] && break
 	    warning "cannot extract time: $logf:$mid; will try the next one"
 	    trycnt=$(($trycnt-1))
 	    # shift the whole first-last segment
 	    first=$(($first-1))
 	    last=$(($last-1))
 	    mid=$((($last+$first)/2))
 	done
 	if [ -z "$tmid" ]; then
 	    warning "giving up on log..."
 	    return
 	fi
 	if [ $tmid -gt $tm ]; then
 	    last=$(($mid-1))
 	elif [ $tmid -lt $tm ]; then
 	    first=$(($mid+1))
 	else
 	    break
 	fi
     done
     echo $mid
 }

 dumplog() {
     local logf=$1
     local from_line=$2
     local to_line=$3
     [ "$from_line" ] || return
     tail -n +$from_line $logf |
 	if [ "$to_line" ]; then
 	    head -$(($to_line-$from_line+1))
 	else
 	    cat
 	fi
 }

 #
 # find log/set of logs which are interesting for us
 #
 #
 # find log slices
 #
 find_decompressor() {
-    if echo $1 | grep -qs 'bz2$'; then
-	echo "bzip2 -dc"
-    elif echo $1 | grep -qs 'gz$'; then
-	echo "gzip -dc"
-    elif echo $1 | grep -qs 'xz$'; then
-	echo "xz -dc"
-    else
-	echo "cat"
-    fi
+    case $1 in
+        *bz2) echo "bzip2 -dc" ;;
+        *gz)  echo "gzip -dc" ;;
+        *xz)  echo "xz -dc" ;;
+        *)    echo "cat" ;;
+    esac
 }
+
 #
 # check if the log contains a piece of our segment
 #
 is_our_log() {
 	local logf=$1
 	local from_time=$2
 	local to_time=$3

 	local cat=`find_decompressor $logf`
 	local format=`$cat $logf | get_time_format`
 	local first_time=`$cat $logf | head -10 | get_first_time $format`
 	local last_time=`$cat $logf | tail -10 | get_last_time $format`

 	if [ x = "x$first_time" -o x = "x$last_time" ]; then
 	    warning "Skipping bad logfile '$1': Could not determine log dates"
 	    return 0 # skip (empty log?)
 	fi
 	if [ $from_time -gt $last_time ]; then
 		# we shouldn't get here anyway if the logs are in order
 		return 2 # we're past good logs; exit
 	fi
 	if [ $from_time -ge $first_time ]; then
 		return 3 # this is the last good log
 	fi
 	# have to go further back
 	if [ x = "x$to_time" -o $to_time -ge $first_time ]; then
 		return 1 # include this log
 	else
 		return 0 # don't include this log
 	fi
 }

 #
 # go through archived logs (timewise backwards) and see if there
 # are lines belonging to us
 # (we rely on untouched log files, i.e. that modify time
 # hasn't been changed)
 #
 arch_logs() {
 	local logf=$1
 	local from_time=$2
 	local to_time=$3

 	# look for files such as: ha-log-20090308 or
 	# ha-log-20090308.gz (.bz2) or ha-log.0, etc
 	ls -t $logf $logf*[0-9z] 2>/dev/null |
 	while read next_log; do
 		is_our_log $next_log $from_time $to_time
 		case $? in
 		0) ;;  # noop, continue
 		1) echo $next_log  # include log and continue
 			debug "Found log $next_log"
 			;;
 		2) break;; # don't go through older logs!
 		3) echo $next_log  # include log and continue
 			debug "Found log $next_log"
 			break
 			;; # don't go through older logs!
 		esac
 	done
 }
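arch_logs() walks the newest log first (ls -t) and steers by the return-code protocol of is_our_log() above; in summary:

    # 0 - log is outside the range (or undated): skip it, keep going
    # 1 - log lies entirely inside the range: include it, keep going back
    # 2 - log ends before the range starts: stop, older logs cannot match
    # 3 - log contains the start of the range: include it and stop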
 #
 # print part of the log
 #
 drop_tmp_file() {
 	[ -z "$tmp" ] || rm -f "$tmp"
 }

 print_logseg() {
 	local logf=$1
 	local from_time=$2
 	local to_time=$3

 	# uncompress to a temp file (if necessary)
 	local cat=`find_decompressor $logf`
 	if [ "$cat" != "cat" ]; then
 		tmp=`mktemp`
 		$cat $logf > $tmp
 		trap drop_tmp_file 0
 		sourcef=$tmp
 	else
 		sourcef=$logf
 		tmp=""
 	fi

 	if [ "$from_time" = 0 ]; then
 		FROM_LINE=1
 	else
 		FROM_LINE=`findln_by_time $sourcef $from_time`
 	fi
 	if [ -z "$FROM_LINE" ]; then
 		warning "couldn't find line for time $from_time; corrupt log file?"
 		return
 	fi

 	TO_LINE=""
 	if [ "$to_time" != 0 ]; then
 		TO_LINE=`findln_by_time $sourcef $to_time`
 		if [ -z "$TO_LINE" ]; then
 			warning "couldn't find line for time $to_time; corrupt log file?"
 			return
 		fi
 		if [ $FROM_LINE -lt $TO_LINE ]; then
 			dumplog $sourcef $FROM_LINE $TO_LINE
 			log "Including segment [$FROM_LINE-$TO_LINE] from $logf"
 		else
 			debug "Empty segment [$FROM_LINE-$TO_LINE] from $logf"
 		fi
 	else
 		dumplog $sourcef $FROM_LINE $TO_LINE
 		log "Including all logs after line $FROM_LINE from $logf"
 	fi
 	drop_tmp_file
 	trap "" 0
 }

 #
 # find log/set of logs which are interesting for us
 #
 dumplogset() {
 	local logf=$1
 	local from_time=$2
 	local to_time=$3

 	local logf_set=`arch_logs $logf $from_time $to_time`
 	if [ x = "x$logf_set" ]; then
 		return
 	fi

 	local num_logs=`echo "$logf_set" | wc -l`
 	local oldest=`echo $logf_set | awk '{print $NF}'`
 	local newest=`echo $logf_set | awk '{print $1}'`
 	local mid_logfiles=`echo $logf_set | awk '{for(i=NF-1; i>1; i--) print $i}'`

 	# the first logfile: from $from_time to $to_time (or end)
 	# logfiles in the middle: all
 	# the last logfile: from beginning to $to_time (or end)
 	case $num_logs in
 	1) print_logseg $newest $from_time $to_time;;
 	*)
 		print_logseg $oldest $from_time 0
 		for f in $mid_logfiles; do
 			`find_decompressor $f` $f
 			debug "including complete $f logfile"
 		done
 		print_logseg $newest 0 $to_time
 	;;
 	esac
 }

 # cut out a stanza
 getstanza() {
 	awk -v name="$1" '
 	!in_stanza && NF==2 && /^[a-z][a-z]*[[:space:]]*{/ { # stanza start
 		if ($1 == name)
 			in_stanza = 1
 	}
 	in_stanza { print }
 	in_stanza && NF==1 && $1 == "}" { exit }
 	'
 }

 # supply stanza in $1 and variable name in $2
 # (stanza is optional)
 getcfvar() {
     cf_type=$1; shift;
     cf_var=$1; shift;
     cf_file=$*

     [ -f "$cf_file" ] || return
     case $cf_type in
 	cman)
 	    grep $cf_var $cf_file | sed s/.*$cf_var=\"// | sed s/\".*//
 	    ;;
 	corosync|openais)
 	    sed 's/#.*//' < $cf_file |
 		if [ $# -eq 2 ]; then
 			getstanza "$cf_var"
 			shift 1
 		else
 			cat
 		fi |
 		awk -v varname="$cf_var" '
 		NF==2 && match($1,varname":$")==1 { print $2; exit; }
 		'
 	    ;;
 	heartbeat)
 	    sed 's/#.*//' < $cf_file |
 		grep -w "^$cf_var" |
 		sed 's/^[^[:space:]]*[[:space:]]*//'
 	    ;;
 	logd)
 	    sed 's/#.*//' < $cf_file |
 		grep -w "^$cf_var" |
 		sed 's/^[^[:space:]]*[[:space:]]*//'
 	    ;;
     esac
 }

 pickfirst() {
     for x; do
 	which $x >/dev/null 2>&1 && {
 	    echo $x
 	    return 0
 	}
     done
     return 1
 }
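getstanza() cuts a single named top-level stanza out of a corosync/openais-style file, which is what getcfvar() relies on when a stanza name is supplied. A sketch against a hypothetical corosync.conf fragment:

    $ getstanza "totem" <<EOF
    totem {
        version: 2
    }
    logging {
        to_syslog: yes
    }
    EOF
    totem {
        version: 2
    }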
+ stack="any" fi debug "Detected the '$stack' cluster stack" echo $stack } find_cluster_cf() { case $1 in cman) echo "/etc/cluster/cluster.conf";; corosync) best_size=0 best_file="" # TODO: Technically these could be anywhere :-/ for cf in /etc/ais/openais.conf /etc/corosync/corosync.conf; do if [ -f $cf ]; then size=`wc -l $cf | awk '{print $1}'` if [ $size -gt $best_size ]; then best_size=$size best_file=$cf fi fi done if [ -z "$best_file" ]; then debug "Looking for corosync configuration file. This may take a while..." - for f in `find / $depth -type f -name corosync.conf`; do + for f in `find / -maxdepth $maxdepth -type f -name corosync.conf`; do best_file=$f break done fi debug "Located corosync config file: $best_file" echo "$best_file" ;; openais) # TODO: Technically it could be anywhere :-/ cf="/etc/ais/openais.conf" if [ -f $cf ]; then echo "$cf" fi ;; heartbeat) cf="/etc/ha.d/ha.cf" if [ -f $cf ]; then echo "$cf" fi ;; + any) + # Cluster type is undetermined. Don't complain, because this + # might be a Pacemaker Remote node. + ;; *) warning "Unknown cluster type: $1" ;; esac } # # check for the major prereq for a) parameter parsing and b) # parsing logs # t=`get_time "12:00"` if [ "$t" = "" ]; then fatal "please install the perl Date::Parse module (perl-DateTime-Format-DateParse on Fedora/Red Hat)" fi + +# vim: set expandtab tabstop=8 softtabstop=4 shiftwidth=4 textwidth=80: