#! /usr/dt/bin/dtksh
# $Copyright: Copyright 2011 Symantec Corporation, All Rights Reserved $
VERSION="3.1 "
#
#Script to run basic stats on drive/ media errors.
#Written by Martin Holt / Copyright Martin Holt 2012
#Written /tested on Solaris 10 and NetBackup 6.5.4 - 7.1
#Version 1.0 - Initial concept and development
#Version 1.1 - This one almost works ...
#Version 1.2 - Changed output of results, added 'getopts' menu
#Version 1.3 - Added drive_specific () and media_specific ()
#Version 1.4 - Changed options
#Version 1.5 - Fixed bug that gave incorrect total if tape name equals/ similar to drivename
#Version 1.6 - Added errorsfile () and alt_file ()
#Version 1.7 - Added check for valid drive for -d option
#Version 1.8 - Changed case statement to set variables to improve handling of multiple options
#Version 1.9 - Added unique_file function (-u option)
#Version 2.0 - Rewrote errorsfile() for improved filtering of non-valid lines. Changed shell to /usr/dt/bin/dtksh to cope with associative arrays
#Version 2.1 - Added -n option (bypasses drive/ media check in errors file)
#Version 2.2 - Added -m option (sendmail)
#Version 2.3 - Added check_errors_file {} to cope with possibility of an empty errors file
#Version 2.4 - Added function for help page (-h)
#Version 2.5 - Fixed issue with the 'tee' command to allow it to use a file descriptor as opposed to filename
#Version 2.6 - Tidied up formatting
#Version 2.7 - Added disclaimer
#Version 2.8 - Added random number in for creation of /tmp/tperr. file
#Version 2.9 - Fixed issue when running with no options, which is 'invalid'
#Version 3.0 - Added log creation/ collection functionality
#Version 3.1 - -n option now skips creation of associative arrays, updated -h (help output)
#            Shortened /tmp dir name, and put in check if this exists (it shouldn't .... but just in case)
#            Important, script now deletes /tmp/tperr_symantec_ directory on exit ...
#Notes
#  Shell HAS to be /usr/dt/bin/dtksh to support associative arrays
#  Script creates a tmp directory in /tmp, based on the script name (tperr.sh);
#  this is deleted via the rm line in the cleanup() function
#  -l option 'edits' (via copy) bp.conf, recommended to run this when no jobs are running
#  No option to 'remove' logs / VERBOSE settings
#  -l option DOES NOT restart ltid, this must be done manually to pick up log changes
#############################################################################################
#############################################################################################

# Trap CTRL-C (plus HUP/QUIT/TERM) and execute the trap_exit() function.
# Only the name is registered here; it is looked up when a signal arrives.
trap trap_exit INT HUP QUIT TERM

#Define variables
DIR=/tmp/tperr_symantec_$$                            # Per-run scratch directory
ORIG_ERRORS_FILE=/usr/openv/netbackup/db/media/errors # Default input, replaced by -f
ERRORS_FILE=$DIR/merrors                              # Working copy that gets analysed
UNIQUE_ERRORS_FILE=$DIR/unique_errors                 # Input narrowed down by -u
MEDIALIST=$DIR/medialist
DRIVELIST=$DIR/drivelist
MAILFILE=$DIR/mailfile.txt                            # Report body, written through fd 3
LOGDIR=/usr/openv/netbackup/logs/tperr_logs

# Option flags; each one is switched to 1 by the getopts case statement.
OPT_A=0
OPT_T=0
OPT_D=0
OPT_F=0
OPT_U=0
OPT_N=0
OPT_M=0
OPT_l=0
OPT_L=0
OPT_H=0

#Set up tmp directory and output file for mail ...
if [[ -d $DIR ]]; then
    echo "Directory $DIR already exists - exiting ... "
    exit 1
fi
# Mode 700: the directory holds the report that may later be mailed out.
if ! mkdir -m 700 "$DIR"; then
    echo "Unable to create directory $DIR - exiting ... " >&2
    exit 1
fi

# File descriptor 3 stays open for the whole run to prevent multiple
# opening/ closing of "$MAILFILE".
exec 3>"$MAILFILE" || { rm -rf "$DIR"; exit 1; }

# printf rather than echo: the blank line after the banner must not depend on
# whether the running shell's echo interprets "\n".
printf '%s\n' "tperr.sh - Tape/ Drive Analysis Version $VERSION" >&3
printf '%s\n\n' "******************************************* " >&3

#Define functions
#Ensure cleanup if ctrl-c etc ...
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# remove_tmp_dir: delete the per-run scratch directory. Unlike cleanup() this
# never sends mail, so it is safe to use on error and usage paths.
remove_tmp_dir () {
    rm -rf "${DIR:-/tmp/tperr_symantec_$$}"
}

# usage_error MESSAGE: print MESSAGE, remove the scratch directory and stop
# with a non-zero status. No mail is sent for a usage error.
usage_error () {
    echo "$1"
    remove_tmp_dir
    exit 1
}

# report FORMAT [ARG...]: printf the text to stdout and to the mail file
# (file descriptor 3). Both writes are synchronous, so the mail file is
# complete and in order by the time cleanup() mails it.
report () {
    printf "$@"
    printf "$@" >&3
}

# lookup_total NAME LIST: LIST holds "count name!" records as produced by
# calcstat(). Prints the count whose name is exactly NAME (no substring hits).
lookup_total () {
    printf '%s\n' "$2" | tr '!' '\012' |
    while read -r lt_count lt_name; do
        if [[ $lt_name == "$1" ]]; then
            printf '%s\n' "$lt_count"
        fi
    done
}

# ---------------------------------------------------------------------------
# Functions
# ---------------------------------------------------------------------------

# trap_exit: signal handler (INT/HUP/QUIT/TERM). Tidy up, then stop with a
# failure status so callers can tell the run was interrupted.
trap_exit () {
    cleanup
    exit 1
}

# unique_file (-u): reduce the errors file to the lines for the tape given
# with -t (field 3) and/or the drive given with -d (field 6).
unique_file () {
    echo "Unique file option"
    if [[ $OPT_T = 1 ]]; then
        # The "" concatenation forces a string comparison, so an all-digit
        # media ID is never compared numerically.
        awk '($3 "") == (want "")' want="$VOLUMENAME" "$ORIG_ERRORS_FILE" >"$UNIQUE_ERRORS_FILE"
    fi
    if [[ $OPT_D = 1 ]]; then
        awk '($6 "") == (want "")' want="$DRIVENAME" "$ORIG_ERRORS_FILE" >"$UNIQUE_ERRORS_FILE"
    fi
}

# errorsfile: build $ERRORS_FILE from the original (or -u filtered) errors
# file. Unless -n was given, only lines whose media (field 3) is reported by
# vmquery and whose drive (field 6) is reported by tpconfig are kept.
errorsfile () {
    if [[ $OPT_U = 1 ]]; then
        ef_source=$UNIQUE_ERRORS_FILE
    else
        ef_source=$ORIG_ERRORS_FILE
    fi

    # -n: no validation, take the input as it is.
    if [[ $OPT_N != 0 ]]; then
        cp "$ef_source" "$ERRORS_FILE"
        return
    fi

    typeset -A media_array
    typeset -A drive_array

    # The loops read from files rather than from a pipe, so the arrays are
    # filled in the current shell whichever shell runs the script.
    vmquery -a | awk '/media ID/ { print $3 }' >"$MEDIALIST"
    while read -r ef_name; do
        [[ -n $ef_name ]] && media_array[$ef_name]=1
    done <"$MEDIALIST"

    tpconfig -emm_dev_list | awk '/Drive:/ { print $2 }' >"$DRIVELIST"
    while read -r ef_name; do
        [[ -n $ef_name ]] && drive_array[$ef_name]=1
    done <"$DRIVELIST"

    while read -r ELINE; do
        [[ -n $ELINE ]] || continue
        # Split into fields without filename expansion; "--" keeps a line
        # that starts with "-" from being read as shell options.
        set -f
        set -- $ELINE
        set +f
        # Short lines cannot be valid and would give an empty array subscript.
        [[ -n $3 && -n $6 ]] || continue
        if [[ ${media_array[$3]} == 1 && ${drive_array[$6]} == 1 ]]; then
            printf '%s\n' "$ELINE"
        fi
    done <"$ef_source" >"$ERRORS_FILE"
}

# errors_file_check: stop the run when there is nothing to analyse.
errors_file_check () {
    if [[ -s $ERRORS_FILE ]]; then
        echo "Errors File exists ...."
    else
        echo "No entries in Errors file, exiting ... (if not using -n option, try re-running with this option)"
        remove_tmp_dir
        exit 1
    fi
}

# optcheck: tell the user when no arguments were passed.
# NOTE(review): nothing in the visible script calls this function.
optcheck () {
    if (( $# == 0 )); then
        echo "No options specified, please see tperr.sh -h"
    fi
}

# calcstat: work out the numbers of errors for each drive/ tape.
# This is the main 'function' in the script, the rest is more or less just
# formatting. Every record ends in "!" which the format functions split on.
#   TAPESTAT   - per tape:  number of different drives it had errors in
#   DRIVESTAT  - per drive: number of different tapes it had errors with
#   TAPECOUNT  - per tape:  number of lines in the errors file
#   DRIVECOUNT - per drive: number of lines in the errors file
calcstat () {
    TAPESTAT=$(sort -u -k3,3 -k6,6 "$ERRORS_FILE" |
        awk '{++t[$3]}END{for (i in t)print i" has had errors in "t[i]" different drives!"}')
    DRIVESTAT=$(sort -u -k3,3 -k6,6 "$ERRORS_FILE" |
        awk '{++t[$6]}END{for (i in t)print i" has had errors with "t[i]" different tapes!"}')
    TAPECOUNT=$(awk '{a[$3]++} END {for(x in a) print a[x],x"!"}' "$ERRORS_FILE")
    DRIVECOUNT=$(awk '{a[$6]++} END {for(x in a) print a[x],x"!"}' "$ERRORS_FILE")
}

# format_output_tape: print one line per tape from TAPESTAT, followed by the
# tape's total number of errors taken from TAPECOUNT.
format_output_tape () {
    printf '%s\n' "$TAPESTAT" | tr '!' '\012' | sed -e '/^$/d' -e 's/^[ ][ ]*//' |
    while read -r LINE; do
        fo_total=$(lookup_total "${LINE%% *}" "$TAPECOUNT")
        report '%s  (Total occurrences (errors) of this volume is %s)\n' "$LINE" "$fo_total"
    done
    # Spacer between the tape and drive sections, on the terminal only.
    printf '\n\n'
}

# format_output_drive: print one line per drive from DRIVESTAT, followed by
# the drive's total number of errors taken from DRIVECOUNT.
format_output_drive () {
    printf '%s\n' "$DRIVESTAT" | tr '!' '\012' | sed -e '/^$/d' -e 's/^[ ][ ]*//' |
    while read -r LINE; do
        fo_total=$(lookup_total "${LINE%% *}" "$DRIVECOUNT")
        report '%s  (Total occurrences (errors) for this drive is %s)\n' "$LINE" "$fo_total"
    done
}

# media_specific (-t): list the errors for $VOLUMENAME, then for every drive
# involved list all media that had errors in that drive, with a count.
media_specific () {
    # getopts normally guarantees a value; this catches an explicit empty one.
    if [[ -z $VOLUMENAME ]]; then
        echo "-t option requires value"
        remove_tmp_dir
        exit 1
    fi

    report '%s\n' "Errors involving media $VOLUMENAME"
    # The raw error lines go to the terminal only, not to the mail file.
    awk '($3 "") == (want "")' want="$VOLUMENAME" "$ERRORS_FILE"
    report '\n The drives that had an error with media %s also had errors with the following other media ... \n\n' "$VOLUMENAME"

    # Unique list of drives that appear together with this media
    awk '($3 "") == (want "") { print $6 }' want="$VOLUMENAME" "$ERRORS_FILE" | sort -u |
    while read -r DRIVE; do
        report 'NetBackup Drive - %s\n' "$DRIVE"
        awk '($6 "") == (want "") { print $3 }' want="$DRIVE" "$ERRORS_FILE" | sort -u |
        while read -r LINE; do
            # Exact match on both fields, so drive1 is never counted with drive10.
            ms_hits=$(awk '($3 "") == (m "") && ($6 "") == (d "") { n++ } END { print n + 0 }' \
                m="$LINE" d="$DRIVE" "$ERRORS_FILE")
            report ' %s (%s)\n' "$LINE" "$ms_hits"
        done
    done
}

# drive_specific (-d): list the errors for $DRIVENAME, then for every tape
# involved list all drives that tape had errors in, with a count.
drive_specific () {
    # getopts normally guarantees a value; this catches an explicit empty one.
    if [[ -z $DRIVENAME ]]; then
        echo "-d option requires value"
        remove_tmp_dir
        exit 1
    fi

    report '%s\n' "Errors involving drive $DRIVENAME ..."
    # The raw error lines go to the terminal only, not to the mail file.
    awk '($6 "") == (want "")' want="$DRIVENAME" "$ERRORS_FILE"
    report '\n The tapes that had an error in drive %s, also had errors in the following other drives ... \n\n' "$DRIVENAME"

    # Unique list of tapes that appear together with this drive
    awk '($6 "") == (want "") { print $3 }' want="$DRIVENAME" "$ERRORS_FILE" | sort -u |
    while read -r TAPE; do
        report 'Media - %s\n' "$TAPE"
        awk '($3 "") == (want "") { print $6 }' want="$TAPE" "$ERRORS_FILE" | sort -u |
        while read -r LINE; do
            ds_hits=$(awk '($3 "") == (m "") && ($6 "") == (d "") { n++ } END { print n + 0 }' \
                m="$TAPE" d="$LINE" "$ERRORS_FILE")
            report ' Drive - %s (%s)\n' "$LINE" "$ds_hits"
        done
    done
}

# cleanup: send mail if OPT_M = 1 and then remove the tmp files and directory.
cleanup () {
    if [[ $OPT_M = 1 ]]; then
        mailx -s "tperr.sh results" "$EMAILADD" <"$MAILFILE"
    fi
    remove_tmp_dir
}

# tperr_help (-h): print the usage summary.
# NOTE(review): the <...> placeholders and the tail of the last line were
# missing from the reviewed copy of this script and have been filled in;
# confirm the wording against the original.
tperr_help () {
    cat <<'EOF'
List of valid options to use on tperr.sh:
tperr.sh -a
tperr.sh -a -f /tmp/file
tperr.sh -d <drive>
tperr.sh -d <drive> -u
tperr.sh -d <drive> -f <file>
tperr.sh -d <drive> -f <file> -u
tperr.sh -t <tape>
tperr.sh -t <tape> -u
tperr.sh -t <tape> -f <file>
tperr.sh -t <tape> -f <file> -u

Description of options:
[-a] - General summary of media/ drive errors
[-f <file>] - Specify alternate location of errors file. This can be copied from another server
[-d <drive>] - For any tape that had an error in <drive>, shows the other drives that these tapes had an error in
[-t <tape>] - For any drive that had an error with <tape>, shows the other tapes that this had an error with
[-u] - Used with either the -d or -t options to limit the output to only the <drive> or <tape> specified
[-n] - Do not validate each line of the errors file. This is required if the errors file is from another NBU environment
[-m <email address>] - Send output to <email address>

[-l] - Set up media manager logs and increase verbose levels

[-L] - Collect media manager logs

For the -m option to work the Solaris mailx command must be working
It can be tested with a command such as mailx -s "Test mail" <email address> < <file>
EOF
}

# log_create (-l): switch on verbose logging in vm.conf and bp.conf.
# ltid is NOT restarted here; that must be done manually.
# NOTE(review): the start of this function was missing from the reviewed copy
# of the script. The first vm.conf test below is rebuilt from the fragment
# that survived. Any earlier steps in the original (for example creating log
# directories) could not be seen and must be carried over from it.
log_create () {
    lc_bpconf=/usr/openv/netbackup/bp.conf
    lc_vmconf=/usr/openv/volmgr/vm.conf

    if [[ ! -f $lc_vmconf ]]; then
        echo "VERBOSE" >>"$lc_vmconf"
    else
        cp "$lc_vmconf" "$lc_vmconf.safe"
        echo "VERBOSE" >>"$lc_vmconf"
    fi

    #Copy the bp.conf and vm.conf files (not copied to /tmp in case system is rebooted ...)
    #bp.conf file must exist, so just check that the copy is successful, else exit
    cp "$lc_bpconf" "$lc_bpconf.$$"
    if ! cmp -s "$lc_bpconf" "$lc_bpconf.$$"; then
        echo "Exiting as copy of bp.conf has failed, cksums do NOT match"
        remove_tmp_dir
        exit 1
    fi
    # Drop any existing BPTM/ BPBRM_VERBOSE entries and add them back at
    # level 5; quicker than checking whether they exist and != 5.
    grep -v BPTM_VERBOSE "$lc_bpconf" | grep -v BPBRM_VERBOSE >"$lc_bpconf.tperr.new"
    mv "$lc_bpconf.tperr.new" "$lc_bpconf"
    echo "BPTM_VERBOSE = 5" >>"$lc_bpconf"
    echo "BPBRM_VERBOSE = 5" >>"$lc_bpconf"

    #vm.conf file may not exist, so cp if it does, then check cp is successful, else exit
    if [[ -f $lc_vmconf ]]; then
        cp "$lc_vmconf" "$lc_vmconf.tperr"
        if ! cmp -s "$lc_vmconf" "$lc_vmconf.tperr"; then
            echo "Exiting as copy of vm.conf has failed, cksums do not match"
            remove_tmp_dir
            exit 1
        fi
        # Rewrite vm.conf so that it ends up with exactly one VERBOSE line.
        grep -v VERBOSE "$lc_vmconf" >"$lc_vmconf.tperr.new"
        mv "$lc_vmconf.tperr.new" "$lc_vmconf"
        echo "VERBOSE" >>"$lc_vmconf"
    fi
}

# log_collect (-L): copy today's debug logs and /var/adm/messages into
# $LOGDIR and tar them up as tperr_logs.tar in the current directory.
log_collect () {
    lg_today=$(date '+log.%m%d%y')

    # Create the directory when needed and then ALWAYS collect, so that a
    # first run does more than just make the directory.
    if [[ ! -d $LOGDIR ]]; then
        if ! mkdir -p "$LOGDIR"; then
            echo "Unable to create $LOGDIR"
            return 1
        fi
    fi

    # Best effort: a component that wrote no log today is silently skipped.
    cp "/usr/openv/netbackup/logs/bpbrm/$lg_today" "$LOGDIR/bpbrm.txt" >/dev/null 2>&1
    cp "/usr/openv/netbackup/logs/bptm/$lg_today" "$LOGDIR/bptm.txt" >/dev/null 2>&1
    cp "/usr/openv/volmgr/debug/ltid/$lg_today" "$LOGDIR/ltid.txt" >/dev/null 2>&1
    cp "/usr/openv/volmgr/debug/robots/$lg_today" "$LOGDIR/robots.txt" >/dev/null 2>&1
    cp "/usr/openv/volmgr/debug/reqlib/$lg_today" "$LOGDIR/reqlib.txt" >/dev/null 2>&1
    cp "/usr/openv/volmgr/debug/daemon/$lg_today" "$LOGDIR/vmd.txt" >/dev/null 2>&1
    cp /var/adm/messages "$LOGDIR"
    tar cvf tperr_logs.tar "$LOGDIR"/*
    echo "Logs copied to $LOGDIR"
    ls -al "$LOGDIR"
}

## Main script
# Script will accept -a/u/n/h/l/L as flags, with -t/m/d/f requiring a command
# line argument. See tperr.sh -h
# -l, -L and -h act immediately, while the options are still being parsed.
while getopts :f:t:m:d:haunlL option
do
    case $option in
        a) OPT_A=1 ;;
        f) OPT_F=1
           ORIG_ERRORS_FILE=$OPTARG ;;
        t) OPT_T=1
           VOLUMENAME=$OPTARG ;;
        d) OPT_D=1
           DRIVENAME=$OPTARG ;;
        u) OPT_U=1 ;;
        n) OPT_N=1 ;;
        m) OPT_M=1
           EMAILADD=$OPTARG ;;
        l) log_create
           OPT_l=1 ;;
        L) log_collect
           OPT_L=1 ;;
        h) tperr_help
           OPT_H=1 ;;
        *) echo "Incorrect Option ($OPTARG) Used" ;;
    esac
done

# Validate the option combination before any work is done, so every bad
# combination is reported, tidied up after and exits non-zero.
if [[ $OPT_L = 0 && $OPT_l = 0 && $OPT_T = 0 && $OPT_A = 0 && $OPT_D = 0 && $OPT_H = 0 ]]; then
    usage_error "Invalid options, -a, -d, -l or -t must be specified"
fi
if [[ $OPT_T = 1 && $OPT_A = 1 ]]; then
    usage_error "Please re-run using only option -t or -a at the same time"
fi
if [[ $OPT_D = 1 && $OPT_A = 1 ]]; then
    usage_error "Please re-run using only option -d or -a at the same time"
fi
if [[ $OPT_A = 1 && $OPT_U = 1 ]]; then
    usage_error "Please re-run using only option -a without option -u at the same time"
fi
if [[ $OPT_T = 1 && $OPT_D = 1 ]]; then
    usage_error "Please re-run using only option -t or -d at the same time"
fi

# Run the one analysis that was asked for.
if [[ $OPT_A = 1 ]]; then
    errorsfile
    errors_file_check
    calcstat
    format_output_tape
    format_output_drive
    cleanup
elif [[ $OPT_T = 1 ]]; then
    if [[ $OPT_U = 1 ]]; then
        unique_file
    fi
    errorsfile
    errors_file_check
    calcstat
    media_specific
    cleanup
elif [[ $OPT_D = 1 ]]; then
    if [[ $OPT_U = 1 ]]; then
        unique_file
    fi
    errorsfile
    errors_file_check
    calcstat
    drive_specific
    cleanup
else
    # Only -h, -l and/or -L were given: there is no report to mail, but the
    # scratch directory still has to go.
    remove_tmp_dir
fi