src/utils/qa/qaAutoTrack.sh fb895c82bef394408aff9fa5ce3008daa4cee6e3

fb895c82bef394408aff9fa5ce3008daa4cee6e3
mspeir
  Wed Jul 31 12:16:46 2019 -0700
Fixed an issue with qaAutoTrack not properly checking date for grcIncidentDb tracks. Bumped up tooOld date for grcIncidentDb tracks to 6 months since updates seem to come infrequently from GRC. Fixed issue with runQaAutoTrack not deleting old log files. Added checks for grcIncidentDb track on danRer10 and galGal5. Removed check for danRer7 as GRC site does not seem to have data for this assembly anymore. no redmine

diff --git src/utils/qa/qaAutoTrack.sh src/utils/qa/qaAutoTrack.sh
index e09fc9b..19ecebf 100755
--- src/utils/qa/qaAutoTrack.sh
+++ src/utils/qa/qaAutoTrack.sh
@@ -1,339 +1,326 @@
 #!/bin/bash
 # quit if something within the script fails
 # Options used below:
 #   -b  report the status of terminated background jobs immediately
 #   -e  exit as soon as a command returns a non-zero status
 #   -E  functions and subshells inherit any trap set on ERR
 #   -u  expanding an unset variable is an error
 #   -o pipefail  a pipeline fails if any command in it fails
 set -beEu -o pipefail
 # Source the qaConfig.bash that "which" finds on PATH (its contents are not shown here)
 source `which qaConfig.bash`
 # Point HGDB_CONF at the .hg.conf.beta file in the home directory.
 # NOTE(review): presumably this makes hgsql query the beta server -- confirm.
 export HGDB_CONF=$HOME/.hg.conf.beta
 # Mask only the "other" write bit, so created files stay group-writable
 umask 002
 
 ################################
 #
 #  02-13-2015
 #  Matthew Speir
 #
 #  qaAutoTrack.sh
 #  Performs basic QA for auto-pushed tracks, which includes:
 #  - Checks when data for track was last updated
 #  - Coverage from featureBits -countGaps
 #  - Percentage difference in coverage between now and the last time the script was run
 #
 ################################
 
 ##### Variables #####
 # Set by command-line options
 db=""                  # first required argument
 tableName=""           # second required argument
 bigBedMode=""          # "on" when -b is given
 verboseMode=false      # true when -v is given
 overwriteLogUrl=false  # true when -s is given
 newLogDir=""           # directory given with -l
 
 # Other variables
 # Date and time are taken from a single call to date so that both always
 # describe the same instant, even if the script starts right at midnight.
 timeStamp=$(date '+%F %H_%M_%S')
 currDate=${timeStamp% *}  # YYYY-MM-DD
 currTime=${timeStamp#* }  # HH_MM_SS
 unset timeStamp
 output="" # holds output message
 logUrl="http://genecats.soe.ucsc.edu/qa/test-results/qaAutoTrackLogs"
 logDir="/usr/local/apache/htdocs-genecats/qa/test-results/qaAutoTrackLogs"
 currLogFile=""
 prevLogFile=""
 prevLogDate=""
 
 # Variables for issue checking
 issueNote=""   # accumulated issue messages; empty means no issues were found
 tooOld=""      # cutoff date; data last updated on or before it is reported
 percentDiff="" # coverage difference in percent; empty when none was calculated
 
 ##### Functions #####
 
 # Usage message function
 showHelp() {
 	# Prints the usage message to standard out; callers redirect it to
 	# standard error when it is shown because of bad input.
 	# Takes no arguments. Reads $0 to show the name the script was run as.
 	# Each printf argument below is one line of the message.
 	printf '%s\n' \
 		"Usage: ${0##*/} [-hbvs] [-l log dir] [-u log url] \$database \$table" \
 		'' \
 		'	Required arguments:' \
 		'	database	 UCSC database name, e.g. hg19 or hg38.' \
 		'	table		 Table name, e.g. gwasCatalog.' \
 		'' \
 		'	Optional arguments:' \
 		'	-h		 Display this help and exit' \
 		'	-b		 BigBed mode. Used for tracks supported by' \
 		'			 bigBed files, e.g. grcIncidentDb.' \
 		'	-v		 Verbose mode. Outputs test results to' \
 		'			 standard out as well as file.' \
 		'	-l log directory Alternate directory for output log' \
 		'	-u log url	 Alternate URL for output log.' \
 		'	-s		 Suppress URL in output. Will instead' \
 		'			 print directory in messages.' \
 		'' \
 		'Performs basic QA for auto-pushed tracks, which includes:' \
 		'- Checks when data for track was last updated' \
 		'- Coverage from featureBits -countGaps' \
 		'- Percentage difference in coverage between now and' \
 		'  the last time the script was run' \
 		'' \
 		'By default, all results are output to a file and only issues' \
 		'are output to the command line. Use the "-v" option to see' \
 		'results on the command line as well. All log files are output to:' \
 		'http://genecats.soe.ucsc.edu/qa/test-results/qaAutoTrackLogs.' \
 		'' \
 		'Notes:' \
 		'	- For OMIM, ClinGen (formerly ISCA), or ClinVar tracks use omim, isca, or clinvar as the table name.' \
 		'	- If run more than once per day, then subsequent log files will include current time in name.' \
 		''
 }
 
 # Output function
 function outputCovDiff () {
 	# Appends the report for one table to the global "output" string and sets
 	# the global "percentDiff".
 	#
 	# four positional arguments. $1 == prevLogFile. $2 == tblCov. $3 == tableName. $4 == tblDate.
 	# Dependency: Coverage information from previous log file (positional argument $1)
 	# If no previous log file exists or if previous log file doesn't contain usable
 	# coverage info for this table, then no coverage diff will be calculated and
 	# the percentDiff variable is left as ""
 
 	local rawCount rawCountPrev
 	local prevCov=""
 
 	# Start clean so a diff calculated for an earlier table is never reused here
 	percentDiff=""
 
 	if [[ -n "$1" ]] # Check for previous log file. True if file exists.
 	then
 		# Grabs coverage from previous log file. The table name must match a
 		# whole line, so "iscaBenign" does not also pick up "iscaBenignGainCum".
 		# "|| true" keeps a log without an entry for this table from ending the
 		# script when "set -e -o pipefail" is active.
 		prevCov=$(grep -A2 -- "^$3\$" "$1" | grep "^Coverage New" | cut -d" " -f3- || true)
 	fi
 
 	# First word of each coverage string is the raw count
 	rawCount=$(echo $2 | awk '{print $1}')
 	rawCountPrev=$(echo $prevCov | awk '{print $1}')
 
 	# Only calculate a diff when both counts are plain non-negative integers
 	if [[ "$rawCount" =~ ^[0-9]+$ && "$rawCountPrev" =~ ^[0-9]+$ ]]
 	then
 		# Difference relative to the average of both counts, in percent:
 		# 100 * |new - old| / ((new + old) / 2). Two zero counts give 0.
 		percentDiff=$(awk -v new="$rawCount" -v old="$rawCountPrev" 'BEGIN {
 			diff = new - old
 			if (diff < 0) diff = -diff
 			sum = new + old
 			print (sum == 0 ? 0 : 200 * diff / sum)
 		}')
 
 		# Build output string
 		output+="$3\nLast updated: $4\nCoverage New: $2\nCoverage Old: $prevCov\nCoverage Diff: $percentDiff%\n\n"
 	else
 		output+="$3\nLast updated: $4\nCoverage New: $2\n\n"
 	fi
 }
 
 # Function to raise errors
 function checkForIssues () {
 	# Appends one line to the global "issueNote" for every problem found with a
 	# table. Reads the global "db" to name the table in the messages.
 	#
 	# four positional arguments. $1 == tblDate. $2 == tooOld. $3 == tableName. $4 == percentDiff.
 	# Dependency: percentDiff, positional argument $4, is expected to be set by outputCovDiff
 	# If percent diff is not set (so it retains default empty "" value), then checkForIssues
 	# will raise no coverage error.
 
 	local tblEpoch="" oldEpoch percentDiffRounded
 
 	# Convert both dates to seconds since the epoch. An empty update time is
 	# not passed to date, because date would read it as "today".
 	if [[ -n "$1" ]]
 	then
 		tblEpoch=$(date -d "$1" +%s 2>/dev/null) || tblEpoch=""
 	fi
 	oldEpoch=$(date -d "$2" +%s)
 
 	if [[ -z "$tblEpoch" ]]
 	then
 		# Raises an error if the update time is missing or cannot be parsed,
 		# instead of silently treating the table as up to date
 		issueNote+="$db.$3 has no readable update time (got \"$1\")\n"
 	elif [ "$tblEpoch" -le "$oldEpoch" ]
 	then
 		# Raises an error if it's been too long since last update
 		issueNote+="$db.$3 has not been updated since $1\n"
 	fi
 
 	# Raises error if coverage diff between track versions is too large
 	if [[ -n "$4" ]]
 	then
 		# Round our percentDiff to 3 decimal places.
 		printf -v percentDiffRounded '%.3f' "$4"
 		# awk exits with status 0 when the rounded value is 10 or more
 		if awk -v pct="$percentDiffRounded" 'BEGIN { exit !(pct >= 10) }'
 		then
 			issueNote+="Large coverage diff for $db.$3\n"
 		fi
 	fi
 }
 
 ##### Parse command-line input #####
 
 OPTIND=1 # Reset is necessary if getopts was used previously in the script.
 # h, b, v and s are plain flags; l and u each take an argument, found in OPTARG.
 while getopts "hl:u:bvs" opt
 do
 	case $opt in
 		h)
 			showHelp
 			exit 0
 			;;
 		v)
 			verboseMode=true
 			;;
 		b)
 			bigBedMode="on"
 			;;
 		l)
 			# Check if directory is valid before attempting to write to it.
 			if [ -d "$OPTARG" ]
 			then
 				newLogDir=$OPTARG
 			else
 				# Name the value that was given to -l. The second word of the
 				# command line ($2) is not necessarily that value and may not
 				# exist at all, which is fatal under "set -u".
 				echo -e "Sorry, directory \"$OPTARG\" does not exist."
 				echo -e "Check spelling or create this directory and try again.\n"
 				exit 1
 			fi
 			;;
 		u)
 			logUrl=$OPTARG
 			;;
 		s)
 			overwriteLogUrl=true
 			;;
 		'?')
 			# Unknown option or missing option argument
 			showHelp >&2
 			exit 1
 			;;
 	esac
 done
 
 shift "$((OPTIND-1))" # Shift off the options and optional --.
 
 # Check number of required arguments
 if [ $# -ne 2 ]
 then
 	# Output usage message if number of required arguments is wrong
 	showHelp >&2
 	exit 1
 else
 	# Set variables with required argument values
 	db=$1
 	tableName=$2
 fi
 
 ##### Main Program #####
 
 # Set some variables based on optional input
 if [[ -n "$newLogDir" ]]
 then
 	# Set logDir if newLogDir is provided
 	logDir="$newLogDir"
 fi
 if [[ "$overwriteLogUrl" == true ]]
 then
 	# Overwrite url if option is set
 	logUrl=$logDir
 fi
 
 # set currLogFile
 currLogFile="$db.$tableName.$currDate.$currTime.txt"
 
 # set info for prevLog
 # Log files are named <db>.<table>.<date>.<time>.txt and "ls -t" lists the
 # newest first. awk keeps the date and time of the first name whose db and
 # table fields are exactly the requested ones, so other tables that merely
 # start with the same letters are not picked up. awk reads the listing to the
 # end, so ls is never cut off by a closed pipe while pipefail is active.
 prevLogStamp=$(ls -Lt "$logDir" | awk -F . -v d="$db" -v t="$tableName" \
 	'($1 "") == (d "") && ($2 "") == (t "") && !found++ { print $3, $4 }')
 prevLogDate=${prevLogStamp% *}
 prevLogTime=${prevLogStamp#* }
 unset prevLogStamp
 
 #initialize output string
 output="\n$db\n"
 
 # Only remember the previous log if that file really exists
 if [ -e "$logDir/$db.$tableName.$prevLogDate.$prevLogTime.txt" ]
 then
 	prevLogFile="$logDir/$db.$tableName.$prevLogDate.$prevLogTime.txt"
 fi
 
 # Set tooOld for different tables: data last updated on or before this date
 # is reported as an issue. clinvar gets 2 months, grcIncidentDb 6 months and
 # every other table 1 month.
-if  [[ "$tableName" == "clinvar" ]] || [[ "$tableName" == "grcIndcidentDb" ]]
+if  [[ "$tableName" == "clinvar" ]]
 then
 	tooOld=$(date -d "$currDate - 2 months" +%F)
+elif [[ "$tableName" == "grcIncidentDb" ]]
+then
+	tooOld=$(date -d "$currDate - 6 months" +%F)
 else
 	tooOld=$(date -d "$currDate - 1 month" +%F)
 fi
 
 # Run tests for different tracks 
 # Each branch below collects an update time (tblDate) and, where the data has
 # coordinates, a featureBits coverage line (tblCov), then calls outputCovDiff
 # to extend the report and checkForIssues to record problems.
 if [[ $bigBedMode == "on" ]]
 then
 	# ClinVar has multiple tables
 	if [[ $tableName == "clinvar" ]]
 	then
 		for tbl in clinvarMain clinvarCnv
 		do
 			# Get file name from beta
 			# (the first row returned for the table is used as the bigBed file path)
 			fileName=$(hgsql -Ne "SELECT * FROM $tbl LIMIT 1" $db)
 			# Get table update time from beta
 			# NOTE(review): the $(stat ...) sits inside double quotes, so it is
 			# expanded by the local shell before ssh starts; only "date" runs on
 			# hgwbeta. Confirm that reading the local copy of the file is intended.
 			tblDate=$(ssh qateam@hgwbeta "date -d '$(stat -Lc '%y' $fileName)' +%F' '%T")
 			MYTEMPFILE=$(mktemp --tmpdir tmp.XXXXXXXXXX.bed)
 			# featureBits doesn't work with bigBeds, need to turn into bed first
 			ssh qateam@hgwbeta "/usr/local/apache/cgi-bin/utils/bigBedToBed $fileName stdout" > $MYTEMPFILE
 			# 2>&1: anything featureBits writes to standard error ends up in tblCov too
 			tblCov=$(featureBits -countGaps $db $MYTEMPFILE 2>&1)
 
 			outputCovDiff "$prevLogFile" "$tblCov" "$tbl" "$tblDate"
 			# Check for issues with table
 			checkForIssues "$tblDate" "$tooOld" "$tbl" "$percentDiff"
 
 			rm -f $MYTEMPFILE
 		done
-	# GRC Incident track relies on remote file so curl must be used instead of stat
-	elif [[ $tableName == "grcIncidentDb" ]]
-	then
-		fileName=$(hgsql -Ne "SELECT * FROM $tableName LIMIT 1" $db)
-		# Use curl to get update time on file
-		tblDate=$(date -d "$(curl -s -v -I $fileName 2>&1 | grep '^< Last-Modified:'| cut -d ' ' -f3- )" '+%F %T')
-		MYTEMPFILE=$(mktemp --tmpdir tmp.XXXXXXXXXX.bed)
-		# featureBits doesn't work with bigBeds, need to turn into bed first
-		bigBedToBed $fileName $MYTEMPFILE
-		tblCov=$(featureBits -countGaps $db $MYTEMPFILE 2>&1)
-
-		outputCovDiff "$prevLogFile" "$tblCov" "$tableName" "$tblDate"
-		# Check for issues with table
-		checkForIssues "$tblDate" "$tooOld" "$tableName" "$percentDiff"
-			
-		rm -f $MYTEMPFILE
 	# Tests for all other bigBed based autopushed tracks (assuming they don't use remote bigBed files)
 	else
 		# Same steps as the ClinVar loop above, for the single table given on the command line
 		fileName=$(hgsql -Ne "SELECT * FROM $tableName LIMIT 1" $db)
 		# Get table update time from beta
 		# NOTE(review): as above, stat is expanded locally before ssh runs -- confirm.
 		tblDate=$(ssh qateam@hgwbeta "date -d '$(stat -Lc '%y' $fileName)' +%F' '%T")
 		MYTEMPFILE=$(mktemp --tmpdir tmp.XXXXXXXXXX.bed)
 		# featureBits doesn't work with bigBeds, need to turn into bed first
 		ssh qateam@hgwbeta "/usr/local/apache/cgi-bin/utils/bigBedToBed $fileName stdout" > $MYTEMPFILE
 		tblCov=$(featureBits -countGaps $db $MYTEMPFILE 2>&1)
 
 		outputCovDiff "$prevLogFile" "$tblCov" "$tableName" "$tblDate"
 	
 		# Check for issues with table
 		checkForIssues "$tblDate" "$tooOld" "$tableName" "$percentDiff"
 
 		rm -f $MYTEMPFILE
 	fi
 # Tests for non-bigBed tracks
 else
 	# OMIM and ISCA both have a large number of tables
 	if [[ $tableName == "omim" ]] || [[ $tableName == "isca" ]]
 	then
 		for tbl in $(hgsql -Ne "SHOW TABLES LIKE '%$tableName%'" $db) # Grabs list of all omim or isca tables from beta
         	do
 			# Update time as recorded in information_schema for this table
 			tblDate=$(hgsql -Ne "SELECT UPDATE_TIME FROM information_schema.tables WHERE TABLE_SCHEMA='$db' AND TABLE_NAME='$tbl'")
 			# Only some omim tables have coordinates
                 	if [[ $tbl == "omimGene2" ]] || [[ $tbl == "omimAvSnp" ]] || [[ $tbl == "omimLocation" ]] || [[ $tableName == "isca" ]]
                 	then
                         	tblCov=$(ssh qateam@hgwbeta "featureBits -countGaps $db $tbl 2>&1")
 
 				outputCovDiff "$prevLogFile" "$tblCov" "$tbl" "$tblDate"
 				# Check for issues with table
 				checkForIssues "$tblDate" "$tooOld" "$tbl" "$percentDiff"
 
 			# Output for tables that don't contain coordinates
 			else
 				output+="$tbl\nLast updated: $tblDate\n\n"
 				# Check for issues with table
 				# (empty fourth argument: only the update time is checked)
 				checkForIssues "$tblDate" "$tooOld" "$tbl" ""
 			fi
 		done
 	# Tests for all other table based autopushed tracks
 	else
 		tblDate=$(hgsql -Ne "SELECT UPDATE_TIME FROM information_schema.tables WHERE TABLE_SCHEMA='$db' AND TABLE_NAME='$tableName'")
 		# featureBits is run on hgwbeta here; its standard error is merged into tblCov
 		tblCov=$(ssh qateam@hgwbeta "featureBits -countGaps $db $tableName 2>&1")
 
 		outputCovDiff "$prevLogFile" "$tblCov" "$tableName" "$tblDate"
 
 		# Check for issues with table
 		checkForIssues "$tblDate" "$tooOld" "$tableName" "$percentDiff"
 	fi
 fi
 
 # Output results of tests
 # Issues, if any, are shown on standard out and start the log file
 if [[ -n "$issueNote" ]]
 then
 	# Finish the issue note with the URL of the log file
 	issueNote="${issueNote}\nSee $logUrl/$currLogFile for more details about these errors.\n\n"
 
 	echo -e $issueNote | tee $logDir/$currLogFile
 fi
 
 # The full report always goes to the end of the log file; verbose mode
 # also copies it to standard out
 case "$verboseMode" in
 	true)
 		echo -e $output | tee -a $logDir/$currLogFile
 		;;
 	*)
 		echo -e $output >> $logDir/$currLogFile
 		;;
 esac