cfa9546a9f8fe1b79434faa7ec9cf81093b57d34 lrnassar Tue Jun 3 13:43:30 2025 -0700 Some small tweaks to the trackCountsParse script that generates a list of potential tracks to nest or hide based on usage. 1. Check to see if the makeUsageReport script has run, and if not run it for those dates. 2. Cleaned up some greps that needed escaped backslashes. 3. Typo fix. 4. Clarifying the line to call an easy to format tabFmt script. Refs #35858 diff --git src/utils/qa/trackCountsParse src/utils/qa/trackCountsParse index f6bd6b77d23..07d5b851c45 100755 --- src/utils/qa/trackCountsParse +++ src/utils/qa/trackCountsParse @@ -1,25 +1,25 @@ #!/usr/bin/env python3 import subprocess from collections import OrderedDict from time import localtime, strftime from datetime import datetime from dateutil.relativedelta import relativedelta import calendar import math -import subprocess,sys,argparse +import subprocess,sys,argparse,os def parseArgs(): """ Parse the command line arguments. """ parser = argparse.ArgumentParser(description = __doc__, formatter_class=argparse.RawDescriptionHelpFormatter) optional = parser._action_groups.pop() required = parser.add_argument_group('required arguments') required.add_argument ("dbs", help = "Database to query for track counts, e.g. hg19, hg38, mm10.") required.add_argument ("workDir", help = "Work directory to use for processing and final output. 
Use full path with '/' at the end.") @@ -56,37 +56,48 @@ exit(0) parser._action_groups.append(optional) options = parser.parse_args() return options def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) +def file_exists(filepath): + return os.path.isfile(filepath) + def generateTrackCounts(dbs,workDir,startDate,endDate): #Generate track usage report binned by month for a specific time frame and dbs #Format date format is XXXX-XX-XX, e.g. 2023-11-01 outputFileName = workDir+dbs+"."+startDate+"to"+endDate+".trackCounts.txt" #Run the script and remove the ct line (ct_), hub lines (hub_), dup lines (dup_) #And header line (^#), as well as remove the first column of repeating database - bash("/hive/users/chmalee/logs/byDate/makeUsageReport -t -db "+dbs+" --bin-months -s "+startDate+" -e "+endDate+" | grep -v \"hub_\|ct_\|dup_\|^#\" | cut -f2- > "+outputFileName) + + if not file_exists(outputFileName): + print("Generating new file") + print("/hive/users/chmalee/logs/byDate/makeUsageReport -t -db " + dbs + " --bin-months -s " + startDate + " -e " + endDate + " | grep -v \"hub_\\|ct_\\|dup_\\|^#\" | cut -f2- > " + outputFileName) + cmd = ("/hive/users/chmalee/logs/byDate/makeUsageReport -t -db " + dbs + " --bin-months -s " + startDate + " -e " + endDate + " | grep -v \"hub_\\|ct_\\|dup_\\|^#\" | cut -f2- > " + outputFileName) + bash(cmd) + else: + print("File already generated: " + outputFileName) + return(outputFileName) def createDicFromTrackCountsFile(trackCountsFilePath): """ This function reads a file containing track counts and creates a dictionary with track names as keys and their corresponding counts as values. 
Args: - trackCountsFilePath (str): Path to the file containing track counts. Returns: - dict: A dictionary containing track names as keys and their counts as values. """ trackList = open(trackCountsFilePath, 'r') trackCountsDic = {} @@ -225,31 +236,31 @@ listOfCountsToSort = [] n=0 #Fetch the shortLabels and make a list where each entry is a dic with trackName, shortLabel, and trackCount for key in finalDicOfTopLevelTracksAndCounts.keys(): if key != "": tdbQuery = bash("tdbQuery \"select * from "+dbs+" where track='"+key+"'\"").split("\n") if "shortLabel" in str(tdbQuery): for entry in tdbQuery: if entry.startswith("shortLabel"): n+=1 shortLabel = " ".join(entry.split(" ")[1:]) listOfCountsToSort.append({'trackName':key,'shortLabel':shortLabel,'trackCounts':int(finalDicOfTopLevelTracksAndCounts[key])}) else: n+=1 listOfCountsToSort.append({'trackName':key,'shortLabel':key,'trackCounts':int(finalDicOfTopLevelTracksAndCounts[key])}) - print("Tota number of tracks: "+str(n)) + print("Total number of tracks: "+str(n)) #Sort the trackCounts list to return in order to find data that meets cutoff threshhold listOfCountsToSort.sort(key=get_count, reverse=True) return(listOfCountsToSort) def refineTrackCountsBasedOnCutOff(listOfTracks,cutOffThreshhold,period): """ Take an ordered list containing dics of track counts and filter it based on a set threshhold. Then return a new ordered dictionary that contains the tracks below the threshhold with trackNames as keys and shortLabel and counts as values """ trackCountCutoff = listOfTracks[int(len(listOfTracks)/2)]['trackCounts']*cutOffThreshhold finalTrackCountsDic = OrderedDict() for track in listOfTracks: @@ -361,39 +372,39 @@ def writeFinalTrackListToFile(finalOutputTrackDicToReport,workDir,dbs,cutOffThreshhold,numOfMonthsToCompare): """ Take the final processed dictionary and write it out to a tsv file including the vars used in the data generation. 
""" date = datetime.today().strftime('%Y-%m') monthToParse = datetime.strftime(datetime.strptime(date, '%Y-%m') - relativedelta(months=numOfMonthsToCompare), '%Y-%m') fileNamePathStartToEndDate = workDir+monthToParse+"-"+date+"."+dbs+".tracksToArchive.tsv" finalOutputFile = open(fileNamePathStartToEndDate,'w') finalOutputFile.write("#Variables used in this file generation: dbs="+dbs+" numOfMonthsToCompare="+str(numOfMonthsToCompare)+" cutOffThreshhold="+str(cutOffThreshhold)+"\n") finalOutputFile.write("#trackName\tshortLabel\tgroup\taverageTrackCount\taverageCountComparedToMaxForPeriod\taverageCountComparedToCutoff\n") for track in finalOutputTrackDicToReport: finalOutputFile.write(track['trackName']+"\t"+track['shortLabel']+"\t"+track['group']+"\t"+str(track['averageTrackCount'])+"\t"+str(track['averageCountComparedToMaxForPeriod'])+"\t"+str(track['averageCountComparedToCutoff'])+"\n") finalOutputFile.close() print("\nCutoff tracks file complete: "+fileNamePathStartToEndDate) - print("\nYou can pipe this output into ~markd/bin/tabFmt removing the first line to get a nicely formatted output.") + print("\nYou nicely format the output as such: tail -n +2 outputFilePath.tsv | tabFmt stdin") def main(): """Initialize options and call other functions""" options = parseArgs() dbs,workDir,cutOffThreshhold,numOfMonthsToCompare = options.dbs,options.workDir,options.cutOffThreshhold,options.numOfMonthsToCompare #Line below exists only for debugging purposes - #dbs,workDir,cutOffThreshhold,numOfMonthsToCompare = 'hg38','/hive/users/lrnassar/temp/tmp/',.3,12 - if options.singleReport == True: + # dbs,workDir,cutOffThreshhold,numOfMonthsToCompare,singleReport = 'hg38','/hive/users/lrnassar/temp/tmp/',.3,6,False + if options.singleReport == True: startDate,endDate,parentChildAssociationsDic = options.startDate,options.endDate,{} logFile = generateTrackCounts(dbs,workDir,startDate,endDate) trackCountsDic,totalCount = createDicFromTrackCountsFile(logFile) parentChildAssociationsDic = 
lookUpTracksToFindParentChildAssociations(trackCountsDic,totalCount,parentChildAssociationsDic,dbs) finalDicOfTopLevelTracksAndCounts = buildFinalDicWithOnlyTopLevelTrackCounts(trackCountsDic,parentChildAssociationsDic) makeFinalFileOnTopLevelTrackCounts(finalDicOfTopLevelTracksAndCounts,workDir+"trackCounts.tsv",dbs) else: print("Script started: "+strftime("%Y-%m-%d %H:%M:%S", localtime())) dateRanges = getDateRangesForComparison(numOfMonthsToCompare) finalDicWithCutOffDics,parentChildAssociationsDic = OrderedDict(),{} for period in dateRanges: logFile = generateTrackCounts(dbs,workDir,dateRanges[period]['startDate'],dateRanges[period]['endDate']) trackCountsDic,totalCount = createDicFromTrackCountsFile(logFile) parentChildAssociationsDic = lookUpTracksToFindParentChildAssociations(trackCountsDic,totalCount,parentChildAssociationsDic,dbs)