cfa9546a9f8fe1b79434faa7ec9cf81093b57d34 lrnassar Tue Jun 3 13:43:30 2025 -0700 Some small tweaks to the trackCountsParse script that generates a list of potential tracks to nest or hide based on usage. 1. Check to see if the makeUsageReport script has run, and if not run it for those dates. 2. Cleaned up some greps that needed escaped backslashes. 3. Typo fix. 4. Clarifying the line to call an easy to format tabFmt script. Refs #35858 diff --git src/utils/qa/trackCountsParse src/utils/qa/trackCountsParse index f6bd6b77d23..07d5b851c45 100755 --- src/utils/qa/trackCountsParse +++ src/utils/qa/trackCountsParse @@ -1,25 +1,25 @@ #!/usr/bin/env python3 import subprocess from collections import OrderedDict from time import localtime, strftime from datetime import datetime from dateutil.relativedelta import relativedelta import calendar import math -import subprocess,sys,argparse +import subprocess,sys,argparse,os def parseArgs(): """ Parse the command line arguments. """ parser = argparse.ArgumentParser(description = __doc__, formatter_class=argparse.RawDescriptionHelpFormatter) optional = parser._action_groups.pop() required = parser.add_argument_group('required arguments') required.add_argument ("dbs", help = "Database to query for track counts, e.g. hg19, hg38, mm10.") required.add_argument ("workDir", help = "Work directory to use for processing and final output. 
Use full path with '/' at the end.") @@ -56,37 +56,48 @@ exit(0) parser._action_groups.append(optional) options = parser.parse_args() return options def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) +def file_exists(filepath): + return os.path.isfile(filepath) + def generateTrackCounts(dbs,workDir,startDate,endDate): #Generate track usage report binned by month for a specific time frame and dbs #Format date format is XXXX-XX-XX, e.g. 2023-11-01 outputFileName = workDir+dbs+"."+startDate+"to"+endDate+".trackCounts.txt" #Run the script and remove the ct line (ct_), hub lines (hub_), dup lines (dup_) #And header line (^#), as well as remove the first column of repeating database - bash("/hive/users/chmalee/logs/byDate/makeUsageReport -t -db "+dbs+" --bin-months -s "+startDate+" -e "+endDate+" | grep -v \"hub_\|ct_\|dup_\|^#\" | cut -f2- > "+outputFileName) + + if not file_exists(outputFileName): + print("Generating new file") + print("/hive/users/chmalee/logs/byDate/makeUsageReport -t -db " + dbs + " --bin-months -s " + startDate + " -e " + endDate + " | grep -v \"hub_\\|ct_\\|dup_\\|^#\" | cut -f2- > " + outputFileName) + cmd = ("/hive/users/chmalee/logs/byDate/makeUsageReport -t -db " + dbs + " --bin-months -s " + startDate + " -e " + endDate + " | grep -v \"hub_\\|ct_\\|dup_\\|^#\" | cut -f2- > " + outputFileName) + bash(cmd) + else: + print("File already generated: " + outputFileName) + return(outputFileName) def createDicFromTrackCountsFile(trackCountsFilePath): """ This function reads a file containing track counts and creates a dictionary with track names as keys and their corresponding counts as values. 
Args: - trackCountsFilePath (str): Path to the file containing track counts. Returns: - dict: A dictionary containing track names as keys and their counts as values. """ trackList = open(trackCountsFilePath, 'r') trackCountsDic = {} @@ -225,31 +236,31 @@ listOfCountsToSort = [] n=0 #Fetch the shortLabels and make a list where each entry is a dic with trackName, shortLabel, and trackCount for key in finalDicOfTopLevelTracksAndCounts.keys(): if key != "": tdbQuery = bash("tdbQuery \"select * from "+dbs+" where track='"+key+"'\"").split("\n") if "shortLabel" in str(tdbQuery): for entry in tdbQuery: if entry.startswith("shortLabel"): n+=1 shortLabel = " ".join(entry.split(" ")[1:]) listOfCountsToSort.append({'trackName':key,'shortLabel':shortLabel,'trackCounts':int(finalDicOfTopLevelTracksAndCounts[key])}) else: n+=1 listOfCountsToSort.append({'trackName':key,'shortLabel':key,'trackCounts':int(finalDicOfTopLevelTracksAndCounts[key])}) - print("Tota number of tracks: "+str(n)) + print("Total number of tracks: "+str(n)) #Sort the trackCounts list to return in order to find data that meets cutoff threshhold listOfCountsToSort.sort(key=get_count, reverse=True) return(listOfCountsToSort) def refineTrackCountsBasedOnCutOff(listOfTracks,cutOffThreshhold,period): """ Take an ordered list containing dics of track counts and filter it based on a set threshhold. Then return a new ordered dictionary that contains the tracks below the threshhold with trackNames as keys and shortLabel and counts as values """ trackCountCutoff = listOfTracks[int(len(listOfTracks)/2)]['trackCounts']*cutOffThreshhold finalTrackCountsDic = OrderedDict() for track in listOfTracks: @@ -361,39 +372,39 @@ def writeFinalTrackListToFile(finalOutputTrackDicToReport,workDir,dbs,cutOffThreshhold,numOfMonthsToCompare): """ Take the final processed dictionary and write it out to a tsv file including the vars used in the data generation. 
""" date = datetime.today().strftime('%Y-%m') monthToParse = datetime.strftime(datetime.strptime(date, '%Y-%m') - relativedelta(months=numOfMonthsToCompare), '%Y-%m') fileNamePathStartToEndDate = workDir+monthToParse+"-"+date+"."+dbs+".tracksToArchive.tsv" finalOutputFile = open(fileNamePathStartToEndDate,'w') finalOutputFile.write("#Variables used in this file generation: dbs="+dbs+" numOfMonthsToCompare="+str(numOfMonthsToCompare)+" cutOffThreshhold="+str(cutOffThreshhold)+"\n") finalOutputFile.write("#trackName\tshortLabel\tgroup\taverageTrackCount\taverageCountComparedToMaxForPeriod\taverageCountComparedToCutoff\n") for track in finalOutputTrackDicToReport: finalOutputFile.write(track['trackName']+"\t"+track['shortLabel']+"\t"+track['group']+"\t"+str(track['averageTrackCount'])+"\t"+str(track['averageCountComparedToMaxForPeriod'])+"\t"+str(track['averageCountComparedToCutoff'])+"\n") finalOutputFile.close() print("\nCutoff tracks file complete: "+fileNamePathStartToEndDate) - print("\nYou can pipe this output into ~markd/bin/tabFmt removing the first line to get a nicely formatted output.") + print("\nYou nicely format the output as such: tail -n +2 outputFilePath.tsv | tabFmt stdin") def main(): """Initialize options and call other functions""" options = parseArgs() dbs,workDir,cutOffThreshhold,numOfMonthsToCompare = options.dbs,options.workDir,options.cutOffThreshhold,options.numOfMonthsToCompare #Line below exists only for debugging purposes - #dbs,workDir,cutOffThreshhold,numOfMonthsToCompare = 'hg38','/hive/users/lrnassar/temp/tmp/',.3,12 - if options.singleReport == True: + # dbs,workDir,cutOffThreshhold,numOfMonthsToCompare,singleReport = 'hg38','/hive/users/lrnassar/temp/tmp/',.3,6,False + if options.singleReport == True: startDate,endDate,parentChildAssociationsDic = options.startDate,options.endDate,{} logFile = generateTrackCounts(dbs,workDir,startDate,endDate) trackCountsDic,totalCount = createDicFromTrackCountsFile(logFile) parentChildAssociationsDic = 
lookUpTracksToFindParentChildAssociations(trackCountsDic,totalCount,parentChildAssociationsDic,dbs) finalDicOfTopLevelTracksAndCounts = buildFinalDicWithOnlyTopLevelTrackCounts(trackCountsDic,parentChildAssociationsDic) makeFinalFileOnTopLevelTrackCounts(finalDicOfTopLevelTracksAndCounts,workDir+"trackCounts.tsv",dbs) else: print("Script started: "+strftime("%Y-%m-%d %H:%M:%S", localtime())) dateRanges = getDateRangesForComparison(numOfMonthsToCompare) finalDicWithCutOffDics,parentChildAssociationsDic = OrderedDict(),{} for period in dateRanges: logFile = generateTrackCounts(dbs,workDir,dateRanges[period]['startDate'],dateRanges[period]['endDate']) trackCountsDic,totalCount = createDicFromTrackCountsFile(logFile) parentChildAssociationsDic = lookUpTracksToFindParentChildAssociations(trackCountsDic,totalCount,parentChildAssociationsDic,dbs)