01291cd79ec246a80e79856a5c1f724d55aa39cd lrnassar Fri Feb 14 11:33:30 2025 -0800 Moving up the user var declaration, I had left it below its first use when I added the unfinished run cleanup.

diff --git src/utils/qa/assemblyStatsCron.py src/utils/qa/assemblyStatsCron.py
index aba7b5eb6e3..664ebf79221 100755
--- src/utils/qa/assemblyStatsCron.py
+++ src/utils/qa/assemblyStatsCron.py
@@ -1,417 +1,417 @@
#07/20/19
#This was adapted from a jupyter notebook - hence lots of weird bash calls
import datetime
from collections import OrderedDict
import getpass
import subprocess

def bash(cmd):
    """Run the cmd in bash subprocess"""
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdoutt = rawBashOutput.stdout
    except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' returned with error (code {}): {}".format(e.cmd, e.returncode, e.output))
    return(bashStdoutt)

def bashNoErrorCatch(cmd):
    """Run the cmd in bash subprocess, don't catch error since grep returns exit code 1 when no match is found"""
    try:
        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
        bashStdoutt = rawBashOutput.stdout.rstrip().split("\n")
    except:
        bashStdoutt = []
    return(bashStdoutt)

+user = getpass.getuser()
+
#Clean out any previous unfinished run
bash("rm -f /hive/users/"+user+"/ErrorLogs/*")
bash("rm -f /hive/users/"+user+"/ErrorLogsOutput/*")

-user = getpass.getuser()
-
# Get the year to query proper wwwstats directory
today = datetime.datetime.today()
year = str(today).split('-')[0]

# Get the last 5 error logs from the RR
latestLogs = bash('ls /hive/users/chmalee/logs/trimmedLogs/result/hgw1').rstrip().split("\n")

######################## TESTING MODE - ONLY PROCESS MINIMAL AMOUNT OF LOGS ########################
testMode = False #Set true for testing mode
#################################################################################################

if testMode: #Default is one RR log and one asia log
    latestLogs = latestLogs[len(latestLogs)-1:]
    nodes = ['RR', 'asiaNode'] #Add nodes with error logs, nodes can be added or removed
    machines = ['hgw1'] #Add hgw machines to check
else:
    latestLogs = latestLogs[len(latestLogs)-5:]
    nodes = ['RR', 'asiaNode', 'euroNode'] #Add nodes with error logs, nodes can be added or removed
    machines = ['hgw1','hgw2'] #Add hgw machines to check

for node in nodes:
    if node == 'RR':
        for machine in machines:
            for log in latestLogs: #Copy the 5 latest error logs for each of the rr machines
                bash("cp /hive/users/chmalee/logs/trimmedLogs/result/"+machine+'/'+log+' /hive/users/'+user+'/ErrorLogs/'+node+machine+log)
    else:
        latestLogs = bash("ls /hive/users/chmalee/logs/trimmedLogs/result/"+node).rstrip().split("\n")
        if testMode:
            latestLogs = latestLogs[len(latestLogs)-1:]
        else:
            latestLogs = latestLogs[len(latestLogs)-5:]
        for log in latestLogs: #Copy the 5 latest error logs for each of the other nodes
            bash('cp /hive/users/chmalee/logs/trimmedLogs/result/'+node+'/'+log+' /hive/users/'+user+'/ErrorLogs/'+node+log)

# Run generateUsageStats.py with -d (directory), -t (default track stats), -o (output)
bash("/cluster/home/"+user+'/kent/src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py -d /hive/users/'+user+'/ErrorLogs --allOutput -t -o /hive/users/'+user+'/ErrorLogsOutput > /dev/null')

#The following section pulls out a list of default tracks for the top X assemblies for filtering
open("/hive/users/"+user+"/ErrorLogsOutput/defaults.txt", "a") #The head command can be expanded to be more inclusive if additional assembly defaults are finding their way onto the list bash("sort /hive/users/"+user+'/ErrorLogsOutput/dbCounts.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsTopSorted.tsv') dbs = bash('head -n 4 /hive/users/'+user+'/ErrorLogsOutput/dbCountsTopSorted.tsv | cut -f1 -d "\t"').rstrip().split("\n") for db in dbs[0:4]: #The following part queries hgTracks for each of the assemblies and extracts the list of defaults bash('echo '+db+' > /hive/users/'+user+'/ErrorLogsOutput/temp.txt') defaults = bash('HGDB_CONF=$HOME/.hg.conf.beta /usr/local/apache/cgi-bin/hgTracks db='+db+' hgt.trackImgOnly=1 > /dev/null').rstrip().split("\n") for trackLogs in range(len(defaults)): if trackLogs >= (len(defaults)-1): pass else: defaultsLine = defaults[trackLogs].split(" ")[4] defaultsLine = defaultsLine.split(',') for track in range(len(defaultsLine)): defaultsLine[track] = defaultsLine[track][0:(len(defaultsLine[track])-2)] file.write(db+"\t"+defaultsLine[track]+"\n") file.write(db+"\t"+"cytoBand"+"\n") file.close() bash('echo This cronjob pulls out GB stats over the last month, across all RR machines and Asia/Euro mirrors using the generateUsageStats.py script. It only counts each hgsid occurrence once, filtering our any hgsid that only showed up one time. > /hive/users/'+user+'/ErrorLogsOutput/results.txt') ##### Report the database usage, aggregating curated hubs and GenArk ###### bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo List of db usage, hubs are aggregated across mirrors to a single count: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'dbUse >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') dbCountsRaw = open('/hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv','r') dbCountsCombined = open('/hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv','w') dbsCounts = {} for line in dbCountsRaw: line = line.rstrip().split("\t") if line[0].startswith("hub"): db = "_".join(line[0].split("_")[2:]) else: db = line[0] count = int(line[1]) if db not in dbsCounts: dbsCounts[db] = count else: dbsCounts[db] = dbsCounts[db] + count for key in dbsCounts: dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\n") dbCountsCombined.close() dbCountsRaw.close() bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted') bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') ##### Report default track usage for hg38 and hg19 ###### bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo "List of default track usage for hg38, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort 
bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')

bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo "List of default track usage for hg19, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('grep ^hg19 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15| /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')

bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo List of non-default track usage: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash("echo db$'\\t'trackUse$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('sort /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv -rnk3 > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted')
bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted | grep -v -f /hive/users/'+user+'/ErrorLogsOutput/defaults.txt > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults')
bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults | awk -v OFS="\\t" \'{ print $1,$3,$2 }\' | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')

##### Report public hub usage and non-public hub usage ######
pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "w") #Using a file to be able to use the same ~markd/bin/tabFmt format
bash("sort /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv -rnk4 -t $\'\\t\' > /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv.sorted")
allPubHubs = bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCountsHubs.tsv.sorted').rstrip().split("\n")

#This section pulls out only a single occurrence of each public hub, picking the first track (most uses) to represent it
results = OrderedDict()
for each in allPubHubs:
    each = each.split('\t')
    if each[0] not in results.keys():
        results[each[0]] = each[0:]
for key, value in results.items():
    pubHubFile.write(value[0]+"\t"+value[1]+"\t"+value[2]+"\t"+value[3]+"\n")
pubHubFile.close()

#Query hubPublic and hubStatus in order to filter out public hubs then sort out the IDs
bash('/cluster/bin/x86_64/hgsql -h genome-centdb -e "select hubUrl from hubPublic" hgcentral > /hive/users/'+user+'/ErrorLogsOutput/hubPublicHubUrl.txt')
bash('/cluster/bin/x86_64/hgsql -h genome-centdb -e "select hubUrl,id from hubStatus" hgcentral> /hive/users/'+user+'/ErrorLogsOutput/hubStatusHubUrl.txt')
bash('grep -f /hive/users/'+user+'/ErrorLogsOutput/hubPublicHubUrl.txt /hive/users/'+user+'/ErrorLogsOutput/hubStatusHubUrl.txt | cut -f2 > /hive/users/'+user+'/ErrorLogsOutput/publicIDs.txt')

#Add hub_ID format to match stats program output, then grep out the public hubs from the track list
bash('cat /hive/users/'+user+'/ErrorLogsOutput/publicIDs.txt | sed s/^/hub_/g > /hive/users/'+user+'/ErrorLogsOutput/hubPublicIDs.txt')
bash('grep "hub_" /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv | sort -rnk3 > /hive/users/'+user+'/ErrorLogsOutput/allTracksOrderedUsage.txt') #Pull out whole fields from euro and RR hubStatus in order to collect the info for matching IDs bash('ssh qateam@genome-euro "hgsql -e \'select id,hubUrl,shortLabel,lastOkTime from hubStatus\' hgcentral" > /hive/users/'+user+'/ErrorLogsOutput/genomeEuroHubStatus.txt') bash('/cluster/bin/x86_64/hgsql -h genome-centdb -e "select id,hubUrl,shortLabel,lastOkTime from hubStatus where lastOkTime !=\'\'" hgcentral > /hive/users/'+user+'/ErrorLogsOutput/RRHubStatus.txt') #The genome-asia hubStatus is automatically generated via cron by qateam on asia and copied to dev. Use line below to create a new file from personal user if user != 'qateam': bash("ssh "+user+"@genome-asia \"hgsql -e 'select id,hubUrl,shortLabel,lastOkTime from hubStatus' hgcentral\" > /hive/users/"+user+"/ErrorLogsOutput/genomeAsiaHubStatus.txt") hubs = bash('cat /hive/users/'+user+'/ErrorLogsOutput/allTracksOrderedUsage.txt').rstrip().split("\n") pubHubUrls = bash('cat /hive/users/'+user+'/ErrorLogsOutput/hubPublicHubUrl.txt').rstrip().split("\n") lastMonth = today - datetime.timedelta(days=30) lastMonthFormat = lastMonth.strftime('%Y-%m') hubsDic = OrderedDict() for hub in hubs: #Start iterating through track lines, stop pulling items at 20 if len(hubsDic.keys()) < 20: #Pull out the use number, associated db, and hubkey hub = hub.split("\t") if 'hub' in hub[1]: hubKey = hub[1].split("_")[1] elif 'hub' in hub[0]: hubKey = hub[0].split("_")[1] else: print(hub) hubDb = hub[0] hubTrack = hub[1] hubCount = hub[2] if hubKey not in hubsDic.keys(): #Check that this key has not yet been counted entryMade = False #Reset entryMade variable, made to check RR and then Euro if RR has no match # Grep out the proper hubstatus line, returns an empty list if no matches # ^1017$'\t' rrHub = bashNoErrorCatch("grep ^"+hubKey+"$'\\t' /hive/users/"+user+"/ErrorLogsOutput/RRHubStatus.txt") if rrHub != []: #Pull out matching hubURL, ShortLabel, lastOKtime (used to see compare against) #recurring hubIDs between RR and euro rrHub = rrHub[0].split("\t") if len(rrHub) > 2: rrHubURL = rrHub[1] rrHubShortLabel = rrHub[2] rrHubLastOk = rrHub[3] if rrHubLastOk != '': rrHubLastOk = datetime.datetime.strptime(rrHubLastOk.split(" ")[0], '%Y-%m-%d') if rrHubLastOk > lastMonth: #If hub has been OK in last month, assume it's correct if rrHubURL in pubHubUrls: entryMade = True else: #Pull out all relevant info and save to dictionary hubsDic[hubKey] = {} hubsDic[hubKey]['shortLabel'] = rrHubShortLabel hubsDic[hubKey]['hubURL'] = rrHubURL hubsDic[hubKey]['hubCount'] = hubCount hubsDic[hubKey]['hubDb'] = hubDb hubsDic[hubKey]['machine'] = "RR" hubsDic[hubKey]['hubTrack'] = hubTrack entryMade = True #Set true so that the hubID isn't searched for in Euro if entryMade is False: #This assumes that the hubID was either not present in the RR hubStatus, or it pointed to a hub not used in last month euroHub = bashNoErrorCatch("grep ^"+hubKey+"$'\\t' /hive/users/"+user+"/ErrorLogsOutput/genomeEuroHubStatus.txt") if euroHub != []: euroHub = euroHub[0].split("\t") if len(euroHub) > 2: euroHubURL = euroHub[1] euroHubShortLabel = euroHub[2] euroHubLastOk = euroHub[3] if euroHubLastOk != '': euroHubLastOk = datetime.datetime.strptime(euroHubLastOk.split(" ")[0], '%Y-%m-%d') if euroHubLastOk > lastMonth: if euroHub[1] in pubHubUrls: entryMade = True else: hubsDic[hubKey] = {} hubsDic[hubKey]['shortLabel'] = euroHubShortLabel 
                                    hubsDic[hubKey]['hubURL'] = euroHubURL
                                    hubsDic[hubKey]['hubCount'] = hubCount
                                    hubsDic[hubKey]['hubDb'] = hubDb
                                    hubsDic[hubKey]['machine'] = "Euro"
                                    hubsDic[hubKey]['hubTrack'] = hubTrack
                                    entryMade = True
            if entryMade is False:
                #This assumes that the hubID was either not present in the RR/Euro hubStatus, or it pointed to a hub not used in last month
                asiaHub = bashNoErrorCatch("grep ^"+hubKey+"$'\\t' /hive/users/"+user+"/ErrorLogsOutput/genomeAsiaHubStatus.txt")
                if asiaHub != []:
                    asiaHub = asiaHub[0].split("\t")
                    if len(asiaHub) > 2:
                        asiaHubURL = asiaHub[1]
                        asiaHubShortLabel = asiaHub[2]
                        asiaHubLastOk = asiaHub[3]
                        if asiaHubLastOk != '':
                            asiaHubLastOk = datetime.datetime.strptime(asiaHubLastOk.split(" ")[0], '%Y-%m-%d')
                            if asiaHubLastOk > lastMonth:
                                if asiaHub[1] in pubHubUrls:
                                    pass
                                else:
                                    hubsDic[hubKey] = {}
                                    hubsDic[hubKey]['shortLabel'] = asiaHubShortLabel
                                    hubsDic[hubKey]['hubURL'] = asiaHubURL
                                    hubsDic[hubKey]['hubCount'] = hubCount
                                    hubsDic[hubKey]['hubDb'] = hubDb
                                    hubsDic[hubKey]['machine'] = "Asia"
                                    hubsDic[hubKey]['hubTrack'] = hubTrack
                                    entryMade = True
                else:
                    bash("echo The following hub: "+hub+" was not found in the RR/euro/asia hubStatus with a lastOkTime within the last month. This likely means an error has occurred. >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")

#Create new file to write out the top 20 most used hubs
hubsNotPublic = open("/hive/users/"+user+"/ErrorLogsOutput/hubsNotPublic.txt", "a")
for key in hubsDic.keys():
    hubsNotPublic.write(hubsDic[key]['hubDb']+"\t"+hubsDic[key]['machine']+"\t"+str(hubsDic[key]['hubCount'])+"\t"+hubsDic[key]['shortLabel']+"\t"+hubsDic[key]['hubURL']+"\n")
hubsNotPublic.close()

#Work on a non-public hub list added up across all mirrors
#Create a dict of all hub counts
allHubCounts = {}
trackCounts = open("/hive/users/"+user+"/ErrorLogsOutput/trackCounts.tsv.sorted","r")
for line in trackCounts:
    line = line.rstrip().split("\t")
    if line[1].startswith("hub_"):
        if "hub" in line[0]: #check for database like hub_164399_GCA_004023905.1
            database = "_".join(line[0].split("_")[2:])
        else:
            database = line[0]
        nameToMatch = "_".join(line[1].split("_")[2:]) + database
        if nameToMatch not in allHubCounts.keys():
            allHubCounts[nameToMatch] = int(line[2])
        elif nameToMatch in allHubCounts.keys():
            allHubCounts[nameToMatch] = allHubCounts[nameToMatch] + int(line[2])
        # else: #For debugging
        #     print("This error should not happen.")
trackCounts.close()

#Now use the new list to report the most popular public hub numbers across all mirrors
pubHubList = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "r")
pubHubDic = {}
for pubHub in pubHubList:
    pubHub = pubHub.rstrip().split("\t")
    if "hub" in pubHub[1]: #check for database like hub_164399_GCA_004023905.1
        database = "_".join(pubHub[1].split("_")[2:])
        name = pubHub[2] + database
    else:
        name = pubHub[2] + pubHub[1]
        database = pubHub[1]
    #Make name a combination of track name + assembly
    pubHubDic[name] = {}
    pubHubDic[name]['trackName'] = pubHub[2]
    pubHubDic[name]['dbs'] = database
    pubHubDic[name]['hubName'] = pubHub[0]
    if name in allHubCounts:
        pubHubDic[name]['count'] = allHubCounts[name]
    else:
        print("Name not in public hub list: "+name)
pubHubList.close()

pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt", "w")
for key in pubHubDic:
    pubHubFile.write(pubHubDic[key]['dbs']+"\t"+str(pubHubDic[key]['count'])+"\t"+pubHubDic[key]['trackName']+"\t"+pubHubDic[key]['hubName']+"\n")
pubHubFile.close()
bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt')

bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo "List of public hub usage (only most used track represented). Counts added across all mirrors:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash("echo db$'\\t'trackUse$'\\t'track$'\\t'pubHub >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')

#Then do the same for non-public hubs
for hub in hubsDic:
    if "hub" in hubsDic[hub]['hubDb']: #check for database like hub_164399_GCA_004023905.1
        hubsDic[hub]['hubDb'] = "_".join(hubsDic[hub]['hubDb'].split("_")[2:])
    if "hub" in hubsDic[hub]['hubTrack']: #check for database like hub_116154_xenoRefGene
        hubsDic[hub]['hubTrack'] = "_".join(hubsDic[hub]['hubTrack'].split("_")[2:])
    nameToMatch = hubsDic[hub]["hubTrack"] + hubsDic[hub]['hubDb']
    if nameToMatch in allHubCounts.keys():
        hubsDic[hub]['updatedCount'] = allHubCounts[nameToMatch]
    # else: #For debugging
    #     print("Could not find the following hub in the track list: "+nameToMatch)
    #     print(hubsDic[hub]["hubTrack"])

#Use the hubURL as the ID and consolidate the hubs into a single entry
#Note there is a small improvement that can be made here - I pick the most popular track per mirror and compare
#it to the others. The best way would be to add up the most popular track for each hub in each mirror, a minor diff,
#but that is what leads to the most popular machine showing up as Asia or Euro for some instead of the RR.
consolidatePubHubsToWriteOut = OrderedDict()
for hub in hubsDic:
    hubUrl = hubsDic[hub]['hubURL']
    if hubUrl not in consolidatePubHubsToWriteOut.keys():
        consolidatePubHubsToWriteOut[hubUrl] = {}
        consolidatePubHubsToWriteOut[hubUrl]['count'] = hubsDic[hub]['updatedCount']
        consolidatePubHubsToWriteOut[hubUrl]['hubDb'] = hubsDic[hub]['hubDb']
        consolidatePubHubsToWriteOut[hubUrl]['shortLabel'] = hubsDic[hub]['shortLabel']
        consolidatePubHubsToWriteOut[hubUrl]['mostPopumachine'] = hubsDic[hub]['machine']
    else:
        if consolidatePubHubsToWriteOut[hubUrl]['count'] < hubsDic[hub]['updatedCount']:
            consolidatePubHubsToWriteOut[hubUrl]['count'] = hubsDic[hub]['updatedCount']
            consolidatePubHubsToWriteOut[hubUrl]['mostPopumachine'] = hubsDic[hub]['machine']

nonPubHubsFile = open("/hive/users/"+user+"/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt", "w")
for key in consolidatePubHubsToWriteOut:
    nonPubHubsFile.write(consolidatePubHubsToWriteOut[key]['hubDb']+"\t"+str(consolidatePubHubsToWriteOut[key]['count'])+"\t"+consolidatePubHubsToWriteOut[key]['shortLabel']+"\t"+key+"\t"+consolidatePubHubsToWriteOut[key]['mostPopumachine']+"\n")
nonPubHubsFile.close()

bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo "List of hub usage that are not public hubs. Counts are added across all mirrors/machines. This includes curated hubs:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash("echo db$'\\t'useCount$'\\t'shortLabel$'\\t'hubUrl$'\\t'mostPopularMachine >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt')
bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')

bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo Previous outputs of this cron can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo Monthly usage counts of all public hubs can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/publicHubUsageCounts/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
bash('echo Archive of monthly raw data can be found here: /hive/users/qateam/assemblyStatsCronArchive/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')

bash("mkdir -p /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat)
bash("cp /hive/users/"+user+"/ErrorLogsOutput/* /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat)

if user == 'qateam':
    bash("cat /hive/users/"+user+"/ErrorLogsOutput/results.txt > /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/"+lastMonthFormat)
    publicHubPageHeader = """This page contains the usage count of UCSC Genome Browser public hubs.
The numbers represent individual browsing sessions across all UCSC mirrors for the month of """+lastMonthFormat+""".

assembly\tusageCount\tmostPopularTrack\thubName
"""
    with open("/usr/local/apache/htdocs-genecats/qa/test-results/usageStats/publicHubUsageCounts/pubHubUsageCounts."+lastMonthFormat+".txt",'w') as hubsUsageFile:
        hubsUsageFile.write(publicHubPageHeader)
    bash('cat /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt >> /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/publicHubUsageCounts/pubHubUsageCounts.'+lastMonthFormat+'.txt')

resultsFile = open('/hive/users/'+user+'/ErrorLogsOutput/results.txt','r')
for line in resultsFile:
    print(line.rstrip())
resultsFile.close()

bash("rm /hive/users/"+user+"/ErrorLogs/*")
bash("rm /hive/users/"+user+"/ErrorLogsOutput/*")
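For reference, a minimal sketch of the ordering problem this commit fixes (not part of the diff; the path and the bash() helper are the script's own, echoed here only with print): reading user before getpass.getuser() has assigned it raises a NameError at the top of the cron run, which is why the declaration now sits above the cleanup calls.

import getpass

# Old ordering (broken): 'user' was read before it was assigned.
#   bash("rm -f /hive/users/"+user+"/ErrorLogs/*")   # NameError: name 'user' is not defined
#   user = getpass.getuser()

# Ordering after this commit: assign first, then use.
user = getpass.getuser()
print("rm -f /hive/users/" + user + "/ErrorLogs/*")  # the cleanup command the script passes to bash()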