src/utils/qa/assemblyStatsCron.py 77c1fd0753c5b43a3c10baa351ed40962be704cb

77c1fd0753c5b43a3c10baa351ed40962be704cb
lrnassar
  Tue Aug 27 16:53:48 2024 -0700
Small change to generateUsageStats.py was to improve how it splits character lines. Before if counts were high, e.g. track:1034, it would slice out last 2 numbers but keep the :10. For the assemblyStatsCron various changes were made as documented in the RM ticket. This script is a fairly big mess, and has a lot of room for improvement via functions, better commentings, not using bash to write out, and general optimizations like parsing files mutltiple times. However, it is only used internally for general exploratory overviews and thus it has not warranted a proper restructuring and cleaning. The primary motivation here was to add up counts from track usage across all hubs instead of only reporting the highest one. Refs #34266

diff --git src/utils/qa/assemblyStatsCron.py src/utils/qa/assemblyStatsCron.py
index 76c44b4..33703ef 100755
--- src/utils/qa/assemblyStatsCron.py
+++ src/utils/qa/assemblyStatsCron.py
@@ -1,287 +1,416 @@
 #07/20/19
 #This was adapted from a jupyter notebook - hence lots of weird bash calls
 
 import datetime
 from collections import OrderedDict
 import getpass
 import subprocess
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def bashNoErrorCatch(cmd):
     """Run the cmd in bash subprocess, don't catch error since grep returns exit code 1 when no match is found"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout.rstrip().split("\n")
     except:
         bashStdoutt = []
     return(bashStdoutt)
 
 user = getpass.getuser()
 
 # Get the year to query proper wwwstats directory
 today = datetime.datetime.today()
 year = str(today).split('-')[0]
 
 # Get the last 5 error logs from the RR
 latestLogs = bash('ls /hive/users/chmalee/logs/trimmedLogs/result/hgw1').rstrip().split("\n")
-######################## FOR TESTING JUST ONE LOG
-# latestLogs = latestLogs[len(latestLogs)-1:]
-#########################
-latestLogs = latestLogs[len(latestLogs)-5:]
 
-nodes = ['RR', 'asiaNode', 'euroNode'] #Add nodes with error logs, nodes can be added or removed
+######################## TESTING MODE - ONLY PROCESS MINIMAL AMOUNT OF LOGS ########################
+testMode = False #Set true for testing mode
+#################################################################################################
 
+if testMode: #Default is one RR log and one asia log
+    latestLogs = latestLogs[len(latestLogs)-1:]
+    nodes = ['RR', 'asiaNode'] #Add nodes with error logs, nodes can be added or removed
+    machines = ['hgw1'] #Add hgw machines to check
+else:
+    latestLogs = latestLogs[len(latestLogs)-5:]
+    nodes = ['RR', 'asiaNode', 'euroNode'] #Add nodes with error logs, nodes can be added or removed
     machines = ['hgw1','hgw2'] #Add hgw machines to check
+
 for node in nodes:
     if node == 'RR':
         for machine in machines:
             for log in latestLogs: #Copy the 5 latest error logs for each of the rr machines
                 bash("cp /hive/users/chmalee/logs/trimmedLogs/result/"+machine+'/'+log+' /hive/users/'+user+'/ErrorLogs/'+node+machine+log)
 
     else:
         latestLogs = bash("ls /hive/users/chmalee/logs/trimmedLogs/result/"+node).rstrip().split("\n")
+        if testMode:
+            latestLogs = latestLogs[len(latestLogs)-1:]
+        else:
             latestLogs = latestLogs[len(latestLogs)-5:]
+            
         for log in latestLogs: #Copy the 5 latest error logs for each of the other nodes
             bash('cp /hive/users/chmalee/logs/trimmedLogs/result/'+node+'/'+log+' /hive/users/'+user+'/ErrorLogs/'+node+log)
 
 # Run generateUsageStats.py with -d (directory), -t (default track stats), -o (output)
 bash("/cluster/home/"+user+'/kent/src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py -d /hive/users/'+user+'/ErrorLogs --allOutput -t -o /hive/users/'+user+'/ErrorLogsOutput > /dev/null')
 
 #The following section pulls out a list of default track for the top X assemblies for filtering
 file = open("/hive/users/"+user+"/ErrorLogsOutput/defaults.txt", "a")
 
 #The head command can be expanded to be more inclusive if additional assembly defaults are finding their way onto the list
 bash("sort /hive/users/"+user+'/ErrorLogsOutput/dbCounts.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsTopSorted.tsv')
 dbs = bash('head -n 4 /hive/users/'+user+'/ErrorLogsOutput/dbCountsTopSorted.tsv | cut -f1 -d "\t"').rstrip().split("\n")
 
 
 for db in dbs[0:4]:
     #The following part queries hgTracks for each of the assemblies and extracts the list of defaults
     bash('echo '+db+' > /hive/users/'+user+'/ErrorLogsOutput/temp.txt')
     defaults = bash('HGDB_CONF=$HOME/.hg.conf.beta /usr/local/apache/cgi-bin/hgTracks db='+db+' hgt.trackImgOnly=1 > /dev/null').rstrip().split("\n")
 
     for trackLogs in range(len(defaults)):
         if trackLogs >= (len(defaults)-1):
             pass
         else:
             defaultsLine = defaults[trackLogs].split(" ")[4]
             defaultsLine = defaultsLine.split(',')
             for track in range(len(defaultsLine)):
                 defaultsLine[track] = defaultsLine[track][0:(len(defaultsLine[track])-2)]
                 file.write(db+"\t"+defaultsLine[track]+"\n")
     file.write(db+"\t"+"cytoBand"+"\n")
 file.close()
 
 bash('echo This cronjob pulls out GB stats over the last month, across all RR machines and Asia/Euro mirrors using the generateUsageStats.py script. It only counts each hgsid occurrence once, filtering our any hgsid that only showed up one time. > /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
+##### Report the database usage, aggregating curated hubs and GenArk ######
+
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('echo List of db usage: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('echo List of db usage, hubs are aggregated across mirrors to a single count: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'dbUse >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
-bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv.sorted')
-bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv.sorted | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+dbCountsRaw = open('/hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv','r')
+dbCountsCombined = open('/hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv','w')
+
+dbsCounts = {}
+for line in dbCountsRaw:
+    line = line.rstrip().split("\t")
+    if line[0].startswith("hub"):
+        db = "_".join(line[0].split("_")[2:])
+    else:
+        db = line[0]
+    count = int(line[1])
+    if db not in dbsCounts:
+        dbsCounts[db] = count
+    else:
+        dbsCounts[db] = dbsCounts[db] + count
+        
+for key in dbsCounts:
+    dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\n")
+
+dbCountsCombined.close()
+dbCountsRaw.close()
+    
+bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted')
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+
+##### Report default track usage for hg38 and hg19 ######
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of default track usage for hg38, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of default track usage for hg19, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('grep ^hg19 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15| /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo List of non-default track usage: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('sort /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv -rnk3 > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted')
 bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted | grep -v -f /hive/users/'+user+'/ErrorLogsOutput/defaults.txt > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults')
 bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults | awk -v OFS="\\t" \'{ print $1,$3,$2 }\' | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
-bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('echo "List of public hub usage (only most used track represented):" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash("echo db$'\\t'trackUse$'\\t'track$'\\t'pubHub >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
-bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+##### Report public hub usage and non-public hub usage ######
 
-file2 = open("/hive/users/"+user+"/ErrorLogsOutput/filteredHubs.txt", "a") #Using a file to be able to use the same ~markd/bin/tabFmt format
+pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "w") #Using a file to be able to use the same ~markd/bin/tabFmt format
 bash("sort /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv -rnk4 -t $\'\\t\' > /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv.sorted")
-topHubs = bash('head -n 15000 /hive/users/'+user+'/ErrorLogsOutput/trackCountsHubs.tsv.sorted').rstrip().split("\n")
+allPubHubs = bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCountsHubs.tsv.sorted').rstrip().split("\n")
+
 #This section pulls out only a single occurence of each public hub, picking the first track (most uses) to represent it
 results = OrderedDict()
-for each in topHubs:
-    if len(results.keys()) < 15:
+for each in allPubHubs:
     each = each.split('\t')
     if each[0] not in results.keys():
         results[each[0]] = each[0:]
-    else:
-        pass
-for key, value in results.items():
-    file2.write(value[0]+"\t"+value[1]+"\t"+value[2]+"\t"+value[3]+"\n")
-file2.close()
-
-bash('cat /hive/users/'+user+'/ErrorLogsOutput/filteredHubs.txt | awk -F "\\t" -v OFS="\\t" \'{ print $2,$4,$3,$1 }\' | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-
-bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('echo List of public hub track usage overall: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash("echo db$'\\t'trackUse$'\\t'track$'\\t'pubHub >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
-bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
-bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCountsHubs.tsv.sorted | awk -F "\\t" -v OFS="\\t" \'{ print $2,$4,$3,$1 }\' | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+for key, value in results.items():
+    pubHubFile.write(value[0]+"\t"+value[1]+"\t"+value[2]+"\t"+value[3]+"\n")
+pubHubFile.close()
 
 #Query hubPublic and hubStats in order to filter out public hubs then sort out the IDs
 bash('/cluster/bin/x86_64/hgsql -h genome-centdb -e "select hubUrl from hubPublic" hgcentral > /hive/users/'+user+'/ErrorLogsOutput/hubPublicHubUrl.txt')
 bash('/cluster/bin/x86_64/hgsql -h genome-centdb -e "select hubUrl,id from hubStatus" hgcentral> /hive/users/'+user+'/ErrorLogsOutput/hubStatusHubUrl.txt')
 bash('grep -f /hive/users/'+user+'/ErrorLogsOutput/hubPublicHubUrl.txt /hive/users/'+user+'/ErrorLogsOutput/hubStatusHubUrl.txt | cut -f2 > /hive/users/'+user+'/ErrorLogsOutput/publicIDs.txt')
 
 #Add hub_ID format to match stats program output, then grep out the public hubs from the track list
 bash('cat /hive/users/'+user+'/ErrorLogsOutput/publicIDs.txt | sed s/^/hub_/g > /hive/users/'+user+'/ErrorLogsOutput/hubPublicIDs.txt')
 bash('grep "hub_" /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv | sort -rnk3 > /hive/users/'+user+'/ErrorLogsOutput/allTracksOrderedUsage.txt')
 
 #Pull out whole fields from euro and RR hubStatus in order to collect the info for matching IDs
 bash('ssh qateam@genome-euro "hgsql -e \'select id,hubUrl,shortLabel,lastOkTime from hubStatus\' hgcentral" > /hive/users/'+user+'/ErrorLogsOutput/genomeEuroHubStatus.txt')
 bash('/cluster/bin/x86_64/hgsql -h genome-centdb -e "select id,hubUrl,shortLabel,lastOkTime from hubStatus where lastOkTime !=\'\'" hgcentral > /hive/users/'+user+'/ErrorLogsOutput/RRHubStatus.txt')
 #The genome-asia hubStatus is automatically generated via cron by qateam on asia and copied to dev. Use line below to create a new file from personal user
 if user != 'qateam':
     bash("ssh "+user+"@genome-asia \"hgsql -e 'select id,hubUrl,shortLabel,lastOkTime from hubStatus' hgcentral\" > /hive/users/"+user+"/ErrorLogsOutput/genomeAsiaHubStatus.txt")
 
 hubs = bash('cat /hive/users/'+user+'/ErrorLogsOutput/allTracksOrderedUsage.txt').rstrip().split("\n")
 pubHubUrls = bash('cat /hive/users/'+user+'/ErrorLogsOutput/hubPublicHubUrl.txt').rstrip().split("\n")
 lastMonth = today - datetime.timedelta(days=30)
 lastMonthFormat = lastMonth.strftime('%Y-%m')
 hubsDic = OrderedDict()
 
 for hub in hubs: #Start iterating through track lines, stop pulling items at 20
     if len(hubsDic.keys()) < 20:
         #Pull out the use number, associated db, and hubkey
         hub = hub.split("\t")
         if 'hub' in hub[1]:
             hubKey = hub[1].split("_")[1]
         elif 'hub' in hub[0]:
             hubKey = hub[0].split("_")[1]
         else:
             print(hub)
 
-        hubCount = hub[2]
         hubDb = hub[0]
+        hubTrack = hub[1]
+        hubCount = hub[2]
+        
         if hubKey not in hubsDic.keys(): #Check that this key has not yet been counted
             entryMade = False #Reset entryMade variable, made to check RR and then Euro if RR has no match
             # Grep out the proper hubstatus line, returns an empty list if no matches
             #        ^1017$'\t'
             rrHub = bashNoErrorCatch("grep ^"+hubKey+"$'\\t' /hive/users/"+user+"/ErrorLogsOutput/RRHubStatus.txt")
 
             if rrHub != []:
                 #Pull out matching hubURL, ShortLabel, lastOKtime (used to see compare against)
                 #recurring hubIDs between RR and euro
                 rrHub = rrHub[0].split("\t")
                 if len(rrHub) > 2:
                     rrHubURL = rrHub[1]
                     rrHubShortLabel = rrHub[2]
                     rrHubLastOk = rrHub[3]
                     if rrHubLastOk != '':
                         rrHubLastOk = datetime.datetime.strptime(rrHubLastOk.split(" ")[0], '%Y-%m-%d')
                         if rrHubLastOk > lastMonth: #If  hub has been OK in last month, assume it's correct
                             if rrHubURL in pubHubUrls:
                                 entryMade = True
                             else: #Pull out all relevant info and save to dictionary
                                 hubsDic[hubKey] = {}
                                 hubsDic[hubKey]['shortLabel'] = rrHubShortLabel
                                 hubsDic[hubKey]['hubURL'] = rrHubURL
                                 hubsDic[hubKey]['hubCount'] = hubCount
                                 hubsDic[hubKey]['hubDb'] = hubDb
                                 hubsDic[hubKey]['machine'] = "RR"
+                                hubsDic[hubKey]['hubTrack'] = hubTrack
                                 entryMade = True #Set true so that the hubID isn't searched for in Euro
 
             if entryMade is False: #This assumes that the hubID was either not present in the RR hubStatus, or it pointed to a hub not used in last month
                 euroHub = bashNoErrorCatch("grep ^"+hubKey+"$'\\t' /hive/users/"+user+"/ErrorLogsOutput/genomeEuroHubStatus.txt")
                 if euroHub != []:
                     euroHub = euroHub[0].split("\t")
                     if len(euroHub) > 2:
                         euroHubURL = euroHub[1]
                         euroHubShortLabel = euroHub[2]
                         euroHubLastOk = euroHub[3]
                         if euroHubLastOk != '':
                             euroHubLastOk = datetime.datetime.strptime(euroHubLastOk.split(" ")[0], '%Y-%m-%d')
                             if euroHubLastOk > lastMonth:
                                 if euroHub[1] in pubHubUrls:
                                     entryMade = True
                                 else:
                                     hubsDic[hubKey] = {}
                                     hubsDic[hubKey]['shortLabel'] = euroHubShortLabel
                                     hubsDic[hubKey]['hubURL'] = euroHubURL
                                     hubsDic[hubKey]['hubCount'] = hubCount
                                     hubsDic[hubKey]['hubDb'] = hubDb
                                     hubsDic[hubKey]['machine'] = "Euro"
+                                    hubsDic[hubKey]['hubTrack'] = hubTrack
                                     entryMade = True
 
             if entryMade is False: #This assumes that the hubID was either not present in the RR hubStatus, or it pointed to a hub not used in last month
                 asiaHub = bashNoErrorCatch("grep ^"+hubKey+"$'\\t' /hive/users/"+user+"/ErrorLogsOutput/genomeAsiaHubStatus.txt")
                 if asiaHub != []:
                     asiaHub = asiaHub[0].split("\t")
                     if len(asiaHub) > 2:
                         asiaHubURL = asiaHub[1]
                         asiaHubShortLabel = asiaHub[2]
                         asiaHubLastOk = asiaHub[3]
                         if asiaHubLastOk != '':
                             asiaHubLastOk = datetime.datetime.strptime(asiaHubLastOk.split(" ")[0], '%Y-%m-%d')
                             if asiaHubLastOk > lastMonth:
                                 if asiaHub[1] in pubHubUrls:
                                     pass
                                 else:
                                     hubsDic[hubKey] = {}
                                     hubsDic[hubKey]['shortLabel'] = asiaHubShortLabel
                                     hubsDic[hubKey]['hubURL'] = asiaHubURL
                                     hubsDic[hubKey]['hubCount'] = hubCount
                                     hubsDic[hubKey]['hubDb'] = hubDb
                                     hubsDic[hubKey]['machine'] = "Asia"
+                                    hubsDic[hubKey]['hubTrack'] = hubTrack
                                     entryMade = True
                     else:
                         bash("echo The following hub: "+hub+" was not found in the RR/euro/asia hubStatus with a lastOkTime within the last month. This likely means an error has occurred. >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 
 #Create new file to write out the top 20 most used hubs
 hubsNotPublic = open("/hive/users/"+user+"/ErrorLogsOutput/hubsNotPublic.txt", "a")
 for key in hubsDic.keys():
     hubsNotPublic.write(hubsDic[key]['hubDb']+"\t"+hubsDic[key]['machine']+"\t"+str(hubsDic[key]['hubCount'])+"\t"+hubsDic[key]['shortLabel']+"\t"+hubsDic[key]['hubURL']+"\n")
 hubsNotPublic.close()
 
+#Work on a non-public hub list added up across all mirrors
+#Create a dic of a all hub counts
+allHubCounts = {}
+trackCounts = open("/hive/users/"+user+"/ErrorLogsOutput/trackCounts.tsv.sorted","r")
+for line in trackCounts:
+    line = line.rstrip().split("\t")
+    if line[1].startswith("hub_"):
+        if "hub" in line[0]: #check for database like hub_164399_GCA_004023905.1
+            database = "_".join(line[0].split("_")[2:])
+        else:
+            database = line[0]
+        nameToMatch = "_".join(line[1].split("_")[2:]) + database
+        if nameToMatch not in allHubCounts.keys():
+            allHubCounts[nameToMatch] = int(line[2])
+        elif nameToMatch in allHubCounts.keys():
+            allHubCounts[nameToMatch] = allHubCounts[nameToMatch] + int(line[2])
+#         else: #For debugging
+#             print("This error should not happen.")
+trackCounts.close()
+
+#Now use the new list to report the most popular public hub numbers across all mirrors
+pubHubList = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "r")
+pubHubDic = {}
+for pubHub in pubHubList:
+    pubHub = pubHub.rstrip().split("\t")
+    if "hub" in pubHub[1]: #check for database like hub_164399_GCA_004023905.1
+        database = "_".join(pubHub[1].split("_")[2:])
+        name = pubHub[2] + database
+    else:
+        name = pubHub[2] + pubHub[1]
+        database = pubHub[1]
+    #Make name a combination of track name + assembly
+    pubHubDic[name] = {}
+    pubHubDic[name]['trackName'] = pubHub[2]
+    pubHubDic[name]['dbs'] = database
+    pubHubDic[name]['hubName'] = pubHub[0]
+    if name in allHubCounts:
+        pubHubDic[name]['count'] = allHubCounts[name]
+    else:
+        print("Name not in public hub list: "+name)
+pubHubList.close()
+
+pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt", "w")
+for key in pubHubDic:
+    pubHubFile.write(pubHubDic[key]['dbs']+"\t"+str(pubHubDic[key]['count'])+"\t"+pubHubDic[key]['trackName']+"\t"+pubHubDic[key]['hubName']+"\n")
+pubHubFile.close()
+
+bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt')
+
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('echo "List of hub uses that are not public hubs. Some public hubs sneak in due to alternative hubUrl. This includes curated hubs:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash("echo db$'\\t'machine$'\\t'trackUse$'\\t'shortLabel$'\\t'hubUrl >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
+bash('echo "List of public hub usage (only most used track represented). Counts added across all mirrors:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash("echo db$'\\t'trackUse$'\\t'track$'\\t'pubHub >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+
+#Then do the same for non-public hubs
+for hub in hubsDic:
+    if "hub" in hubsDic[hub]['hubDb']: #check for database like hub_164399_GCA_004023905.1
+        hubsDic[hub]['hubDb'] = "_".join(hubsDic[hub]['hubDb'].split("_")[2:])
+    if "hub" in hubsDic[hub]['hubTrack']: #check for database like hub_116154_xenoRefGene
+        hubsDic[hub]['hubTrack'] = "_".join(hubsDic[hub]['hubTrack'].split("_")[2:])
+        
+    nameToMatch = hubsDic[hub]["hubTrack"] + hubsDic[hub]['hubDb']
+    if nameToMatch in allHubCounts.keys():
+        hubsDic[hub]['updatedCount'] = allHubCounts[nameToMatch]
+#     else:  #For debugging
+#         print("Could not find the following hub in the track list: "+nameToMatch)
+#         print(hubsDic[hub]["hubTrack"])
+
+#Use the hubURL as the ID and consolidate the hubs into a single entry
+#Note there is a small improvement that can be made here - I pickt he most popular track per mirror and compare
+#to the others. The best way would be to add up the most popular track for each hub in each mirror, minor diff
+#but that is what leads to most popular machine showing up as Asia or Euro for some instead of the RR.
+consolidatePubHubsToWriteOut = OrderedDict()
+for hub in hubsDic:
+    hubUrl = hubsDic[hub]['hubURL']
+    if hubUrl not in consolidatePubHubsToWriteOut.keys():
+        consolidatePubHubsToWriteOut[hubUrl] = {}
+        consolidatePubHubsToWriteOut[hubUrl]['count'] = hubsDic[hub]['updatedCount']
+        consolidatePubHubsToWriteOut[hubUrl]['hubDb'] = hubsDic[hub]['hubDb']
+        consolidatePubHubsToWriteOut[hubUrl]['shortLabel'] = hubsDic[hub]['shortLabel']
+        consolidatePubHubsToWriteOut[hubUrl]['mostPopumachine'] = hubsDic[hub]['machine']
+    else:
+        if consolidatePubHubsToWriteOut[hubUrl]['count'] < hubsDic[hub]['updatedCount']:
+            consolidatePubHubsToWriteOut[hubUrl]['count'] = hubsDic[hub]['updatedCount']
+            consolidatePubHubsToWriteOut[hubUrl]['mostPopumachine'] = hubsDic[hub]['machine']
+
+nonPubHubsFile = open("/hive/users/"+user+"/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt", "w")
+for key in consolidatePubHubsToWriteOut:
+    nonPubHubsFile.write(consolidatePubHubsToWriteOut[key]['hubDb']+"\t"+str(consolidatePubHubsToWriteOut[key]['count'])+"\t"+consolidatePubHubsToWriteOut[key]['shortLabel']+"\t"+key+"\t"+consolidatePubHubsToWriteOut[key]['mostPopumachine']+"\n")
+nonPubHubsFile.close()
 
-bash('cat /hive/users/'+user+'/ErrorLogsOutput/hubsNotPublic.txt | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('echo "List of hub usage that are not public hubs. Counts are added across all mirrors/machines. This includes curated hubs:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash("echo db$'\\t'useCount$'\\t'shortLabel$'\\t'hubUrl$'\\t'mostPopularMachine >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
+bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt')
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo Previous outputs of this cron can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('echo Monthly usage counts of all public hubs can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/publicHubUsageCounts/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo Archive of monthly raw data can be found here: /hive/users/qateam/assemblyStatsCronArchive/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash("mkdir -p /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat)
 bash("cp /hive/users/"+user+"/ErrorLogsOutput/* /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat)
 
 if user == 'qateam':
     bash("cat /hive/users/"+user+"/ErrorLogsOutput/results.txt > /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/"+lastMonthFormat)
+    publicHubPageHeader = """This page contains the usage count of UCSC Genome Browser public hubs. The numbers represent 
+individual browsing sessions across all UCSC mirrors for the month of """+lastMonthFormat+""".
+
+assembly\tusageCount\tmostPopularTrack\thubName
+"""
+    with open("/usr/local/apache/htdocs-genecats/qa/test-results/usageStats/publicHubUsageCounts/pubHubUsageCounts."+lastMonthFormat+".txt",'w') as hubsUsageFile:
+        hubsUsageFile.write(publicHubPageHeader)
+    bash('cat /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt >> /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/publicHubUsageCounts/pubHubUsageCounts.'+lastMonthFormat+'.txt')
 
 resultsFile = open('/hive/users/'+user+'/ErrorLogsOutput/results.txt','r')
 for line in resultsFile:
     print(line.rstrip())
 resultsFile.close()
 
 bash("rm /hive/users/"+user+"/ErrorLogs/*")
 bash("rm /hive/users/"+user+"/ErrorLogsOutput/*")