src/utils/qa/assemblyStatsCron.py dd0dcde4d9c1dd587b2988ec90e960c4685c8c92

dd0dcde4d9c1dd587b2988ec90e960c4685c8c92
lrnassar
  Tue Mar 25 20:24:46 2025 -0700
Added a column to report what % of the total database usage each one of them is. No RM.

diff --git src/utils/qa/assemblyStatsCron.py src/utils/qa/assemblyStatsCron.py
index 664ebf79221..e3371ad7f58 100755
--- src/utils/qa/assemblyStatsCron.py
+++ src/utils/qa/assemblyStatsCron.py
@@ -90,51 +90,53 @@
         else:
             defaultsLine = defaults[trackLogs].split(" ")[4]
             defaultsLine = defaultsLine.split(',')
             for track in range(len(defaultsLine)):
                 defaultsLine[track] = defaultsLine[track][0:(len(defaultsLine[track])-2)]
                 file.write(db+"\t"+defaultsLine[track]+"\n")
     file.write(db+"\t"+"cytoBand"+"\n")
 file.close()
 
 bash('echo This cronjob pulls out GB stats over the last month, across all RR machines and Asia/Euro mirrors using the generateUsageStats.py script. It only counts each hgsid occurrence once, filtering our any hgsid that only showed up one time. > /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 ##### Report the database usage, aggregating curated hubs and GenArk ######
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo List of db usage, hubs are aggregated across mirrors to a single count: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash("echo db$'\\t'dbUse >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
+bash("echo db$'\\t'dbUse$'\\t'percentUse >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 dbCountsRaw = open('/hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv','r')
 dbCountsCombined = open('/hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv','w')
 
 dbsCounts = {}
+totalCount = 0
 for line in dbCountsRaw:
     line = line.rstrip().split("\t")
     if line[0].startswith("hub"):
         db = "_".join(line[0].split("_")[2:])
     else:
         db = line[0]
     count = int(line[1])
+    totalCount = totalCount + count
     if db not in dbsCounts:
         dbsCounts[db] = count
     else:
         dbsCounts[db] = dbsCounts[db] + count
 
 for key in dbsCounts:
-    dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\n")
+    dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\t"+str(round(dbsCounts[key]/totalCount,3))+"\n")
 
 dbCountsCombined.close()
 dbCountsRaw.close()
 
 bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted')
 bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 ##### Report default track usage for hg38 and hg19 ######
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of default track usage for hg38, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')