dd0dcde4d9c1dd587b2988ec90e960c4685c8c92 lrnassar Tue Mar 25 20:24:46 2025 -0700 Added a column to report what % of the total database usage each one of them is. No RM. diff --git src/utils/qa/assemblyStatsCron.py src/utils/qa/assemblyStatsCron.py index 664ebf79221..e3371ad7f58 100755 --- src/utils/qa/assemblyStatsCron.py +++ src/utils/qa/assemblyStatsCron.py @@ -90,51 +90,53 @@ else: defaultsLine = defaults[trackLogs].split(" ")[4] defaultsLine = defaultsLine.split(',') for track in range(len(defaultsLine)): defaultsLine[track] = defaultsLine[track][0:(len(defaultsLine[track])-2)] file.write(db+"\t"+defaultsLine[track]+"\n") file.write(db+"\t"+"cytoBand"+"\n") file.close() bash('echo This cronjob pulls out GB stats over the last month, across all RR machines and Asia/Euro mirrors using the generateUsageStats.py script. It only counts each hgsid occurrence once, filtering our any hgsid that only showed up one time. > /hive/users/'+user+'/ErrorLogsOutput/results.txt') ##### Report the database usage, aggregating curated hubs and GenArk ###### bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo List of db usage, hubs are aggregated across mirrors to a single count: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') -bash("echo db$'\\t'dbUse >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") +bash("echo db$'\\t'dbUse$'\\t'percentUse >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') dbCountsRaw = open('/hive/users/'+user+'/ErrorLogsOutput/dbCounts.tsv','r') dbCountsCombined = open('/hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv','w') dbsCounts = {} +totalCount = 0 for line in dbCountsRaw: line = line.rstrip().split("\t") if line[0].startswith("hub"): db = "_".join(line[0].split("_")[2:]) else: db = line[0] count = int(line[1]) + totalCount = totalCount + count if db not in dbsCounts: dbsCounts[db] = count else: dbsCounts[db] = dbsCounts[db] + count for key in dbsCounts: - dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\n") + dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\t"+str(round(dbsCounts[key]/totalCount,3))+"\n") dbCountsCombined.close() dbCountsRaw.close() bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted') bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') ##### Report default track usage for hg38 and hg19 ###### bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo "List of default track usage for hg38, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')