336661f13f731c2a26a88f9acb9673f8751dac77 lrnassar Fri Feb 14 08:56:56 2025 -0800
Fixing up the monthly usage stats cron - adding a new path to the tabFmt script, and adding a cleanup step at the top of the script so that it doesn't reuse old logs if a run fails.

diff --git src/utils/qa/assemblyStatsCron.py src/utils/qa/assemblyStatsCron.py
index 33703eff284..aba7b5eb6e3 100755
--- src/utils/qa/assemblyStatsCron.py
+++ src/utils/qa/assemblyStatsCron.py
@@ -14,30 +14,34 @@
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def bashNoErrorCatch(cmd):
     """Run the cmd in bash subprocess, don't catch error since grep returns exit code 1 when no match is found"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout.rstrip().split("\n")
     except:
         bashStdoutt = []
     return(bashStdoutt)
 
 user = getpass.getuser()
+
+#Clean out any previous unfinished run (must come after user is set - the paths below depend on it)
+bash("rm -f /hive/users/"+user+"/ErrorLogs/*")
+bash("rm -f /hive/users/"+user+"/ErrorLogsOutput/*")
 
 # Get the year to query proper wwwstats directory
 today = datetime.datetime.today()
 year = str(today).split('-')[0]
 
 # Get the last 5 error logs from the RR
 latestLogs = bash('ls /hive/users/chmalee/logs/trimmedLogs/result/hgw1').rstrip().split("\n")
 
 ######################## TESTING MODE - ONLY PROCESS MINIMAL AMOUNT OF LOGS ########################
 testMode = False #Set true for testing mode
 #################################################################################################
 
 if testMode: #Default is one RR log and one asia log
     latestLogs = latestLogs[len(latestLogs)-1:]
@@ -112,56 +116,56 @@
     else:
         db = line[0]
         count = int(line[1])
     if db not in dbsCounts:
         dbsCounts[db] = count
     else:
         dbsCounts[db] = dbsCounts[db] + count
 
 for key in dbsCounts:
     dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\n")
 
 dbCountsCombined.close()
 dbCountsRaw.close()
 
 bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted')
-bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 ##### Report default track usage for hg38 and hg19 ######
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of default track usage for hg38, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of default track usage for hg19, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('grep ^hg19 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15| /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('grep ^hg19 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15| /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo List of non-default track usage: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('sort /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv -rnk3 > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted')
 bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted | grep -v -f /hive/users/'+user+'/ErrorLogsOutput/defaults.txt > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults')
-bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults | awk -v OFS="\\t" \'{ print $1,$3,$2 }\' | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults | awk -v OFS="\\t" \'{ print $1,$3,$2 }\' | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 ##### Report public hub usage and non-public hub usage ######
 
 pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "w")
 #Using a file to be able to use the same ~markd/bin/tabFmt format
 
 bash("sort /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv -rnk4 -t $\'\\t\' > /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv.sorted")
 allPubHubs = bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCountsHubs.tsv.sorted').rstrip().split("\n")
 
 #This section pulls out only a single occurence of each public hub, picking the first track (most uses) to represent it
 results = OrderedDict()
 for each in allPubHubs:
     each = each.split('\t')
     if each[0] not in results.keys():
         results[each[0]] = each[0:]
 
 for key, value in results.items():
@@ -269,31 +273,30 @@
                 asiaHubLastOk = datetime.datetime.strptime(asiaHubLastOk.split(" ")[0], '%Y-%m-%d')
                 if asiaHubLastOk > lastMonth:
                     if asiaHub[1] in pubHubUrls:
                         pass
                     else:
                         hubsDic[hubKey] = {}
                         hubsDic[hubKey]['shortLabel'] = asiaHubShortLabel
                         hubsDic[hubKey]['hubURL'] = asiaHubURL
                         hubsDic[hubKey]['hubCount'] = hubCount
                         hubsDic[hubKey]['hubDb'] = hubDb
                         hubsDic[hubKey]['machine'] = "Asia"
                         hubsDic[hubKey]['hubTrack'] = hubTrack
                         entryMade = True
     else:
         bash("echo The following hub: "+hub+" was not found in the RR/euro/asia hubStatus with a lastOkTime within the last month. This likely means an error has occurred. >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
-
 #Create new file to write out the top 20 most used hubs
 hubsNotPublic = open("/hive/users/"+user+"/ErrorLogsOutput/hubsNotPublic.txt", "a")
 for key in hubsDic.keys():
     hubsNotPublic.write(hubsDic[key]['hubDb']+"\t"+hubsDic[key]['machine']+"\t"+str(hubsDic[key]['hubCount'])+"\t"+hubsDic[key]['shortLabel']+"\t"+hubsDic[key]['hubURL']+"\n")
 hubsNotPublic.close()
 
 #Work on a non-public hub list added up across all mirrors
 #Create a dic of a all hub counts
 allHubCounts = {}
 trackCounts = open("/hive/users/"+user+"/ErrorLogsOutput/trackCounts.tsv.sorted","r")
 for line in trackCounts:
     line = line.rstrip().split("\t")
     if line[1].startswith("hub_"):
         if "hub" in line[0]: #check for database like hub_164399_GCA_004023905.1
             database = "_".join(line[0].split("_")[2:])
@@ -329,32 +332,31 @@
     else:
         print("Name not in public hub list: "+name)
 pubHubList.close()
 
 pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt", "w")
 for key in pubHubDic:
     pubHubFile.write(pubHubDic[key]['dbs']+"\t"+str(pubHubDic[key]['count'])+"\t"+pubHubDic[key]['trackName']+"\t"+pubHubDic[key]['hubName']+"\n")
 pubHubFile.close()
 
 bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of public hub usage (only most used track represented). Counts added across all mirrors:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'trackUse$'\\t'track$'\\t'pubHub >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
-
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 #Then do the same for non-public hubs
 for hub in hubsDic:
     if "hub" in hubsDic[hub]['hubDb']: #check for database like hub_164399_GCA_004023905.1
         hubsDic[hub]['hubDb'] = "_".join(hubsDic[hub]['hubDb'].split("_")[2:])
     if "hub" in hubsDic[hub]['hubTrack']: #check for database like hub_116154_xenoRefGene
         hubsDic[hub]['hubTrack'] = "_".join(hubsDic[hub]['hubTrack'].split("_")[2:])
     nameToMatch = hubsDic[hub]["hubTrack"] + hubsDic[hub]['hubDb']
     if nameToMatch in allHubCounts.keys():
         hubsDic[hub]['updatedCount'] = allHubCounts[nameToMatch]
 #    else: #For debugging
 #        print("Could not find the following hub in the track list: "+nameToMatch)
 #        print(hubsDic[hub]["hubTrack"])
 
 #Use the hubURL as the ID and consolidate the hubs into a single entry
@@ -373,44 +375,43 @@
     else:
         if consolidatePubHubsToWriteOut[hubUrl]['count'] < hubsDic[hub]['updatedCount']:
             consolidatePubHubsToWriteOut[hubUrl]['count'] = hubsDic[hub]['updatedCount']
             consolidatePubHubsToWriteOut[hubUrl]['mostPopumachine'] = hubsDic[hub]['machine']
 
 nonPubHubsFile = open("/hive/users/"+user+"/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt", "w")
 for key in consolidatePubHubsToWriteOut:
     nonPubHubsFile.write(consolidatePubHubsToWriteOut[key]['hubDb']+"\t"+str(consolidatePubHubsToWriteOut[key]['count'])+"\t"+consolidatePubHubsToWriteOut[key]['shortLabel']+"\t"+key+"\t"+consolidatePubHubsToWriteOut[key]['mostPopumachine']+"\n")
 nonPubHubsFile.close()
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo "List of hub usage that are not public hubs. Counts are added across all mirrors/machines. This includes curated hubs:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash("echo db$'\\t'useCount$'\\t'shortLabel$'\\t'hubUrl$'\\t'mostPopularMachine >> /hive/users/"+user+"/ErrorLogsOutput/results.txt")
 bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt')
-bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/home/lrnassar/temp/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
+bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo Previous outputs of this cron can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo Monthly usage counts of all public hubs can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/publicHubUsageCounts/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 bash('echo Archive of monthly raw data can be found here: /hive/users/qateam/assemblyStatsCronArchive/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt')
 
 bash("mkdir -p /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat)
 bash("cp /hive/users/"+user+"/ErrorLogsOutput/* /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat)
 
 if user == 'qateam':
     bash("cat /hive/users/"+user+"/ErrorLogsOutput/results.txt > /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/"+lastMonthFormat)
 
     publicHubPageHeader = """This page contains the usage count of UCSC Genome Browser public hubs. The numbers represent individual browsing sessions across all UCSC mirrors for the month of """+lastMonthFormat+""". assembly\tusageCount\tmostPopularTrack\thubName """
 
     with open("/usr/local/apache/htdocs-genecats/qa/test-results/usageStats/publicHubUsageCounts/pubHubUsageCounts."+lastMonthFormat+".txt",'w') as hubsUsageFile:
         hubsUsageFile.write(publicHubPageHeader)
     bash('cat /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt >> /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/publicHubUsageCounts/pubHubUsageCounts.'+lastMonthFormat+'.txt')
 
-
 resultsFile = open('/hive/users/'+user+'/ErrorLogsOutput/results.txt','r')
 for line in resultsFile:
     print(line.rstrip())
 resultsFile.close()
 
 bash("rm /hive/users/"+user+"/ErrorLogs/*")
 bash("rm /hive/users/"+user+"/ErrorLogsOutput/*")