ae16b2213a7c8fd157250ca9753dade44d7c2671 lrnassar Fri Apr 4 11:32:59 2025 -0700 Since tabFmt was committed to the tree in the new hgwdev, it needs a stdin designation when used in pipes to work. No RM. diff --git src/utils/qa/assemblyStatsCron.py src/utils/qa/assemblyStatsCron.py index e3371ad7f58..ab26ec89128 100755 --- src/utils/qa/assemblyStatsCron.py +++ src/utils/qa/assemblyStatsCron.py @@ -118,60 +118,60 @@ db = line[0] count = int(line[1]) totalCount = totalCount + count if db not in dbsCounts: dbsCounts[db] = count else: dbsCounts[db] = dbsCounts[db] + count for key in dbsCounts: dbCountsCombined.write(key+"\t"+str(dbsCounts[key])+"\t"+str(round(dbsCounts[key]/totalCount,3))+"\n") dbCountsCombined.close() dbCountsRaw.close() bash('sort /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv -rnk2 > /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted') -bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') +bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/dbCountsCombinedWithCuratedHubs.tsv.sorted | /cluster/bin/x86_64/tabFmt stdin >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') ##### Report default track usage for hg38 and hg19 ###### bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo "List of default track usage for hg38, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') -bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') +bash('grep ^hg38 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15 | /cluster/bin/x86_64/tabFmt stdin >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo "List of default track usage for hg19, sorted by how many users are turning off the track:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'trackUse$'\\t'% using$'\\t'% turning off$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') -bash('grep ^hg19 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15| /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') +bash('grep ^hg19 /hive/users/'+user+'/ErrorLogsOutput/defaultCounts.tsv | grep -v "MarkH3k27ac" | sort -nrk 5 | awk -v OFS="\\t" \'{ print $1,$3,$4,$5,$2 }\' | head -n 15| /cluster/bin/x86_64/tabFmt stdin >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo List of non-default track usage: >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'trackUse$'\\t'trackName >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('sort /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv -rnk3 > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted') bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted | grep -v -f /hive/users/'+user+'/ErrorLogsOutput/defaults.txt > /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults') -bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults | awk -v OFS="\\t" \'{ print $1,$3,$2 }\' | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') +bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/trackCounts.tsv.sorted.noDefaults | awk -v OFS="\\t" \'{ print $1,$3,$2 }\' | /cluster/bin/x86_64/tabFmt stdin >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') ##### Report public hub usage and non-public hub usage ###### -pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "w") #Using a file to be able to use the same ~markd/bin/tabFmt format +pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/pubHubs.txt", "w") #Using a file to be able to use the same ~markd/bin/tabFmt stdin format bash("sort /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv -rnk4 -t $\'\\t\' > /hive/users/"+user+"/ErrorLogsOutput/trackCountsHubs.tsv.sorted") allPubHubs = bash('cat /hive/users/'+user+'/ErrorLogsOutput/trackCountsHubs.tsv.sorted').rstrip().split("\n") #This section pulls out only a single occurence of each public hub, picking the first track (most uses) to represent it results = OrderedDict() for each in allPubHubs: each = each.split('\t') if each[0] not in results.keys(): results[each[0]] = each[0:] for key, value in results.items(): pubHubFile.write(value[0]+"\t"+value[1]+"\t"+value[2]+"\t"+value[3]+"\n") pubHubFile.close() #Query hubPublic and hubStats in order to filter out public hubs then sort out the IDs @@ -334,31 +334,31 @@ else: print("Name not in public hub list: "+name) pubHubList.close() pubHubFile = open("/hive/users/"+user+"/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt", "w") for key in pubHubDic: pubHubFile.write(pubHubDic[key]['dbs']+"\t"+str(pubHubDic[key]['count'])+"\t"+pubHubDic[key]['trackName']+"\t"+pubHubDic[key]['hubName']+"\n") pubHubFile.close() bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt') bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo "List of public hub usage (only most used track represented). Counts added across all mirrors:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'trackUse$'\\t'track$'\\t'pubHub >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') -bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') +bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allPubHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt stdin >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') #Then do the same for non-public hubs for hub in hubsDic: if "hub" in hubsDic[hub]['hubDb']: #check for database like hub_164399_GCA_004023905.1 hubsDic[hub]['hubDb'] = "_".join(hubsDic[hub]['hubDb'].split("_")[2:]) if "hub" in hubsDic[hub]['hubTrack']: #check for database like hub_116154_xenoRefGene hubsDic[hub]['hubTrack'] = "_".join(hubsDic[hub]['hubTrack'].split("_")[2:]) nameToMatch = hubsDic[hub]["hubTrack"] + hubsDic[hub]['hubDb'] if nameToMatch in allHubCounts.keys(): hubsDic[hub]['updatedCount'] = allHubCounts[nameToMatch] # else: #For debugging # print("Could not find the following hub in the track list: "+nameToMatch) # print(hubsDic[hub]["hubTrack"]) #Use the hubURL as the ID and consolidate the hubs into a single entry @@ -377,31 +377,31 @@ else: if consolidatePubHubsToWriteOut[hubUrl]['count'] < hubsDic[hub]['updatedCount']: consolidatePubHubsToWriteOut[hubUrl]['count'] = hubsDic[hub]['updatedCount'] consolidatePubHubsToWriteOut[hubUrl]['mostPopumachine'] = hubsDic[hub]['machine'] nonPubHubsFile = open("/hive/users/"+user+"/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt", "w") for key in consolidatePubHubsToWriteOut: nonPubHubsFile.write(consolidatePubHubsToWriteOut[key]['hubDb']+"\t"+str(consolidatePubHubsToWriteOut[key]['count'])+"\t"+consolidatePubHubsToWriteOut[key]['shortLabel']+"\t"+key+"\t"+consolidatePubHubsToWriteOut[key]['mostPopumachine']+"\n") nonPubHubsFile.close() bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo "List of hub usage that are not public hubs. Counts are added across all mirrors/machines. This includes curated hubs:" >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("echo db$'\\t'useCount$'\\t'shortLabel$'\\t'hubUrl$'\\t'mostPopularMachine >> /hive/users/"+user+"/ErrorLogsOutput/results.txt") bash('echo -------------------------------------------------------------------------------------- >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('sort -rnk2 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.txt > /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt') -bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') +bash('head -n 15 /hive/users/'+user+'/ErrorLogsOutput/allRegularHubsCombinedWithMirrorsCounts.sorted.txt | /cluster/bin/x86_64/tabFmt stdin >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo Previous outputs of this cron can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo Monthly usage counts of all public hubs can be found here: https://genecats.gi.ucsc.edu/qa/test-results/usageStats/publicHubUsageCounts/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash('echo Archive of monthly raw data can be found here: /hive/users/qateam/assemblyStatsCronArchive/ >> /hive/users/'+user+'/ErrorLogsOutput/results.txt') bash("mkdir -p /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat) bash("cp /hive/users/"+user+"/ErrorLogsOutput/* /hive/users/"+user+"/assemblyStatsCronArchive/"+lastMonthFormat) if user == 'qateam': bash("cat /hive/users/"+user+"/ErrorLogsOutput/results.txt > /usr/local/apache/htdocs-genecats/qa/test-results/usageStats/"+lastMonthFormat) publicHubPageHeader = """This page contains the usage count of UCSC Genome Browser public hubs. The numbers represent individual browsing sessions across all UCSC mirrors for the month of """+lastMonthFormat+""".