77c1fd0753c5b43a3c10baa351ed40962be704cb lrnassar Tue Aug 27 16:53:48 2024 -0700 Small change to generateUsageStats.py to improve how it strips the trailing count from track entries. Before, if counts were high, e.g. track:1034, it would slice off the last 2 digits but keep the :10. For the assemblyStatsCron, various changes were made as documented in the RM ticket. This script is a fairly big mess, and has a lot of room for improvement via functions, better comments, not using bash to write out, and general optimizations such as avoiding parsing files multiple times. However, it is only used internally for general exploratory overviews and thus it has not warranted a proper restructuring and cleaning. The primary motivation here was to add up counts from track usage across all hubs instead of only reporting the highest one. Refs #34266 diff --git src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py index b1af1bb..5d39cff 100755 --- src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py +++ src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py @@ -112,50 +112,50 @@ dbUsers[db][hgsid] += 1 # If perMonth is true, then we record information about db usage on a perMonth basis if perMonth == True: if hgsid not in dbUsersMonth[db][year][month]: dbUsersMonth[db][year][month][hgsid] = 1 else: dbUsersMonth[db][year][month][hgsid] += 1 ##### Process per track information if processTrackUsers == True: for track in tracks: # Skip empty entries in trackList if track == "": continue # Remove trailing characters - track = track[:-2] + track = track.split(":")[0] # Record track user if hgsid not in trackUsers[db][track]: trackUsers[db][track][hgsid] = 1 else: trackUsers[db][track][hgsid] += 1 # If perMonth is true, then we record information about track usage on a perMonth basis if perMonth == True: # Incremement count for track/hgsid by one if hgsid not in trackUsersMonth[db][year][month][track]: 
trackUsersMonth[db][year][month][track][hgsid] = 1 else: trackUsersMonth[db][year][month][track][hgsid] += 1 ##### Process public hub tracks if processTrackHubUsers == True: if track.startswith("hub_"): - track = track[:-2] + track = track.split(":")[0] splitTrack = track.split("_", 2) if len(splitTrack) > 2: hubId = splitTrack[1] hubTrack = splitTrack[2] # Processed in same way as normal tracks, although now the top level dictionary is # keyed by hubIds, rather than db name if hubId in publicHubs: if hgsid not in trackUsersHubs[hubId][db][hubTrack]: trackUsersHubs[hubId][db][hubTrack][hgsid] = 1 else: trackUsersHubs[hubId][db][hubTrack][hgsid] += 1 if perMonth == True: if hgsid not in trackUsersHubsMonth[hubId][db][year][month][hubTrack]: @@ -621,31 +621,31 @@ splitLine = trackLog.split(" ") # Build list of tracks tracks = splitLine[4] tracks = tracks.split(",") dbUse = dbCounts[db] # output list to file that contains column headings defaultCountsFile.write("#db\ttrackName\ttrackUse\t% using\t% turning off\n#" + db + "\t" + str(dbUse) + "\n") defaultCounts = [] for track in tracks: if track == "": continue # Remove trailing characters - track = track[:-2] + track = track.split(":")[0] try: trackUse = trackCounts[db][track] relUse = (trackUse/dbUse)*100 relOff = ((dbUse - trackUse)/dbUse)*100 # Store all this info a in a list so that we can sort my most used tracks later defaultCounts.append([track, trackUse, relUse, relOff]) except KeyError: continue # Sort defaultCounts for current assembly by most used track first defaultCountsSorted = sorted(defaultCounts, key=operator.itemgetter(2)) defaultCountsSorted.reverse() # Output sorted defaultCounts to a file