#!/usr/bin/env python3
# Provenance: this file was recovered from a whitespace-collapsed diff of
# src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py
# (commit d13f1c8f1fada76cc569fc0c6f073f76fdeace92, kent, Thu Oct 1 17:03:37 2020 -0700,
# commit message as recorded: "We have a shell script that does this so slowly on big
# files it is painful, so I wrote this in C.").  The "+" side of the diff is reproduced.
"""Generate usage statistics for dbs, tracks, and public hub tracks.

Input is one or more (optionally gzipped) Apache error_log files containing
"trackLog" lines.  Every trackLog line is tallied per hgsid into the global
"*Users*" dictionaries below; main() then condenses those tallies into the
"*Counts*" dictionaries (number of hgsids seen more than once) and writes
TSV / JSON reports into an output directory.
"""

import subprocess, os, gzip, argparse, sys, json, operator, datetime
from collections import Counter, defaultdict

#####
# Define dictionaries/sets/etc. needed to record stats
#####
# Making these global so that they can be modified by functions without needing
# a function argument to specify them


def _nestedDict(depth):
    """Return a defaultdict nested `depth` levels deep.

    Every level except the innermost auto-creates the next level on first
    access.  The innermost level is a defaultdict with no default factory,
    i.e. it behaves like a plain dict and raises KeyError on a missing key.
    """
    if depth <= 1:
        return defaultdict()
    return defaultdict(lambda: _nestedDict(depth - 1))


# Dictionaries for holding information about db use
dbUsers = defaultdict(Counter)  # Ex dbUsers struct: {"db":{"hgsid":count}}
dbCounts = dict()  # Ex dbCounts struct: {"db":count}
# Dictionaries for recording information per month
dbUsersMonth = _nestedDict(4)  # Ex dbUsersMonth struct: {"db":{"year":{"month":{"hgsid":count}}}}
dbCountsMonth = _nestedDict(3)  # Ex dbCountsMonth struct: {"db":{"year":{"month":count}}}

# Dictionaries for holding information about track use per db
trackUsers = _nestedDict(3)  # Ex trackUsers struct: {"db":{"track":{"hgsid":count}}}
trackCounts = defaultdict(dict)  # Ex trackCounts struct: {"db":{"track":count}}
# Dictionaries for per month information
# Ex trackUsersMonth struct: {"db":{"year":{"month":{"track":{"hgsid":count}}}}}
trackUsersMonth = _nestedDict(5)
# Ex trackCountsMonth struct: {"db":{"year":{"month":{"track":count}}}}
trackCountsMonth = _nestedDict(4)

# Dictionaries for holding information about track use per hub
# Ex trackUsersHubs struct: {"hubId":{"db":{"track":{"hgsid":count}}}}
trackUsersHubs = _nestedDict(4)
# Ex trackCountsHubs struct: {"hubId":{"db":{"track":count}}}
trackCountsHubs = _nestedDict(3)
# Dictionaries for per month information
# Ex trackUsersHubsMonth struct: {"hubId":{"db":{"year":{"month":{"track":{"hgsid":count}}}}}}
trackUsersHubsMonth = _nestedDict(6)
# Ex trackCountsHubsMonth struct: {"hubId":{"db":{"year":{"month":{"track":count}}}}}
trackCountsHubsMonth = _nestedDict(5)

monthYearSet = set()  # Set containing monthYear strings (e.g. "Aug 2017")


def _loadPublicHubs():
    """Return {hubStatus id: [hubUrl, shortLabel]} for every hub in hubPublic.

    Uses hgsql to join hubPublic with hubStatus on hubUrl.  If hgsql cannot be
    started or exits non-zero, a warning is printed to stderr and whatever rows
    were produced (possibly none) are used, so db and track statistics can
    still be generated; hub track statistics are then empty or partial.
    """
    cmd = ["/cluster/bin/x86_64/hgsql", "hgcentral", "-h", "genome-centdb", "-Ne",
           "select s.id,p.hubUrl,p.shortLabel"
           " from hubPublic p join hubStatus s where s.hubUrl=p.hubUrl"]
    hubs = dict()
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        cmdout, cmderr = proc.communicate()
    except OSError as err:
        # hgsql binary missing or not executable on this host
        print("Warning: could not run hgsql (" + str(err) + "); "
              "no public hub information loaded, hub track stats will be empty.",
              file=sys.stderr)
        return hubs
    if proc.returncode != 0:
        print("Warning: hgsql exited with status " + str(proc.returncode) + ": "
              + cmderr.decode("ASCII", errors="replace").strip(), file=sys.stderr)
    for row in cmdout.decode("ASCII").split("\n"):
        fields = row.split("\t")
        # Skips the trailing empty line as well as any malformed row
        if len(fields) < 3:
            continue
        hubs[fields[0]] = [fields[1], fields[2]]
    return hubs


# Dictionary of hubStatus ids -> [hubUrl, shortLabel] for public hubs
publicHubs = _loadPublicHubs()

#####
#####

def _stripVisibility(track):
    """Drop the trailing ":<visibility>" from a trackLog entry ("knownGene:2" -> "knownGene")."""
    return track.rsplit(":", 1)[0]


def _bump(counts, key):
    """Increment counts[key] by one, starting from zero when the key is absent."""
    counts[key] = counts.get(key, 0) + 1


def _tsvLine(*fields):
    """Join fields (converted with str) with tabs and terminate with a newline."""
    return "\t".join(str(field) for field in fields) + "\n"


def parseTrackLog(line):
    """Parse trackLog line and return important fields: db, year, month, hgsid,
       and a list of tracks

    The returned track entries still carry their ":<visibility>" suffix.
    Raises IndexError if the line does not have the expected layout.
    """
    #### Sample line being processed ####
    # [Sun Mar 05 04:11:27 2017] [error] [client ###.###.###.##] trackLog 0 hg38 hgsid_### cytoBandIdeo:1,cloneEndCTD:2
    ####

    # Split on the first "trackLog" only, so a track name that happens to
    # contain that word cannot cut the track list short
    splitLine = line.strip().split('trackLog', 1)
    prefix = splitLine[0].split()
    month = prefix[1]
    year = prefix[4].replace("]", "")
    suffix = splitLine[1].split()
    db = suffix[1]
    hgsid = suffix[2]
    if len(suffix) > 3:
        tracks = suffix[3].split(",")
    else:
        tracks = []
    return db, year, month, hgsid, tracks


def modDicts(db, year, month, hgsid, tracks, perMonth=False):
    """Modify global dictionaries to store information on db, track, and hub track usage

    db, year, month, hgsid -- fields of one trackLog line (see parseTrackLog)
    tracks   -- list of "track:visibility" entries; empty strings are skipped
    perMonth -- when true, the per-month dictionaries are updated as well
    """
    ##### Process info about db usage
    # Count up number of times each hgsid shows up
    dbUsers[db][hgsid] += 1
    if perMonth:
        _bump(dbUsersMonth[db][year][month], hgsid)

    ##### Process per track information
    for track in tracks:
        # Skip empty entries in trackList
        if track == "":
            continue
        # Remove the trailing ":<visibility>".  This is done exactly once; the
        # previous code stripped two more characters from hub tracks below,
        # which cut the last two characters off every hub track name.
        track = _stripVisibility(track)

        # Record track user
        _bump(trackUsers[db][track], hgsid)
        if perMonth:
            _bump(trackUsersMonth[db][year][month][track], hgsid)

        ##### Process hub tracks (named hub_<hubId>_<trackName>)
        if not track.startswith("hub_"):
            continue
        splitTrack = track.split("_", 2)
        if len(splitTrack) <= 2:
            continue
        hubId = splitTrack[1]
        hubTrack = splitTrack[2]
        # Processed in same way as normal tracks, although now the top level
        # dictionary is keyed by hubIds, rather than db name.  Only hubs listed
        # in publicHubs are recorded.
        if hubId in publicHubs:
            _bump(trackUsersHubs[hubId][db][hubTrack], hgsid)
            if perMonth:
                _bump(trackUsersHubsMonth[hubId][db][year][month][hubTrack], hgsid)


def processFile(fileName, perMonth=False):
    """Process a file line by line using the function parseTrackLog and record usage
       information using the function modDicts

    Files whose name ends in ".gz" are read through gzip; their lines arrive as
    bytes and are decoded as ASCII.  Only lines containing "trackLog" are used.
    """
    if fileName.endswith(".gz"):
        ifh = gzip.open(fileName, "r")
    else:
        ifh = open(fileName, "r")
    # "with" guarantees the handle is closed even if a line fails to parse
    with ifh:
        for line in ifh:
            if isinstance(line, bytes):
                line = line.decode("ASCII")
            if "trackLog" in line:
                db, year, month, hgsid, tracks = parseTrackLog(line)
                modDicts(db, year, month, hgsid, tracks, perMonth)
                # Keep track of month/years covered
                monthYearSet.add(month + " " + year)


def processDir(dirName, perMonth=False):
    """Process files in a directory using processFile function"""
    for log in os.listdir(dirName):
        processFile(os.path.join(dirName, log), perMonth)


def dumpToJson(data, outputFile, outputDir):
    """output data to named outputFile"""
    with open(os.path.join(outputDir, outputFile), "w") as jsonOut:
        json.dump(data, jsonOut)


# NOTE(review): the source this file was recovered from had its indentation
# collapsed, so it does not show whether the "*Users*.tsv" writes sat inside or
# after the "count != 1" test.  The writers below assume INSIDE, i.e. hgsids seen
# only once (likely bots) are left out of the users files too.  Confirm against
# the upstream file; if the writes belong outside, dedent the usersFile.write calls.

def _writeDbStats(outDir):
    """Fill dbCounts from dbUsers and write dbUsers.tsv / dbCounts.tsv."""
    with open(os.path.join(outDir, "dbCounts.tsv"), "w") as countsFile, \
         open(os.path.join(outDir, "dbUsers.tsv"), "w") as usersFile:
        for db, users in dbUsers.items():
            dbCounts[db] = 0
            for hgsid, count in users.items():
                # Ignore hgsids that used the db only once
                if count != 1:
                    dbCounts[db] += 1
                    usersFile.write(_tsvLine(db, hgsid, count))
            # Output counts of total users for each db
            if dbCounts[db] != 0:
                countsFile.write(_tsvLine(db, dbCounts[db]))


def _writeTrackStats(outDir):
    """Fill trackCounts from trackUsers and write trackUsers.tsv / trackCounts.tsv."""
    with open(os.path.join(outDir, "trackCounts.tsv"), "w") as countsFile, \
         open(os.path.join(outDir, "trackUsers.tsv"), "w") as usersFile:
        for db, tracks in trackUsers.items():
            for track, users in tracks.items():
                trackCounts[db][track] = 0
                for hgsid, count in users.items():
                    # Filter out those hgsids who only used track once - likely to be bots
                    if count != 1:
                        trackCounts[db][track] += 1
                        usersFile.write(_tsvLine(db, track, hgsid, count))
                if trackCounts[db][track] != 0:
                    countsFile.write(_tsvLine(db, track, trackCounts[db][track]))


def _writeHubTrackStats(outDir):
    """Fill trackCountsHubs from trackUsersHubs and write trackUsersHubs.tsv / trackCountsHubs.tsv."""
    with open(os.path.join(outDir, "trackUsersHubs.tsv"), "w") as usersFile, \
         open(os.path.join(outDir, "trackCountsHubs.tsv"), "w") as countsFile:
        for hubId, dbs in trackUsersHubs.items():
            # Rows are labelled with the hub's shortLabel rather than its id
            hubLabel = publicHubs[hubId][1]
            for db, tracks in dbs.items():
                for track, users in tracks.items():
                    trackCountsHubs[hubId][db][track] = 0
                    for hgsid, count in users.items():
                        # Filter out those hgsids who only used track once - likely to be bots
                        if count != 1:
                            trackCountsHubs[hubId][db][track] += 1
                            usersFile.write(_tsvLine(hubLabel, db, track, hgsid, count))
                    if trackCountsHubs[hubId][db][track] != 0:
                        countsFile.write(
                            _tsvLine(hubLabel, db, track, trackCountsHubs[hubId][db][track]))


def _writeDbStatsPerMonth(outDir):
    """Fill dbCountsMonth from dbUsersMonth and write the db *.perMonth.tsv files."""
    with open(os.path.join(outDir, "dbUsers.perMonth.tsv"), "w") as usersFile, \
         open(os.path.join(outDir, "dbCounts.perMonth.tsv"), "w") as countsFile:
        for db, years in dbUsersMonth.items():
            for year, months in years.items():
                for month, users in months.items():
                    dbCountsMonth[db][year][month] = 0
                    for hgsid, count in users.items():
                        if count != 1:
                            dbCountsMonth[db][year][month] += 1
                            usersFile.write(_tsvLine(db, year, month, hgsid, count))
                    if dbCountsMonth[db][year][month] != 0:
                        countsFile.write(
                            _tsvLine(db, year, month, dbCountsMonth[db][year][month]))


def _writeTrackStatsPerMonth(outDir):
    """Fill trackCountsMonth from trackUsersMonth and write the track *.perMonth.tsv files."""
    with open(os.path.join(outDir, "trackUsers.perMonth.tsv"), "w") as usersFile, \
         open(os.path.join(outDir, "trackCounts.perMonth.tsv"), "w") as countsFile:
        for db, years in trackUsersMonth.items():
            for year, months in years.items():
                for month, tracks in months.items():
                    for track, users in tracks.items():
                        trackCountsMonth[db][year][month][track] = 0
                        for hgsid, count in users.items():
                            if count != 1:
                                trackCountsMonth[db][year][month][track] += 1
                                # Column order (hgsid before track) is kept as in the
                                # original output format of this file
                                usersFile.write(
                                    _tsvLine(db, year, month, hgsid, track, count))
                        if trackCountsMonth[db][year][month][track] != 0:
                            countsFile.write(_tsvLine(
                                db, year, month, track, trackCountsMonth[db][year][month][track]))


def _writeHubTrackStatsPerMonth(outDir):
    """Fill trackCountsHubsMonth from trackUsersHubsMonth and write the hub *.perMonth.tsv files."""
    with open(os.path.join(outDir, "trackUsersHubs.perMonth.tsv"), "w") as usersFile, \
         open(os.path.join(outDir, "trackCountsHubs.perMonth.tsv"), "w") as countsFile:
        for hubId, dbs in trackUsersHubsMonth.items():
            hubLabel = publicHubs[hubId][1]
            for db, years in dbs.items():
                for year, months in years.items():
                    for month, tracks in months.items():
                        for track, users in tracks.items():
                            monthCounts = trackCountsHubsMonth[hubId][db][year][month]
                            monthCounts[track] = 0
                            for hgsid, count in users.items():
                                if count != 1:
                                    monthCounts[track] += 1
                                    usersFile.write(_tsvLine(
                                        hubLabel, db, year, month, track, hgsid, count))
                            if monthCounts[track] != 0:
                                countsFile.write(_tsvLine(
                                    hubLabel, db, year, month, track, monthCounts[track]))


def _defaultTracks(db):
    """Return the default track entries ("track:vis") hgTracks reports for db, or None.

    hgTracks is run from the cgi-bin directory and the first line it prints to
    stderr is taken to be the trackLog line; its fifth space-separated field is
    the comma-separated track list.  None is returned (after a warning on
    stderr) when hgTracks cannot be started or that field is missing.
    """
    # HGDB_CONF must be set here so that we use default tracks from beta, not dev
    # Dev can contain staged tracks that don't exist on RR, leading to errors later in script
    env = dict(os.environ)
    env["HGDB_CONF"] = os.path.expanduser("~/.hg.conf.beta")
    try:
        # db comes from log content, so pass it as a plain argv element instead
        # of splicing it into a shell command line
        proc = subprocess.Popen(["./hgTracks", "db=" + db], cwd="/usr/local/apache/cgi-bin",
                                env=env, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        cmdout, cmderr = proc.communicate()
    except OSError as err:
        print("Warning: could not run hgTracks for " + db + " (" + str(err) + "); skipping.",
              file=sys.stderr)
        return None
    # Process stderr output as that's what contains the trackLog lines
    # First element is trackLog line, second is CGI_TIME; only want trackLog
    trackLog = cmderr.decode("ASCII").split("\n")[0]
    splitLine = trackLog.split(" ")
    if len(splitLine) < 5:
        print("Warning: no trackLog line from hgTracks for " + db + "; skipping.",
              file=sys.stderr)
        return None
    return splitLine[4].split(",")


def _writeDefaultTrackStats(outDir, topN=15):
    """Write defaultCounts.tsv: default track usage for the topN most used assemblies.

    Requires dbCounts and trackCounts to have been filled already.  Assemblies
    with a user count of zero are skipped (percentages would divide by zero).
    """
    # Sort dbs by most popular (ascending sort + reverse, which keeps the
    # original ordering of ties)
    dbCountsSorted = sorted(dbCounts.items(), key=operator.itemgetter(1))
    dbCountsSorted.reverse()

    with open(os.path.join(outDir, "defaultCounts.tsv"), "w") as defaultCountsFile:
        # Slicing copes with fewer than topN assemblies being present
        for db, dbUse in dbCountsSorted[:topN]:
            if dbUse == 0:
                continue
            tracks = _defaultTracks(db)
            if tracks is None:
                continue

            # output list to file that contains column headings
            defaultCountsFile.write("#db\ttrackName\ttrackUse\t% using\t% turning off\n#"
                                    + db + "\t" + str(dbUse) + "\n")

            defaultCounts = []
            for track in tracks:
                if track == "":
                    continue
                track = _stripVisibility(track)
                try:
                    trackUse = trackCounts[db][track]
                except KeyError:
                    # Default track never seen in the logs for this db
                    continue
                relUse = (trackUse / dbUse) * 100
                relOff = ((dbUse - trackUse) / dbUse) * 100
                # Store all this info in a list so that we can sort by most used tracks later
                defaultCounts.append([track, trackUse, relUse, relOff])

            # Sort defaultCounts for current assembly by most used track first
            defaultCountsSorted = sorted(defaultCounts, key=operator.itemgetter(2))
            defaultCountsSorted.reverse()

            # Output sorted defaultCounts to a file
            for track, use, on, off in defaultCountsSorted:
                defaultCountsFile.write(
                    "{}\t{}\t{:d}\t{:3.2f}\t{:3.2f}\n".format(db, track, use, on, off))


def main():
    """Parse the command line, tally the input logs and write the requested reports."""
    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Generates usage statistics for dbs, tracks, and hubs "
                    "tracks using processed Apache error_log files. \nThe processed files can be "
                    "found in the following directory: /hive/users/chmalee/logs/trimmedLogs/result\n\n"
                    "For more information, see RM#26191.")
    parser.add_argument("-f", "--fileName", type=str, help='input file name, '
                        'must be space-separated Apache error_log file')
    parser.add_argument("-d", "--dirName", type=str, help='input directory '
                        'name, files must be space-separated error_log files. No other files '
                        'should be present in this directory.')
    parser.add_argument("-p", "--perMonth", action='store_true', help='output '
                        'file containing info on db/track/hub track use per month')
    parser.add_argument("-m", "--monthYear", action='store_true', help='output '
                        'file containing month/year pairs (e.g. "Mar 2017")')
    parser.add_argument("-j", "--jsonOut", action='store_true', help='output '
                        'json files for summary dictionaries')
    parser.add_argument("-t", "--outputDefaults", action='store_true',
                        help='output file containing info on default track usage for top 15 '
                             'most used assemblies')
    parser.add_argument("-o", "--outDir", type=str,
                        help='directory in which to place output files')
    args = parser.parse_args()

    # Print help message if no arguments are supplied
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # File and directory options can't be used together. Catch this and exit.
    if args.fileName is not None and args.dirName is not None:
        print("-f/--fileName and -d/--dirName cannot be used together. Choose one and re-run.")
        sys.exit(1)

    # Catch it early if input file/directory doesn't exist and exit.
    if args.fileName:
        if not os.path.exists(args.fileName):
            print(args.fileName, "doesn't exist. Please run on a valid file.")
            sys.exit(1)
    elif args.dirName:
        if not os.path.exists(args.dirName):
            print(args.dirName, "doesn't exist. Please run on a valid directory.")
            sys.exit(1)

    # Setup output directory
    if args.outDir is None:
        # If an output directory is unspecified, then a new one with the current date/time is made
        outDir = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    else:
        # Otherwise, user supplied a output directory name
        outDir = args.outDir
    # Create the output directory if it doesn't exist yet
    os.makedirs(outDir, exist_ok=True)

    # Process input to create dictionaries containing info about users per db/track/hub track
    if args.fileName:
        processFile(args.fileName, args.perMonth)
    elif args.dirName:
        processDir(args.dirName, args.perMonth)

    ##### Output files of summaries of db/track/hub track usage information
    _writeDbStats(outDir)
    _writeTrackStats(outDir)
    _writeHubTrackStats(outDir)

    # Output file containing info on month/years covered by stats if indicated
    if args.monthYear:
        with open(os.path.join(outDir, "monthYear.tsv"), "w") as monthYearFile:
            for pair in monthYearSet:
                monthYearFile.write(pair + "\n")

    ##### Output data per month when indicated
    if args.perMonth:
        _writeDbStatsPerMonth(outDir)
        _writeTrackStatsPerMonth(outDir)
        _writeHubTrackStatsPerMonth(outDir)

    #####
    ##### Output json files if indicated
    #####
    if args.jsonOut:
        dumpToJson(dbCounts, "dbCounts.json", outDir)
        dumpToJson(trackCounts, "trackCounts.json", outDir)
        dumpToJson(trackCountsHubs, "trackCountsHubs.json", outDir)

        if args.perMonth:
            dumpToJson(dbCountsMonth, "dbCounts.perMonth.json", outDir)
            dumpToJson(trackCountsMonth, "trackCounts.perMonth.json", outDir)
            dumpToJson(trackCountsHubsMonth, "trackCountsHubs.perMonth.json", outDir)
        # monthYearSet is not dumped: a set is not JSON serializable

    #####
    ##### Output information on default track usage if indicated
    #####
    if args.outputDefaults:
        # Will only output the default track stats for the 15 most popular assemblies
        _writeDefaultTrackStats(outDir, 15)


if __name__ == "__main__":
    main()