5e1a5bda35707ab546e0a70c6b8b38f91d1b4160 lrnassar Wed May 22 14:33:13 2024 -0700
Adding a new variable needed to run hgTracks on the command line due to a change Max made. Refs #32237

diff --git src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py
index 921b57b..b1af1bb 100755
--- src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py
+++ src/hg/logCrawl/dbTrackAndSearchUsage/generateUsageStats.py
@@ -1,674 +1,674 @@
#!/usr/bin/env python3

import subprocess, os, gzip, argparse, sys, json, operator, datetime
from collections import Counter, defaultdict

#####
# Define dictionaries/sets/etc. needed to record stats
#####

# Making these global so that they can be modified by functions without needing
# a function argument to specify them

# Dictionaries for holding information about db use
dbUsers = defaultdict(Counter)
# Ex dbUsers struct: {"db":{"hgsid":count}}
dbCounts = dict()
# Ex dbCounts struct: {"db":count}

# Dictionaries for recording information per month
dbUsersMonth = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict())))
# Ex dbUsersMonth struct: {"db":{"year":{"month":{"hgsid":count}}}}
dbCountsMonth = defaultdict(lambda: defaultdict(lambda: defaultdict()))
# Ex dbCountsMonth struct: {"db":{"year":{"month":count}}}

# Dictionaries for holding information about track use per db
trackUsers = defaultdict(lambda: defaultdict(lambda: defaultdict()))
# Ex trackUsers struct: {"db":{"track":{"hgsid":count}}}
trackCounts = defaultdict(dict)
# Ex trackCounts struct: {"db":{"track":count}}

# Dictionaries for per month information
trackUsersMonth = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda:\
    defaultdict(lambda: defaultdict()))))
# Ex trackUsersMonth struct: {"db":{"year":{"month":{"track":{"hgsid":count}}}}}
trackCountsMonth = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda:\
    defaultdict())))
# Ex trackCountsMonth struct: {"db":{"year":{"month":{"track":count}}}}

# Dictionaries for holding information about track use per hub
trackUsersHubs = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict())))
# Ex trackUsersHubs struct: {"hubId":{"db":{"track":{"hgsid":count}}}}
trackCountsHubs = defaultdict(lambda: defaultdict(lambda: defaultdict()))
# Ex trackCountsHubs struct: {"hubId":{"db":{"track":count}}}

# Dictionaries for per month information
trackUsersHubsMonth = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda:\
    defaultdict(lambda: defaultdict(lambda: defaultdict())))))
# Ex trackUsersHubsMonth struct: {"hubId":{"db":{"year":{"month":{"track":{"hgsid":count}}}}}}
trackCountsHubsMonth = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(
    lambda: defaultdict()))))
# Ex trackCountsHubsMonth struct: {"hubId":{"db":{"year":{"month":{"track":count}}}}}

monthYearSet = set() # Set containing monthYear strings (e.g. "Aug 2017")

# Create a dictionary of hubUrls, shortLabels, and hubStatus ids
publicHubs = dict()
# Use hgsql to grab hub ID from hubStatus table and shortLabel from hubPublic
# for each hub in hubPublic table
cmd = ["/cluster/bin/x86_64/hgsql", "hgcentral", "-h", "genome-centdb", "-Ne",
       "select s.id,p.hubUrl,p.shortLabel\
        from hubPublic p join hubStatus s where s.hubUrl=p.hubUrl", ]
p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
cmdout, cmderr = p.communicate()
hubs = cmdout.decode("ASCII")
hubs = hubs.split("\n")
for hub in hubs:
    if hub == "":
        continue
    else:
        splitHub = hub.split("\t")
        publicHubs[splitHub[0]] = [splitHub[1], splitHub[2]]
#####

#####

def parseTrackLog(line):
    """Parse trackLog line and return important fields: db, year, month, hgsid, and a list of tracks"""

    #### Sample line being processed ####
    # From Chris Lee's combined/trimmed log format produced by {script name}
    # [Sun Mar 05 04:11:27 2017] [error] [client ###.###.###.##] trackLog 0 hg38 hgsid_### cytoBandIdeo:1,cloneEndCTD:2
    ####

    splitLine = line.strip().split('\t')
    date = splitLine[3]
    month = date.split()[1]
    year = date.split()[4]
    ip = splitLine[1]
    hgsid = splitLine[2]
    # temporary, really should rewrite script to use a var like ip_hgsid everywhere
    #hgsid = ip + "_" + hgsid
    db = splitLine[4]
    if len(splitLine) > 5:
        tracks = splitLine[5].split(",")
    else:
        tracks = []

    return db, year, month, hgsid, tracks

def modDicts(db, year, month, hgsid, tracks, toProcess, perMonth=False):
    """Modify global dictionaries to store information on db, track, and hub track usage"""

    ##### Set some variables based on input options
    processDbUsers = toProcess[0]
    processTrackUsers = toProcess[1]
    processTrackHubUsers = toProcess[2]

    ##### Process info about db usage
    # Count up number of times each hgsid shows up
    if processDbUsers == True:
        if hgsid not in dbUsers[db]:
            # If no entry in dictionary of users, initialize value to 1
            dbUsers[db][hgsid] = 1
        else:
            # Otherwise, increment usage by 1
            dbUsers[db][hgsid] += 1

        # If perMonth is true, then we record information about db usage on a perMonth basis
        if perMonth == True:
            if hgsid not in dbUsersMonth[db][year][month]:
                dbUsersMonth[db][year][month][hgsid] = 1
            else:
                dbUsersMonth[db][year][month][hgsid] += 1

    ##### Process per track information
    if processTrackUsers == True:
        for track in tracks:
            # Skip empty entries in trackList
            if track == "":
                continue
            # Remove trailing characters
            track = track[:-2]
            # Record track user
            if hgsid not in trackUsers[db][track]:
                trackUsers[db][track][hgsid] = 1
            else:
                trackUsers[db][track][hgsid] += 1

            # If perMonth is true, then we record information about track usage on a perMonth basis
            if perMonth == True:
                # Increment count for track/hgsid by one
                if hgsid not in trackUsersMonth[db][year][month][track]:
                    trackUsersMonth[db][year][month][track][hgsid] = 1
                else:
                    trackUsersMonth[db][year][month][track][hgsid] += 1

            ##### Process public hub tracks
            if processTrackHubUsers == True:
                if track.startswith("hub_"):
                    track = track[:-2]
                    splitTrack = track.split("_", 2)
                    if len(splitTrack) > 2:
                        hubId = splitTrack[1]
                        hubTrack = splitTrack[2]
                        # Processed in same way as normal tracks, although now the top level dictionary is
                        # keyed by hubIds, rather than db name
                        if hubId in publicHubs:
                            if hgsid not in trackUsersHubs[hubId][db][hubTrack]:
                                trackUsersHubs[hubId][db][hubTrack][hgsid] = 1
                            else:
                                trackUsersHubs[hubId][db][hubTrack][hgsid] += 1
                            if perMonth == True:
                                if hgsid not in trackUsersHubsMonth[hubId][db][year][month][hubTrack]:
                                    trackUsersHubsMonth[hubId][db][year][month][hubTrack][hgsid] = 1
                                else:
                                    trackUsersHubsMonth[hubId][db][year][month][hubTrack][hgsid] += 1

def processFile(fileName, toProcess, perMonth=False):
    """Process a file line by line using the function parseTrackLog and record usage information using the function modDicts"""
    if fileName.endswith(".gz"):
        ifh = gzip.open(fileName, "r")
    else:
        ifh = open(fileName, "r")
    for line in ifh:
        if "str" not in str(type(line)):
            line = line.decode("ASCII")
        db, year, month, hgsid, tracks = parseTrackLog(line)
        modDicts(db, year, month, hgsid, tracks, toProcess, perMonth)
        # Keep track of month/years covered
        monthYear = month + " " + year
        monthYearSet.add(monthYear)

def processDir(dirName, toProcess, perMonth=False):
    """Process files in a directory using processFile function"""
    fileNames = os.listdir(dirName)
    for log in fileNames:
        fileName = os.path.join(dirName, log)
        processFile(fileName, toProcess, perMonth)

def dumpToJson(data, outputFile, outputDir):
    """output data to named outputFile"""
    jsonOut = open(os.path.join(outputDir, outputFile), "w")
    json.dump(data, jsonOut)
    jsonOut.close()

def main():
    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description="Generates usage statistics for dbs, tracks, and hub \
tracks using processed Apache error_log files. \nThe processed files can be \
found in the following directory: /hive/users/chmalee/logs/trimmedLogs/result\n\n\
For more information, see RM#26191.")
    parser.add_argument("-f","--fileName", type=str, help='input file name, \
        must be space-separated Apache error_log file')
    parser.add_argument("-d","--dirName", type=str , help='input directory \
        name, files must be space-separated error_log files. No other files should be \
        present in this directory.')
    parser.add_argument("--dbCounts", action='store_true', help='output a \
        file containing users per ucsc database')
    parser.add_argument("--dbUsers", action='store_true', help='output a \
        file containing hgsids associated with each ucsc database and how many uses each one had')
    parser.add_argument("--dbUsage", action='store_true', help='output a \
        file containing the total number of uses for each ucsc database')
    parser.add_argument("--trackCounts", action='store_true', help='output a \
        file containing counts of how many users had a track on for a particular ucsc database')
    parser.add_argument("--trackUsers", action='store_true', help='output a \
        file containing hgsids, the tracks associated with them and how many trackLog lines they showed up in')
    parser.add_argument("--trackUsage", action='store_true', help='output a \
        file containing the total number of uses for each track on a particular ucsc database')
    parser.add_argument("--trackHubCounts", action='store_true', help='output a \
        file containing counts of how many users had a hub track on for a particular ucsc database or assembly hub')
    parser.add_argument("--trackHubUsers", action='store_true', help='output a \
        file containing hgsids, the hub tracks associated with them and how many trackLog lines they showed up in')
    parser.add_argument("--trackHubUsage", action='store_true', help='output a \
        file containing the total number of uses for each public hub track')
    parser.add_argument("--allOutput", action='store_true', help='output a \
        file for each option above (dbCounts, dbUsers, dbUsage, trackCounts, trackUsers, trackUsage, trackHubCounts, trackHubUsers, trackHubUsage)')
    parser.add_argument("-p","--perMonth", action='store_true', help='output \
        file containing info on db/track/hub track use per month')
    parser.add_argument("-m","--monthYear", action='store_true', help='output \
        file containing month/year pairs (e.g. "Mar 2017")')
    parser.add_argument("-j","--jsonOut", action='store_true', help='output \
        json files for summary dictionaries')
    parser.add_argument("-t","--outputDefaults", action='store_true', help='output a file containing info on default track usage for top 15 most used assemblies')
    parser.add_argument("-o","--outDir", type=str, help='directory in which to place output files')
    args = parser.parse_args()

    # Print help message if no arguments are supplied
    if len(sys.argv) == 1:
        parser.print_help(sys.stderr)
        sys.exit(1)

    # File and directory options can't be used together. Catch this and exit.
    if args.fileName != None and args.dirName != None:
        print("-f/--fileName and -d/--dirName cannot be used together. Choose one and re-run.")
        sys.exit(1)

    # Catch it early if input file/directory doesn't exist and exit.
    if args.fileName:
        if not os.path.exists(args.fileName):
            print(args.fileName, "doesn't exist. Please run on a valid file.")
            exit(1)
    elif args.dirName:
        if not os.path.exists(args.dirName):
            print(args.dirName, "doesn't exist. Please run on a valid directory.")
            exit(1)

    if not any([args.dbCounts, args.dbUsers, args.dbUsage, args.trackUsers, args.trackCounts,
                args.trackUsage, args.trackHubUsers, args.trackHubCounts, args.trackHubUsage,
                args.allOutput]):
        print("You must specify at least one of: --allOutput, --dbCounts, --dbUsage, --dbUsers, --trackCounts, --trackUsers, --trackUsage, --trackHubCounts, --trackHubUsers, --trackHubUsage.")
        exit(1)

    # Setup output directory
    if args.outDir == None:
        # If an output directory is unspecified, then a new one with the current date/time is made
        currDateTime = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
        os.makedirs(currDateTime)
        outDir = currDateTime
    else:
        # Otherwise, user supplied an output directory name
        outDir = args.outDir
        if not os.path.exists(outDir):
            # If specified output directory doesn't exist, create it
            os.makedirs(outDir)

    # If "all" option is set change everyone to True
    if args.allOutput == True:
        args.dbCounts = True
        args.dbUsers = True
        args.dbUsage = True
        args.trackUsers = True
        args.trackCounts = True
        args.trackUsage = True
        args.trackHubUsers = True
        args.trackHubCounts = True
        args.trackHubUsage = True

    # Make list of users to process and build out dictionaries for later
    usersToProcess = [False, False, False]
    # Pos 0 - process db users
    # Pos 1 - process track users
    # Pos 2 - process public track hub users
    # The reason I've done it this way is to allow people to output just the counts of
    # dbs/tracks/hub tracks without outputting the users files. I could have just set
    # args.*Users = True if args.*Counts was true, but that would mean that the users file
    # would also be output, defeating the purpose of the option to only output the *Counts
    if any([args.dbCounts, args.dbUsers, args.dbUsage]):
        # If dbCounts, dbUsers, or dbUsage is specified, we need to process db users
        usersToProcess[0] = True
    if any([args.trackCounts, args.trackUsers, args.trackUsage]):
        usersToProcess[1] = True
    if any([args.trackHubCounts, args.trackHubUsers, args.trackHubUsage]):
        usersToProcess[1] = True
        usersToProcess[2] = True

    # Process input to create dictionaries containing info about users per db/track/hub track
    if args.fileName:
        processFile(args.fileName, usersToProcess, args.perMonth)
    elif args.dirName:
        processDir(args.dirName, usersToProcess, args.perMonth)

    ##### Output files of summaries of db/track/hub track usage information

    # Output files of db users and db use
    if args.dbCounts == True:
        dbCountsFile = open(os.path.join(outDir, "dbCounts.tsv"), "w")
    if args.dbUsers == True:
        dbUsersFile = open(os.path.join(outDir, "dbUsers.tsv"), "w")
    if args.dbUsage == True:
        dbUsageFile = open(os.path.join(outDir, "dbUsage.tsv"), "w")

    # Set of nested for loops to go through dictionary level by level and
    # output and summarize results into appropriate counts dictionary
    if any([args.dbUsage, args.dbUsers, args.dbCounts]):
        for db in dbUsers:
            dbCounts[db] = 0
            dbUsage = 0
            for hgsid in dbUsers[db]:
                # Ignore those hgsids that only used a db once
                if dbUsers[db][hgsid] != 1:
                    dbCounts[db] += 1
                # Output dbUsers info to a file
                count = dbUsers[db][hgsid]
                dbUsage += count
                if args.dbUsers == True:
                    dbUsersFile.write(db + "\t" + hgsid + "\t" + str(count) + "\n")
            # Output counts of total users for each db
            if dbCounts[db] != 0:
                if args.dbCounts == True:
                    dbCountsFile.write(db + "\t" + str(dbCounts[db]) + "\n")
                if args.dbUsage == True:
                    dbUsageFile.write(db + "\t" + str(dbUsage) + "\n")

    # Close our output files
    if args.dbCounts == True:
        print("Finished outputting dbCounts.tsv")
        dbCountsFile.close()
    if args.dbUsers == True:
        print("Finished outputting dbUsers.tsv")
        dbUsersFile.close()
    if args.dbUsage == True:
        print("Finished outputting dbUsage.tsv")
        dbUsageFile.close()

    # Output files of track users and track use
    if args.trackCounts == True:
        trackCountsFile = open(os.path.join(outDir, "trackCounts.tsv"), "w")
    if args.trackUsers == True:
        trackUsersFile = open(os.path.join(outDir, "trackUsers.tsv"), "w")
    if args.trackUsage == True:
        trackUsageFile = open(os.path.join(outDir, "trackUsage.tsv"), "w")

    # Set of nested for loops to go through dictionary level by level and
    # output and summarize results into appropriate counts dictionary
    if any([args.trackUsage, args.trackUsers, args.trackCounts]):
        for db in trackUsers:
            for track in trackUsers[db]:
                # Initialize count for track to 0
                trackCounts[db][track] = 0
                trackUsage = 0
                for hgsid in trackUsers[db][track]:
                    # Filter out those hgsids who only used track once - likely to be bots
                    if trackUsers[db][track][hgsid] != 1:
                        trackCounts[db][track] += 1
                    # Output information on how much each user used each track
                    count = trackUsers[db][track][hgsid]
                    trackUsage += count
                    if args.trackUsers == True:
                        trackUsersFile.write(db + "\t" + track + "\t" + hgsid + "\t" + str(count) + "\n")
                if trackCounts[db][track] != 0:
                    if args.trackCounts == True:
                        # Output counts of total users for each track
                        count = trackCounts[db][track]
                        trackCountsFile.write(db + "\t" + track + "\t" + str(count) + "\n")
                    if args.trackUsage == True:
                        trackUsageFile.write(db + "\t" + track + "\t" + str(trackUsage) + "\n")

    # Close our output files
    if args.trackCounts == True:
        print("Finished outputting trackCounts.tsv")
        trackCountsFile.close()
    if args.trackUsers == True:
        print("Finished outputting trackUsers.tsv")
        trackUsersFile.close()
    if args.trackUsage == True:
        print("Finished outputting trackUsage.tsv")
        trackUsageFile.close()

    # Output files of hub track users and hub track use
    if args.trackHubUsers == True:
        trackUsersHubsFile = open(os.path.join(outDir, "trackUsersHubs.tsv"), "w")
    if args.trackHubCounts == True:
        trackCountsHubsFile = open(os.path.join(outDir, "trackCountsHubs.tsv"), "w")
    if args.trackHubUsage == True:
        trackUsageHubsFile = open(os.path.join(outDir, "trackUsageHubs.tsv"), "w")

    # Set of nested for loops to go through dictionary level by level and
    # output and summarize results into appropriate counts dictionary
    if any([args.trackHubUsage, args.trackHubUsers, args.trackHubCounts]):
        for hubId in trackUsersHubs:
            hubLabel = publicHubs[hubId][1]
            for db in trackUsersHubs[hubId]:
                for track in trackUsersHubs[hubId][db]:
                    # Initialize count for hub track to 0
                    trackCountsHubs[hubId][db][track] = 0
                    trackHubUsage = 0
                    for hgsid in trackUsersHubs[hubId][db][track]:
                        # Filter out those hgsids who only used track once - likely to be bots
                        if trackUsersHubs[hubId][db][track][hgsid] != 1:
                            trackCountsHubs[hubId][db][track] += 1
                        # Output information on how much each user used each track
                        count = trackUsersHubs[hubId][db][track][hgsid]
                        trackHubUsage += count
                        if args.trackHubUsers == True:
                            trackUsersHubsFile.write(hubLabel + "\t" + db + "\t" + track + "\t" + hgsid +\
                                "\t" + str(count) + "\n")
                    if trackCountsHubs[hubId][db][track] != 0:
                        if args.trackHubCounts == True:
                            # Output counts of total users for each hub track
                            count = trackCountsHubs[hubId][db][track]
                            trackCountsHubsFile.write(hubLabel + "\t" + db + "\t" + track + "\t" +\
                                str(count) + "\n")
                        if args.trackHubUsage == True:
                            trackUsageHubsFile.write(hubLabel + "\t" + db + "\t" + track + "\t" +\
                                str(trackHubUsage) + "\n")

    # Close our output files
    if args.trackHubUsers == True:
        print("Finished outputting trackUsersHubs.tsv")
        trackUsersHubsFile.close()
    if args.trackHubCounts == True:
        print("Finished outputting trackCountsHubs.tsv")
        trackCountsHubsFile.close()
    if args.trackHubUsage == True:
        print("Finished outputting trackUsageHubs.tsv")
        trackUsageHubsFile.close()

    # Output file containing info on month/years covered by stats if indicated
    if args.monthYear == True:
        monthYearFile = open(os.path.join(outDir, "monthYear.tsv"), "w")
        for pair in monthYearSet:
            monthYearFile.write(pair + "\n")
        print("Finished outputting monthYear.tsv")
        monthYearFile.close()

    ##### Output data per month when indicated
    if args.perMonth == True:
        if args.dbUsers == True:
            dbUsersMonthFile = open(os.path.join(outDir, "dbUsers.perMonth.tsv"), "w")
        if args.dbCounts == True:
            dbCountsMonthFile = open(os.path.join(outDir, "dbCounts.perMonth.tsv"), "w")
        if args.dbUsage == True:
            dbUsageMonthFile = open(os.path.join(outDir, "dbUsage.perMonth.tsv"), "w")

        # Set of nested for loops to go through dictionary level by level and
        # output and summarize results into appropriate counts dictionary
        if any([args.dbUsage, args.dbUsers, args.dbCounts]):
            for db in dbUsersMonth:
                for year in dbUsersMonth[db]:
                    for month in dbUsersMonth[db][year]:
                        dbCountsMonth[db][year][month] = 0
                        dbUsageMonth = 0
                        for hgsid in dbUsersMonth[db][year][month]:
                            if dbUsersMonth[db][year][month][hgsid] != 1:
                                dbCountsMonth[db][year][month] += 1
                            # Output dbUsersMonth info to a file
                            count = dbUsersMonth[db][year][month][hgsid]
                            dbUsageMonth += count
                            if args.dbUsers == True:
                                dbUsersMonthFile.write(db + "\t" + year + "\t" + month +\
                                    "\t" + hgsid + "\t" + str(count) + "\n")
                        # Output dbCounts to a file
                        if dbCountsMonth[db][year][month] != 0:
                            if args.dbCounts == True:
                                count = dbCountsMonth[db][year][month]
                                dbCountsMonthFile.write(db + "\t" + year + "\t" + month +\
                                    "\t" + str(count) + "\n")
                            if args.dbUsage == True:
                                dbUsageMonthFile.write(db + "\t" + year + "\t" + month +\
                                    "\t" + str(dbUsageMonth) + "\n")

        if args.dbUsers == True:
            print("Finished outputting dbUsers.perMonth.tsv")
            dbUsersMonthFile.close()
        if args.dbCounts == True:
            print("Finished outputting dbCounts.perMonth.tsv")
            dbCountsMonthFile.close()
        if args.dbUsage == True:
            print("Finished outputting dbUsage.perMonth.tsv")
            dbUsageMonthFile.close()

        # Summarize user dictionaries to create counts per db/track/hub track
        if args.trackUsers == True:
            trackUsersMonthFile = open(os.path.join(outDir, "trackUsers.perMonth.tsv"), "w")
        if args.trackCounts == True:
            trackCountsMonthFile = open(os.path.join(outDir, "trackCounts.perMonth.tsv"), "w")
        if args.trackUsage == True:
            trackUsageMonthFile = open(os.path.join(outDir, "trackUsage.perMonth.tsv"), "w")

        # Set of nested for loops to go through dictionary level by level and
        # output and summarize results into appropriate counts dictionary
        if any([args.trackUsage, args.trackUsers, args.trackCounts]):
            for db in trackUsersMonth:
                for year in trackUsersMonth[db]:
                    for month in trackUsersMonth[db][year]:
                        for track in trackUsersMonth[db][year][month]:
                            trackCountsMonth[db][year][month][track] = 0
                            trackUsageMonth = 0
                            for hgsid in trackUsersMonth[db][year][month][track]:
                                if trackUsersMonth[db][year][month][track][hgsid] != 1:
                                    trackCountsMonth[db][year][month][track] += 1
                                count = trackUsersMonth[db][year][month][track][hgsid]
                                trackUsageMonth += count
                                if args.trackUsers == True:
                                    trackUsersMonthFile.write(db + "\t" + year + "\t" + month + "\t" +\
                                        hgsid + "\t" + track + "\t" + str(count) + "\n")
                            if trackCountsMonth[db][year][month][track] != 0:
                                if args.trackCounts == True:
                                    count = trackCountsMonth[db][year][month][track]
                                    trackCountsMonthFile.write(db + "\t" + year + "\t" + month + "\t" +\
                                        track + "\t" + str(count) + "\n")
                                if args.trackUsage == True:
                                    trackUsageMonthFile.write(db + "\t" + year + "\t" + month + "\t" +\
                                        track + "\t" + str(trackUsageMonth) + "\n")

        if args.trackUsers == True:
            print("Finished outputting trackUsers.perMonth.tsv")
            trackUsersMonthFile.close()
        if args.trackCounts == True:
            print("Finished outputting trackCounts.perMonth.tsv")
            trackCountsMonthFile.close()
        if args.trackUsage == True:
            print("Finished outputting trackUsage.perMonth.tsv")
            trackUsageMonthFile.close()

        # Summarize user dictionaries to create counts per db/track/hub track
        if args.trackHubUsers == True:
            trackUsersHubsMonthFile = open(os.path.join(outDir, "trackUsersHubs.perMonth.tsv"), "w")
        if args.trackHubCounts == True:
            trackCountsHubsMonthFile = open(os.path.join(outDir, "trackCountsHubs.perMonth.tsv"), "w")
        if args.trackHubUsage == True:
            trackUsageHubsMonthFile = open(os.path.join(outDir, "trackUsageHubs.perMonth.tsv"), "w")

        # Set of nested for loops to go through dictionary level by level and
        # output and summarize results into appropriate counts dictionary
        if any([args.trackHubUsage, args.trackHubUsers, args.trackHubCounts]):
            for hubId in trackUsersHubsMonth:
                hubLabel = publicHubs[hubId][1]
                for db in trackUsersHubsMonth[hubId]:
                    for year in trackUsersHubsMonth[hubId][db]:
                        for month in trackUsersHubsMonth[hubId][db][year]:
                            for track in trackUsersHubsMonth[hubId][db][year][month]:
                                trackCountsHubsMonth[hubId][db][year][month][track] = 0
                                trackHubUsageMonth = 0
                                for hgsid in trackUsersHubsMonth[hubId][db][year][month][track]:
                                    if trackUsersHubsMonth[hubId][db][year][month][track][hgsid] != 1:
                                        trackCountsHubsMonth[hubId][db][year][month][track] += 1
                                    count = trackUsersHubsMonth[hubId][db][year][month][track][hgsid]
                                    trackHubUsageMonth += count
                                    if args.trackHubUsers == True:
                                        trackUsersHubsMonthFile.write(hubLabel + "\t" + db + "\t" + year +\
                                            "\t" + month + "\t" + track + "\t" + hgsid + "\t" + str(count) + "\n")
                                if trackCountsHubsMonth[hubId][db][year][month][track] != 0:
                                    if args.trackHubCounts == True:
                                        count = trackCountsHubsMonth[hubId][db][year][month][track]
                                        trackCountsHubsMonthFile.write(hubLabel + "\t" + db + "\t" + year +\
                                            "\t" + month + "\t" + track + "\t" +\
                                            str(count) + "\n")
                                    if args.trackHubUsage == True:
                                        trackUsageHubsMonthFile.write(hubLabel + "\t" + db + "\t" + year +\
                                            "\t" + month + "\t" + track + "\t" +\
                                            str(trackHubUsageMonth) + "\n")

        if args.trackHubUsers == True:
            print("Finished outputting trackUsersHubs.perMonth.tsv")
            trackUsersHubsMonthFile.close()
        if args.trackHubCounts == True:
            print("Finished outputting trackCountsHubs.perMonth.tsv")
            trackCountsHubsMonthFile.close()
        if args.trackHubUsage == True:
            print("Finished outputting trackUsageHubs.perMonth.tsv")
            trackUsageHubsMonthFile.close()

    #####
    ##### Output json files if indicated
    #####
    if args.jsonOut == True:
        dumpToJson(dbCounts, "dbCounts.json", outDir)
        dumpToJson(trackCounts, "trackCounts.json", outDir)
        dumpToJson(trackCountsHubs, "trackCountsHubs.json", outDir)
        if args.perMonth == True:
            dumpToJson(dbCountsMonth, "dbCounts.perMonth.json", outDir)
            dumpToJson(trackCountsMonth, "trackCounts.perMonth.json", outDir)
            dumpToJson(trackCountsHubsMonth, "trackCountsHubs.perMonth.json", outDir)
        #if args.monthYear == True:
        #    dumpToJson(monthYearSet, "monthYearSet.json")

    #####
    ##### Output information on default track usage if indicated
    #####
    if args.outputDefaults == True and all([args.dbCounts, args.trackCounts]):
        # Sort dbs by most popular
        dbCountsSorted = sorted(dbCounts.items(), key=operator.itemgetter(1))
        dbCountsSorted.reverse()
        defaultCountsFile = open(os.path.join(outDir, "defaultCounts.tsv"), "w")
        for x in range(0, 15):
            # Will only output the default track stats for the 15 most popular assemblies
            db = dbCountsSorted[x][0]
            dbOpt = "db=" + db
            # HGDB_CONF must be set here so that we use default tracks from beta, not dev
            # Dev can contain staged tracks that don't exist on RR, leading to errors later in script
-            cmd = ["cd /usr/local/apache/cgi-bin && HGDB_CONF=$HOME/.hg.conf.beta ./hgTracks " + dbOpt]
+            cmd = ["cd /usr/local/apache/cgi-bin && HGDB_CONF=$HOME/.hg.conf.beta ./hgTracks hgt.trackImgOnly=1 " + dbOpt]
            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            cmdout, cmderr = p.communicate()
            errText = cmderr.decode("ASCII") # Convert binary output into ASCII for processing
            # Process stderr output as that's what contains the trackLog lines
            splitErrText = errText.split("\n")
            trackLog = splitErrText[0] # First element is trackLog line, second is CGI_TIME; only want trackLog
            splitLine = trackLog.split(" ")
            # Build list of tracks
            tracks = splitLine[4]
            tracks = tracks.split(",")
            dbUse = dbCounts[db]
            # output list to file that contains column headings
            defaultCountsFile.write("#db\ttrackName\ttrackUse\t% using\t% turning off\n#" + db + "\t" + str(dbUse) + "\n")
            defaultCounts = []
            for track in tracks:
                if track == "":
                    continue
                # Remove trailing characters
                track = track[:-2]
                try:
                    trackUse = trackCounts[db][track]
                    relUse = (trackUse/dbUse)*100
                    relOff = ((dbUse - trackUse)/dbUse)*100
                    # Store all this info in a list so that we can sort by most used tracks later
                    defaultCounts.append([track, trackUse, relUse, relOff])
                except KeyError:
                    continue
            # Sort defaultCounts for current assembly by most used track first
            defaultCountsSorted = sorted(defaultCounts, key=operator.itemgetter(2))
            defaultCountsSorted.reverse()
            # Output sorted defaultCounts to a file
            for line in defaultCountsSorted:
                track = line[0]
                use = line[1]
                on = line[2]
                off = line[3]
                output = "{}\t{}\t{:d}\t{:3.2f}\t{:3.2f}\n".format(db, track, use, on, off)
                defaultCountsFile.write(output)
        print("Finished outputting defaultCounts.tsv")
        defaultCountsFile.close()
    elif args.outputDefaults == True:
        if not any([args.dbCounts, args.trackCounts]):
            # Need both dbCounts and trackCounts to actually be populated with data to be able to do the default counts
            print("\nCannot output default tracks if either --trackCounts or --dbCounts is not set. Set both of these options and re-run")

    ####
    # Print output directory name, particularly useful if using default output directory names
    # based on day/time
    print("\nYour output is in", outDir)

if __name__ == "__main__":
    main()
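
For reference, below is a minimal standalone sketch of the default-track query that this commit touches. It is not part of the commit: the cgi-bin path, the beta hg.conf, and the "hg38" example assembly are copied from the script above purely for illustration, and the exact layout of the trackLog line on stderr is assumed to match the comments in the script rather than verified here.

#!/usr/bin/env python3
# Illustrative sketch only (not part of the commit above): ask hgTracks for the
# default track list of one assembly using the hgt.trackImgOnly=1 parameter.
import subprocess

db = "hg38"  # example assembly; any db present in dbCounts would do
cmd = ("cd /usr/local/apache/cgi-bin && "
       "HGDB_CONF=$HOME/.hg.conf.beta ./hgTracks hgt.trackImgOnly=1 db=" + db)
p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
cmdout, cmderr = p.communicate()

# As in generateUsageStats.py, the first stderr line is assumed to be the trackLog line,
# e.g. "trackLog 0 hg38 hgsid_### cytoBandIdeo:1,cloneEndCTD:2"
trackLog = cmderr.decode("ASCII").split("\n")[0]
fields = trackLog.split(" ")
if len(fields) > 4:
    # Field 4 holds the comma-separated track list; strip the trailing ":N" visibility code
    defaultTracks = [t[:-2] for t in fields[4].split(",") if t != ""]
    print(db, "default tracks:", defaultTracks)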