#!/cluster/software/bin/python3

# Source: src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
# Commit 401352088c5776094516da2b9f617ae5a1357bad
# (chmalee, Mon Jul 20 15:14:28 2020 -0700):
#   Adding apache error log trimmer used to make counting database and track
#   usage faster and easier

"""
Go through an Apache error log and combine multiple trackLog lines into
a single line, and keep only relevant fields.

Output is written to stdout, one tab-separated line per usage:
pid, ip, hgsid, date, db, tracks.
"""

import os
import sys
import gzip
import argparse
import re
from collections import defaultdict

# Maps "pid|ip|hgsid" to a list of "date|db|tracks" strings, one per usage.
useDict = defaultdict(list)

# Use a regex because there are also "trackLog position" lines, which must
# not be treated as track usage lines.
_TRACK_LOG_RE = re.compile(r"trackLog [0-9]+")


def parseCommandLine():
    """Parse the command line and return the argparse namespace.

    The namespace carries ``infile`` (a path, or the literal 'stdin') and
    the boolean ``verbose`` flag.
    """
    parser = argparse.ArgumentParser(description="", add_help=True, usage="%(prog)s [options]")
    parser.add_argument("infile", action="store", default=None, help="Input Apache error_log file, use 'stdin' to read from standard input")
    parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Log verbose output to stderr")
    args = parser.parse_args()
    return args


def addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks):
    """Add a new usage onto the useDict list, or add more tracks onto the end of a previous usage.

    A usage is keyed by pid, ip and hgsid.  ``logPart == 0`` starts a new
    usage; any other value appends ``tracks`` to the most recent usage for
    the same key.  If a continuation arrives with no earlier usage for its
    key (for example the log starts in the middle of a multi-line entry),
    it starts a new usage instead of raising IndexError.
    """
    key = "|".join([pid, ip, hgsid])
    usages = useDict[key]
    if logPart == 0 or not usages:
        usages.append("|".join([date, db, tracks]))
    else:
        usages[-1] += tracks


def processPrefix(line):
    """Split the apache error log parts between brackets into a list of elements.

    Each "[...]" group yields one stripped element.  Any non-blank text
    left after the final "]" is returned as a last element.
    """
    ret = []
    chars = []
    for c in line:
        if c == "]":
            ret.append("".join(chars).strip())
            chars = []
        elif c == "[":
            # Text outside of brackets that precedes a new group is discarded.
            chars = []
        else:
            chars.append(c)
    # Trailing text is the 'AH01215:' before the trackLog statement, a
    # relatively recent Apache addition.
    tail = "".join(chars).strip()
    if tail:
        ret.append(tail)
    return ret


def parseTrackLogLine(line, verbose):
    """Parse one trackLog line into its fields.

    Returns the tuple (date, pid, ip, logPart, db, hgsid, tracks), where
    ``logPart`` is an int and ``tracks`` is "" when the line lists none.
    A line that cannot be parsed is written to stderr and the program
    exits with status 1.  ``verbose`` is accepted but not used.
    """
    # Split on the first "trackLog" only, so the track list is kept whole.
    splitLine = line.strip().split("trackLog", 1)
    try:
        # the apache stuff
        prefixParts = processPrefix(splitLine[0].strip())
        date = prefixParts[0]
        pid = prefixParts[2].split("pid ")[1]
        ip = prefixParts[3].split("client ")[1]

        # our stuff
        suffixParts = splitLine[1].strip().split(" ")
        logPart = int(suffixParts[0])
        db = suffixParts[1]
        hgsid = suffixParts[2]
        tracks = suffixParts[3] if len(suffixParts) > 3 else ""
    except (IndexError, ValueError):
        # ValueError covers a non-numeric logPart, so that case also names
        # the offending line instead of dying with a bare traceback.
        sys.stderr.write("offending line:\n%s\n" % line)
        sys.exit(1)
    return date, pid, ip, logPart, db, hgsid, tracks


def trimLog(infileHandle, verbose):
    """Read log lines from ``infileHandle`` and record every trackLog usage in useDict.

    Lines may be str or bytes.  Bytes are decoded as ASCII, with
    undecodable bytes replaced rather than aborting the run.  When
    ``verbose`` is true a summary is written to stderr.
    """
    lineCount = 0
    matchCount = 0
    for line in infileHandle:
        lineCount += 1
        if isinstance(line, (bytes, bytearray)):
            line = line.decode("ascii", errors="replace")
        if _TRACK_LOG_RE.search(line):
            matchCount += 1
            addOrMergeUsage(*parseTrackLogLine(line, verbose))

    if verbose:
        # Count usages, not keys: one key can hold several usages, and
        # dumpTrimmed() prints one line per usage.
        usageCount = sum(len(usages) for usages in useDict.values())
        sys.stderr.write("Processed %d lines and found %d trackLog lines, trimmed to %d usages\n" % (
            lineCount, matchCount, usageCount))


def dumpTrimmed():
    """Print every recorded usage to stdout as one tab-separated line."""
    for key, usages in useDict.items():
        keyFields = "\t".join(key.split("|"))
        for usage in usages:
            print("%s\t%s" % (keyFields, "\t".join(usage.split("|"))))


def main():
    """Trim the log named on the command line and print the result."""
    args = parseCommandLine()
    if args.infile == "stdin":
        trimLog(sys.stdin, args.verbose)
    else:
        opener = gzip.open if args.infile.endswith(".gz") else open
        # The with-block closes the file even if parsing fails.
        with opener(args.infile, "rb") as infh:
            trimLog(infh, args.verbose)
    dumpTrimmed()


if __name__ == "__main__":
    main()