src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs 401352088c5776094516da2b9f617ae5a1357bad

401352088c5776094516da2b9f617ae5a1357bad
chmalee
  Mon Jul 20 15:14:28 2020 -0700
Adding apache error log trimmer used to make counting database and track usage faster and easier

diff --git src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
new file mode 100755
index 0000000..a1f6934
--- /dev/null
+++ src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
@@ -0,0 +1,111 @@
+#!/cluster/software/bin/python3
+
+"""
+Go through an Apache error log and combine multiple trackLog lines into
+a single line, and keep only relevant fields.
+"""
+
+import os,sys,gzip,argparse,re
+from collections import defaultdict
+
+# Every usage seen so far: maps the key "pid|ip|hgsid" to a list of
+# "date|db|tracks" strings. Filled by addOrMergeUsage(), printed by dumpTrimmed().
+useDict = defaultdict(list)
+
+def parseCommandLine():
+    """Parse sys.argv and return the resulting argparse namespace.
+
+    The namespace carries 'infile' (the Apache error_log to read, or the
+    literal string 'stdin') and 'verbose' (True when -v/--verbose was given).
+    argparse itself exits the program on a malformed command line.
+    """
+    parser = argparse.ArgumentParser(
+        description="Combine the multi-part trackLog lines of an Apache error_log "
+                    "into one line per usage, keeping only the relevant fields.",
+        add_help=True,
+        # the usage line has to name the required positional argument as well
+        usage="%(prog)s [options] infile")
+    parser.add_argument("infile", action="store", default=None,
+        help="Input Apache error_log file, use 'stdin' to read from standard input")
+    parser.add_argument("-v", "--verbose", action="store_true", default=False,
+        help="Log verbose output to stderr")
+    return parser.parse_args()
+
+def addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks):
+    """Add a new usage onto the useDict list, or add more tracks onto the end of a previous usage.
+
+    Usages are grouped under the key "pid|ip|hgsid". A line with logPart 0
+    starts a new "date|db|tracks" entry for that key; any other logPart is a
+    continuation whose tracks string is appended to the newest entry.
+    """
+    key = "|".join([pid, ip, hgsid])
+    usages = useDict[key]
+    if logPart == 0 or not usages:
+        # A continuation with no part 0 in front of it (for example when the
+        # log starts in the middle of a request) used to die with an
+        # IndexError on the empty list. Start a usage from it instead so its
+        # tracks are not lost.
+        usages.append("|".join([date, db, tracks]))
+    else:
+        usages[-1] += tracks
+
+def processPrefix(line):
+    """Split the apache error log parts between brackets into a list of elements.
+
+    Every "[...]" group becomes one whitespace-stripped element. Text that
+    follows the last "]" is returned as a final element; in practice this is
+    the 'AH01215:' before the trackLog statement, a relatively recent Apache
+    addition. Text sitting directly in front of a "[" is discarded.
+    """
+    ret = []
+    current = ""
+    for c in line:
+        if c == "]":
+            ret.append(current.strip())
+            current = ""
+        elif c == "[":
+            current = ""
+        else:
+            current += c
+    # Whatever is left over was never closed by a "]". Flushing it once after
+    # the loop (rather than on the last character, which only worked when
+    # 'current' was already non-empty) also keeps a lone trailing character,
+    # e.g. the "b" of "[a]b".
+    trailing = current.strip()
+    if trailing:
+        ret.append(trailing)
+    return ret
+
+def parseTrackLogLine(line, verbose):
+    """Pull the fields we keep out of one trackLog line of the error log.
+
+    The line is cut at "trackLog": the part in front is the Apache prefix,
+    the part behind is "<logPart> <db> <hgsid> [<tracks>]" separated by
+    single spaces.
+
+    Returns the tuple (date, pid, ip, logPart, db, hgsid, tracks). logPart is
+    an int, everything else a string; tracks is "" when the line has none.
+    date is the first prefix element, pid is the text after "pid " in the
+    third one and ip the text after "client " in the fourth one.
+
+    A line that does not have this shape is written to stderr and the
+    program exits with status 1. The verbose argument is currently unused.
+    """
+    splitLine = line.strip().split("trackLog")
+
+    try:
+        # the apache stuff
+        prefixParts = processPrefix(splitLine[0].strip())
+        date = prefixParts[0]
+        pid = prefixParts[2].split("pid ")[1]
+        ip = prefixParts[3].split("client ")[1]
+
+        # our stuff
+        suffixParts = splitLine[1].strip().split(' ')
+        logPart = int(suffixParts[0])
+        db = suffixParts[1]
+        hgsid = suffixParts[2]
+        tracks = suffixParts[3] if len(suffixParts) > 3 else ""
+    except (IndexError, ValueError):
+        # ValueError: the text after the first "trackLog" does not start with
+        # an integer part number; report it like any other malformed line
+        # instead of dying with a bare traceback.
+        sys.stderr.write("offending line:\n%s\n" % line)
+        sys.exit(1)
+    return date, pid, ip, logPart, db, hgsid, tracks
+
+def trimLog(infileHandle, verbose):
+    """Read a log line by line and store every 'trackLog <number>' line in useDict.
+
+    infileHandle may yield str or bytes lines; bytes are decoded as ASCII.
+    Matching lines are parsed by parseTrackLogLine() and recorded through
+    addOrMergeUsage(). With verbose set, a one-line summary of the line,
+    match and usage counts is written to stderr at the end.
+    """
+    lineCount = 0
+    matchCount = 0
+    # use a re here because there are also "trackLog position" lines too
+    trackLogMatch = re.compile("trackLog [0-9]+")
+    for line in infileHandle:
+        lineCount += 1
+        if not isinstance(line, str):
+            # Files are opened in binary mode. Replace bytes that are not
+            # ASCII rather than letting one stray byte abort the whole run.
+            line = line.decode("ASCII", errors="replace")
+        if trackLogMatch.search(line):
+            matchCount += 1
+            date, pid, ip, logPart, db, hgsid, tracks = parseTrackLogLine(line, verbose)
+            addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks)
+
+    if verbose:
+        # len(useDict) would only count the distinct pid|ip|hgsid keys; each
+        # key can hold several usages, so add up the lists.
+        usageCount = sum(len(usages) for usages in useDict.values())
+        sys.stderr.write("Processed %d lines and found %d trackLog lines, trimmed to %d usages\n" % (
+            lineCount, matchCount, usageCount))
+
+def dumpTrimmed():
+    """Print each stored usage to stdout as one tab-separated line.
+
+    The "|"-separated fields of the useDict key come first, followed by the
+    "|"-separated fields of the usage itself.
+    """
+    for key, usages in useDict.items():
+        keyColumns = "\t".join(key.split("|"))
+        for usage in usages:
+            usageColumns = "\t".join(usage.split("|"))
+            print("%s\t%s" % (keyColumns, usageColumns))
+
+def main():
+    """Open the input named on the command line, trim it and print the result.
+
+    'stdin' reads from standard input, a name ending in '.gz' is opened
+    through gzip, anything else as a plain file. Files are read in binary
+    mode; trimLog() does the decoding.
+    """
+    args = parseCommandLine()
+    if args.infile == "stdin":
+        infh = sys.stdin
+    elif args.infile.endswith(".gz"):
+        infh = gzip.open(args.infile, "rb")
+    else:
+        infh = open(args.infile, "rb")
+
+    try:
+        trimLog(infh, args.verbose)
+        dumpTrimmed()
+    finally:
+        # Close the file even when a malformed line makes parseTrackLogLine()
+        # exit; leave sys.stdin alone since we did not open it.
+        if infh is not sys.stdin:
+            infh.close()
+
+# Only run when executed as a script, not when the file is imported.
+if __name__ == "__main__":
+    main()