01660f51199249d8a05b8e863bea2f551920d8c9
chmalee
  Tue Aug 27 14:56:15 2024 -0700
Add log trimming scripts to source tree so QA can run them, refs Lou email

diff --git src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
deleted file mode 100755
index 6226921..0000000
--- src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
+++ /dev/null
@@ -1,115 +0,0 @@
-#!/cluster/software/bin/python3
-
-"""
-Go through an Apache error log and combine multiple trackLog lines into
-a single line, and keep only relevant fields.
-"""
-
-import os,sys,gzip,argparse,re
-from collections import defaultdict
-
-useDict = defaultdict(list)
-
-def parseCommandLine():
-    parser = argparse.ArgumentParser(description="", add_help=True, usage="%(prog)s [options]")
-    parser.add_argument("infile", action="store", default=None, help="Input Apache error_log file, use 'stdin' to read from standard input")
-    parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Log verbose output to stderr")
-    args = parser.parse_args()
-    return args
-
-def addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks):
-    """Add a new usage onto the usageDict list, or add more tracks onto the end of a previous usage."""
-    global useDict
-    key = "|".join([pid,ip,hgsid])
-    if logPart == 0:
-        val = "|".join([date,db,tracks])
-        useDict[key].append(val)
-    else:
-        # be sure to add a comma in case apache failed to add a referer:
-        if useDict[key][-1][-1] != ",":
-            useDict[key][-1] += "," + tracks
-        else:
-            useDict[key][-1] += tracks
-
-def processPrefix(line):
-    """Split the apache error log parts between brackets into a list of elements."""
-    ret = []
-    current = ""
-    for i in range(len(line)):
-        c = line[i]
-        if c == "]" or (not i != len(line) - 1 and current):
-            # the 'AH01215:' before the trackLog statement, a relatively recent Apache addition
-            if c != "]": 
-                current += c
-            ret.append(current.strip())
-            current = ""
-        elif c == "[":
-            current = ""
-        else:
-            current += c
-    return ret
-
-def parseTrackLogLine(line, verbose):
-    splitLine = line.strip().split("trackLog")
-
-    try:
-        # the apache stuff
-        prefix = splitLine[0].strip()
-        prefixParts = processPrefix(prefix)
-        date = prefixParts[0]
-        pid = prefixParts[2].split("pid ")[1]
-        ip = prefixParts[3].split("client ")[1]
-
-        # our stuff
-        suffixParts = splitLine[1].strip().split(' ')
-        logPart = int(suffixParts[0])
-        db = suffixParts[1]
-        hgsid = suffixParts[2]
-        tracks = ""
-        if len(suffixParts) > 3:
-            tracks = suffixParts[3]
-    except IndexError:
-        sys.stderr.write("offending line:\n%s\n" % line)
-        sys.exit(1)
-    return date, pid, ip, logPart, db, hgsid, tracks
-
-def trimLog(infileHandle, verbose):
-    lineCount = 0
-    matchCount = 0
-    # use a re here because there are also "trackLog position" lines too
-    trackLogMatch = re.compile("trackLog [0-9]+")
-    for line in infileHandle:
-        lineCount += 1
-        if "str" not in str(type(line)):
-            line = line.decode("ASCII")
-        if trackLogMatch.search(line):
-            matchCount += 1
-            date, pid, ip, logPart, db, hgsid, tracks = parseTrackLogLine(line, verbose)
-            addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks)
-
-    if verbose:
-        sys.stderr.write("Processed %d lines and found %d trackLog lines, trimmed to %d usages\n" % (
-            lineCount, matchCount, len(useDict)))
-
-def dumpTrimmed():
-    for key in useDict:
-        splitKey = key.split("|")
-        for usage in useDict[key]:
-            splitVal = usage.split("|")
-            print("%s\t%s" % ("\t".join(splitKey), "\t".join(splitVal)))
-
-def main():
-    args = parseCommandLine()
-    if args.infile == "stdin":
-        infh = sys.stdin
-    elif args.infile[-3:] == ".gz":
-        infh = gzip.open(args.infile, "rb")
-    else:
-        infh = open(args.infile, "rb")
-
-    trimLog(infh, args.verbose)
-    dumpTrimmed()
-    infh.close()
-
-if __name__ == "__main__":
-    main()