#!/cluster/software/bin/python3
#
# Provenance: contents of src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
# (mode 100755, blob 6226921) as shown in the deletion diff of commit
# 01660f51199249d8a05b8e863bea2f551920d8c9 by chmalee,
# Tue Aug 27 14:56:15 2024 -0700:
# "Add log trimming scripts to source tree so QA can run them, refs Lou email"

"""
Go through an Apache error log and combine multiple trackLog lines into
a single line, and keep only relevant fields.

One tab-separated line is printed to stdout per usage, with the fields
pid, client, hgsid, date, db and tracks (in that order).
"""

import argparse
import gzip
import os
import re
import sys
from collections import defaultdict

# Maps "pid|ip|hgsid" to a list of "date|db|tracks" strings, one per usage.
useDict = defaultdict(list)

# A regex is needed because there are also "trackLog position" lines,
# which must not be treated as track usage lines.
_TRACK_LOG_RE = re.compile(r"trackLog [0-9]+")


def parseCommandLine(argv=None):
    """Parse the command line and return the argparse namespace.

    argv: optional list of argument strings; when None (the default)
        argparse reads sys.argv[1:].

    The namespace has 'infile' (a path, or the literal 'stdin') and
    'verbose' (bool).
    """
    parser = argparse.ArgumentParser(
        description="Combine the multi-part trackLog lines of an Apache "
                    "error_log into one line per usage.",
        add_help=True,
        usage="%(prog)s [options] infile")
    parser.add_argument("infile", action="store", default=None,
                        help="Input Apache error_log file, use 'stdin' to read from standard input")
    parser.add_argument("-v", "--verbose", action="store_true", default=False,
                        help="Log verbose output to stderr")
    return parser.parse_args(argv)


def addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks):
    """Add a new usage onto the useDict list, or add more tracks onto the end of a previous usage.

    A line with logPart == 0 starts a new usage for the pid/ip/hgsid key;
    any other logPart is appended to the most recent usage of that key.
    """
    key = "|".join([pid, ip, hgsid])
    usages = useDict[key]
    if logPart == 0 or not usages:
        # A continuation line whose part 0 was never seen (for instance when
        # the log starts in the middle of a sequence) has nothing to merge
        # into, so it starts a usage of its own instead of crashing.
        usages.append("|".join([date, db, tracks]))
        return
    previous = usages[-1]
    if previous.endswith((",", "|")):
        # Already comma terminated, or the track list is still empty
        # (the value then ends with the "|" field separator).
        usages[-1] = previous + tracks
    else:
        # be sure to add a comma in case apache failed to add a referer:
        usages[-1] = previous + "," + tracks


def processPrefix(line):
    """Split the apache error log parts between brackets into a list of elements.

    Every "[...]" group yields one stripped element.  Text between a "]"
    and the next "[" is discarded.  Non-blank text after the final "]"
    (the 'AH01215:' before the trackLog statement, a relatively recent
    Apache addition) is returned as the last element.
    """
    ret = []
    current = []
    for c in line:
        if c == "]":
            ret.append("".join(current).strip())
            current = []
        elif c == "[":
            current = []
        else:
            current.append(c)
    trailing = "".join(current).strip()
    if trailing:
        ret.append(trailing)
    return ret


def _dieOnBadLine(line):
    """Report a trackLog line that cannot be parsed on stderr and exit with status 1."""
    sys.stderr.write("offending line:\n%s\n" % line)
    sys.exit(1)


def parseTrackLogLine(line, verbose):
    """Parse one trackLog line of the error log.

    line: the complete log line (str).
    verbose: accepted for symmetry with trimLog; not used here.

    Returns the tuple (date, pid, ip, logPart, db, hgsid, tracks) where
    logPart is an int and every other element is a str; tracks is "" when
    the line has no track list.  Exits the program with status 1, after
    writing the offending line to stderr, when the line is malformed.
    """
    # Only the first "trackLog" separates the Apache prefix from our part,
    # so a track name containing that text is left intact.
    prefix, marker, suffix = line.strip().partition("trackLog")
    if not marker:
        _dieOnBadLine(line)

    try:
        # the apache stuff
        prefixParts = processPrefix(prefix.strip())
        date = prefixParts[0]
        pid = prefixParts[2].split("pid ")[1]
        ip = prefixParts[3].split("client ")[1]

        # our stuff
        suffixParts = suffix.strip().split(" ")
        logPart = int(suffixParts[0])
        db = suffixParts[1]
        hgsid = suffixParts[2]
        tracks = suffixParts[3] if len(suffixParts) > 3 else ""
    except (IndexError, ValueError):
        _dieOnBadLine(line)
    return date, pid, ip, logPart, db, hgsid, tracks


def trimLog(infileHandle, verbose):
    """Read every line of infileHandle and record its trackLog usages in useDict.

    infileHandle: any iterable of lines, either str or bytes.
    verbose: when true, write a summary of the counts to stderr.
    """
    lineCount = 0
    matchCount = 0
    for line in infileHandle:
        lineCount += 1
        if isinstance(line, bytes):
            # Error logs can contain arbitrary bytes (referers, user input);
            # never let an undecodable byte abort the whole run.
            line = line.decode("utf-8", errors="replace")
        if _TRACK_LOG_RE.search(line):
            matchCount += 1
            date, pid, ip, logPart, db, hgsid, tracks = parseTrackLogLine(line, verbose)
            addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks)

    if verbose:
        # Count usages (output lines), not distinct pid/ip/hgsid keys.
        usageCount = sum(len(usages) for usages in useDict.values())
        sys.stderr.write("Processed %d lines and found %d trackLog lines, trimmed to %d usages\n" % (
            lineCount, matchCount, usageCount))


def dumpTrimmed():
    """Print every usage in useDict to stdout as one tab-separated line."""
    for key, usages in useDict.items():
        keyFields = key.split("|")
        for usage in usages:
            print("\t".join(keyFields + usage.split("|")))


def main():
    """Trim the log named on the command line and print the result."""
    args = parseCommandLine()
    if args.infile == "stdin":
        # Read raw bytes where possible so decoding is handled in trimLog.
        trimLog(getattr(sys.stdin, "buffer", sys.stdin), args.verbose)
    else:
        opener = gzip.open if args.infile.endswith(".gz") else open
        with opener(args.infile, "rb") as infh:
            trimLog(infh, args.verbose)
    dumpTrimmed()


if __name__ == "__main__":
    main()