#!/cluster/software/bin/python3

"""
Go through an Apache error log and combine multiple trackLog lines into a single line,
and keep only relevant fields.

Each usage is printed to stdout as one tab-separated line:
    pid, client, hgsid, date, db, tracks
"""

# Provenance: reconstructed from src/hg/logCrawl/dbTrackAndSearchUsage/trimTrackLogs
# as of commit b01e95110f02879f2c581743c81655b1011fb350
# (chmalee, Tue Jul 21 16:52:34 2020 -0700):
# "Fix bug in error log trimmer when munging two lines together"

import os
import sys
import gzip
import argparse
import re
from collections import defaultdict

# Maps "pid|client|hgsid" -> list of "date|db|tracks" strings, one per usage.
useDict = defaultdict(list)

# use a re here because there are also "trackLog position" lines too
_TRACK_LOG_PATTERN = re.compile(r"trackLog [0-9]+")


def parseCommandLine():
    """Parse the command line and return the argparse namespace (infile, verbose)."""
    parser = argparse.ArgumentParser(description="", add_help=True,
        usage="%(prog)s [options]")
    parser.add_argument("infile", action="store", default=None,
        help="Input Apache error_log file, use 'stdin' to read from standard input")
    parser.add_argument("-v", "--verbose", action="store_true", default=False,
        help="Log verbose output to stderr")
    return parser.parse_args()


def addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks):
    """Add a new usage onto the useDict list, or add more tracks onto the end of a
    previous usage.

    date, pid, ip, db, hgsid and tracks are strings; logPart is the integer part
    number of the trackLog line. Part 0 always starts a new usage for the
    pid|ip|hgsid key. Any other part is appended to the most recent usage for
    that key.
    """
    key = "|".join([pid, ip, hgsid])
    usages = useDict[key]
    if logPart == 0 or not usages:
        # A continuation line with no earlier part for this key (e.g. the log
        # starts in the middle of a request) is kept as its own usage instead
        # of failing on an empty list.
        usages.append("|".join([date, db, tracks]))
        return

    previous = usages[-1]
    # be sure to add a comma in case apache failed to add a referer.
    # No comma is needed when the previous part already ends with one, or when
    # it had no tracks at all (the stored value then ends with the '|' separator).
    separator = "" if previous.endswith((",", "|")) else ","
    usages[-1] = previous + separator + tracks


def processPrefix(line):
    """Split the apache error log parts between brackets into a list of elements.

    Text is collected until a closing ']' and then stored with surrounding
    whitespace removed; an opening '[' discards whatever was collected so far.
    Text still pending when the last character is reached is stored as well.
    """
    ret = []
    current = ""
    lastIndex = len(line) - 1
    for i, c in enumerate(line):
        if c == "]" or (i == lastIndex and current):
            # the 'AH01215:' before the trackLog statement, a relatively recent
            # Apache addition, is not bracketed so it is flushed at end of line
            if c != "]":
                current += c
            ret.append(current.strip())
            current = ""
        elif c == "[":
            current = ""
        else:
            current += c
    return ret


def parseTrackLogLine(line, verbose):
    """Pull the fields of interest out of one trackLog line.

    The text before "trackLog" is split with processPrefix(): element 0 is used
    as the date, element 2 must contain "pid <pid>" and element 3 must contain
    "client <address>". The text after "trackLog" is split on single spaces
    into: part number, db, hgsid and (optionally) the tracks string.

    verbose is accepted for interface compatibility and is not used here.

    Returns the tuple (date, pid, ip, logPart, db, hgsid, tracks) where logPart
    is an int and everything else is a string. A line that does not have the
    expected layout is written to stderr and the program exits with status 1.
    """
    splitLine = line.strip().split("trackLog")
    try:
        # the apache stuff
        prefixParts = processPrefix(splitLine[0].strip())
        date = prefixParts[0]
        pid = prefixParts[2].split("pid ")[1]
        ip = prefixParts[3].split("client ")[1]
        # our stuff
        suffixParts = splitLine[1].strip().split(' ')
        logPart = int(suffixParts[0])
        db = suffixParts[1]
        hgsid = suffixParts[2]
        tracks = suffixParts[3] if len(suffixParts) > 3 else ""
    except (IndexError, ValueError):
        # ValueError: the part number was not an integer; report it the same
        # way as a line with missing fields instead of dying with a traceback.
        sys.stderr.write("offending line:\n%s\n" % line)
        sys.exit(1)
    return date, pid, ip, logPart, db, hgsid, tracks


def trimLog(infileHandle, verbose):
    """Read every line of infileHandle and record the trackLog lines in useDict.

    infileHandle is any iterable of lines, either str or bytes. When verbose is
    true a summary of the line counts is written to stderr at the end.
    """
    lineCount = 0
    matchCount = 0
    for line in infileHandle:
        lineCount += 1
        if isinstance(line, (bytes, bytearray)):
            # Stray non-ASCII bytes are replaced rather than allowed to abort
            # the whole run with a UnicodeDecodeError.
            line = line.decode("ascii", errors="replace")
        if _TRACK_LOG_PATTERN.search(line):
            matchCount += 1
            date, pid, ip, logPart, db, hgsid, tracks = parseTrackLogLine(line, verbose)
            addOrMergeUsage(date, pid, ip, logPart, db, hgsid, tracks)
    if verbose:
        sys.stderr.write("Processed %d lines and found %d trackLog lines, trimmed to %d usages\n" % (
            lineCount, matchCount, len(useDict)))


def dumpTrimmed():
    """Print every usage in useDict to stdout as one tab-separated line."""
    for key, usages in useDict.items():
        # both key and value are '|'-joined, so swapping the separator yields
        # the tab-separated columns directly
        keyColumns = key.replace("|", "\t")
        for usage in usages:
            print("%s\t%s" % (keyColumns, usage.replace("|", "\t")))


def main():
    """Trim the log named on the command line and print the usages."""
    args = parseCommandLine()
    if args.infile == "stdin":
        # Read bytes when possible so stdin goes through the same decoding as
        # files; stdin is left open because this script does not own it.
        trimLog(getattr(sys.stdin, "buffer", sys.stdin), args.verbose)
    else:
        opener = gzip.open if args.infile.endswith(".gz") else open
        # 'with' closes the file even if a line fails to parse
        with opener(args.infile, "rb") as infh:
            trimLog(infh, args.verbose)
    dumpTrimmed()


if __name__ == "__main__":
    main()