8796e12036760665e5e7ae83ad30cbbb48d302c9 max Thu Aug 18 11:16:56 2022 -0700 updating apacheLogParse, no redmine

diff --git src/utils/apacheLogParse src/utils/apacheLogParse
index 09a6f74..32b9988 100755
--- src/utils/apacheLogParse
+++ src/utils/apacheLogParse
@@ -8,33 +8,30 @@
 from os.path import basename, join, abspath, isfile, dirname, isdir
 from os import listdir
 import optparse, logging, sys, string
 from itertools import imap
 from operator import itemgetter
 import heapq
 
 #TEMPDIR = "/dev/shm"
 
 # filename to delete on exit
 removeTmpName = None
 
 # where do we store the raw apache logfiles
 baseLogDir = "/hive/data/inside/wwwstats/RR"
 
-# years to analyze
-years = ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020"]
-
 # list of servers to analyze
 # change these for debugging or if need to focus on single servers
 servNames = ["hgw1", "hgw2", "hgw3", "hgw4", "hgw5", "hgw6", "hgw7", "hgw8"]
 
 # directory for cluster job output
 jobOutDir = "/hive/data/inside/wwwstats/apacheLogParse"
 
 # directory to store csv files for C3 plot
 htmlDir = "/cluster/home/max/public_html/logParse/plot"
 
 # a struct for a cleaned apache log line
 apacheFields = ['ip', 'time', 'status', "reqType", 'speed', "filePath", "cgiParams", "agent", "referer"]
 ApacheLine = namedtuple("ApacheLine", apacheFields)
 
 # when writing the tab file, we need to replace some characters from the apache log file
@@ -106,44 +103,49 @@
             speed = fields[10]
         else:
             speed = "-1" # there was a time back in 2003 when we didn't have the speed field in the logs
     except IndexError:
         logging.warn("index error %s" % fields)
         return None
 
     if len(req)>10000:
         logging.warn("HTTP request from %s with more than 10000 chars: %s" % (agent, "".join(req[:200])))
         return None
 
     # parse http request: typically something like
     # GET cgi-bin/hgTracks?hgsid=xxxx&xxx HTTP1.1
     reqRest = req.split(" ", 1)
     if len(reqRest)!=2:
-        logging.warn("no space in request field %s" % fields)
+        logging.warn("not two fields: %s" % fields)
         return None
 
     reqType, rest = reqRest
     if not rest.endswith("HTTP/1.1") and not rest.endswith("HTTP/1.0"):
         logging.warn("request does not end with HTTP/1.1 %s, GET params probably too long" % fields)
         return None
 
     reqUrl = rest.replace(" HTTP/1.1", "")
     reqUrl = reqUrl.replace(" HTTP/1.0", "")
     #reqType, reqUrl, httpVer = reqFields
     # split into cgi as a string and the params as a dict, e.g. "cgi-bin/hgXXX" and {'hgsid':'1233'}
-    filePath, paramStr = urlparse.urlsplit(reqUrl)[2:4]
+    try:
+        filePath, paramStr = urlparse.urlsplit(reqUrl)[2:4]
+    except ValueError:
+        logging.error("Invalid reqUrl %s" % reqUrl)
+        return None
+
     cgi = basename(filePath)
     params = urlparse.parse_qs(paramStr)
 
     paramList = []
     for key, val in params.iteritems():
         val = val[0]
         # get rid of the = and | characters in these strings
         key = key.translate(keyValReplTable)
         val = val.translate(keyValReplTable)
         kvStr = "%s=%s" % (key, val)
         if kvStr=="":
             continue
         paramList.append(kvStr)
     paramStr = "|".join(paramList)
 
     # we want to put this into a tab-sep file, so remove these chars
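Note on the hunk above: in Python 2, urlparse.urlsplit() raises ValueError ("Invalid IPv6 URL") when the host part of a URL contains a '[' without a matching ']', which scanners and fuzzers do send; before this change, one such request line aborted the whole parse job, now it is logged and skipped. A minimal sketch of the failure mode, using a made-up request URL:

    # Python 2 sketch; the URL is hypothetical, chosen only to trigger the error
    import urlparse
    try:
        filePath, paramStr = urlparse.urlsplit("http://[bad-host/cgi-bin/hgTracks?db=hg19")[2:4]
    except ValueError as e:
        print e   # prints: Invalid IPv6 URL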
"cgi-bin/hgXXX" and {'hgsid':'1233'} + try: filePath, paramStr = urlparse.urlsplit(reqUrl)[2:4] + except ValueError: + logging.error("Invalid reqUrl %s" % reqUrl) + return None + cgi = basename(filePath) params = urlparse.parse_qs(paramStr) paramList = [] for key, val in params.iteritems(): val = val[0] # get rid of the = and | characters in these strings key = key.translate(keyValReplTable) val = val.translate(keyValReplTable) kvStr = "%s=%s" % (key, val) if kvStr=="": continue paramList.append(kvStr) paramStr = "|".join(paramList) # we want to put this into a tab-sep file, so remove these chars @@ -255,30 +257,34 @@ ofh.close() outFname = join(outDir, date+".tab.gz") inOutFnames.append( (fileListFname, outFname) ) logging.info("Found %d input files, assigned to %d output files " % (len(inFnames), len(inOutFnames))) return inOutFnames def inOutTab(outDir): " prep in/out file list for parasol log tab-sep reformatting, return list of tuples (inFname, outFname)" if not os.path.isdir(outDir): os.makedirs(outDir) fnames = [] + + years = os.listdir(baseLogDir) + years = [x for x in years if x.isdigit()] + for year in years: yearDir = join(baseLogDir, year) for servName in servNames: servDir = join(yearDir, servName) inFnames = glob.glob(join(servDir, "access_log.*.gz")) for inFname in inFnames: # access_log.20130317.gz day =os.path.basename(inFname).split(".")[1] outFname = "%s_%s.botIps.txt" % (day, servName) # this is also used a flag file. job writes most data to <day>_<server>_<realWeekDate>.tab.gz outFname = join(outDir, outFname) fnames.append ( (inFname, outFname) ) print "Found %d logfiles in %s" % (len(fnames), yearDir) return fnames @@ -323,32 +329,32 @@ runReduce(jobOutDir, True) sys.exit(0) # create list of cmd lines to run jobLines = [] skipCount = 0 for inFname, outFname in fnames: if not OVERWRITE and isfile(outFname): logging.debug("Skipping parasol job, output file %s already exists" % outFname) skipCount += 1 continue outPath = join(jobOutDir, outFname) cmdLine = "%s %s %s %s {check out exists %s}\n" % \ (sys.executable, abspath(__file__), job, inFname, outPath) jobLines.append(cmdLine) - logging.info("%d logfiles, skipped %d (already converted), %d files to convert" % \ - (len(fnames), skipCount, len(jobLines))) + logging.info("%d logfiles, skipped %d (already converted), %d files to convert, outdir %s" % \ + (len(fnames), skipCount, len(jobLines), jobOutDir)) if len(jobLines)==0: logging.info("No new logfiles to convert") return # create joblist file jlName = join(jobOutDir, "jobList") jlf = open(jlName, "w") for cmdLine in jobLines: jlf.write(cmdLine) jlf.close() # submitting joblist print("Running jobs in dir %s" % jobOutDir) cmd = "ssh %s 'cd %s; para freeBatch; para resetCounts; para clearSickNodes; para make jobList'" % \ @@ -715,33 +721,36 @@ ofh.close() logging.info("Wrote %s" % outName) def apacheToTab(inFname, outBotIpsFname): " parse apache log file to tab file. 
@@ -715,33 +721,36 @@
     ofh.close()
     logging.info("Wrote %s" % outName)
 
 def apacheToTab(inFname, outBotIpsFname):
     " parse apache log file to tab file. outBotIpsFname is something like 20130420_hgw4.botIps, write output to 20130420_hgw4_<realDate>.tab.gz "
     logging.info("Parsing %s" % inFname)
     if inFname.endswith(".gz"):
         ifh = gzip.open(inFname)
     else:
         ifh = open(inFname)
 
     baseOut = outBotIpsFname.replace(".botIps.txt","")
 
     fileHandles = {} # cache the file handles
 
+    # https://stackoverflow.com/a/27146123/233871
+    lines = (line.replace('\0','') for line in ifh.read().splitlines())
+
     count = 0
     botIps = set()
-    for row in csv.reader(ifh, delimiter = " ", escapechar='\\'):
+    for row in csv.reader(lines, delimiter = " ", escapechar='\\'):
         # parse apache log line
         count += 1
         if count % 20000 == 0:
             print "parsed %d rows" % count
 
         log = parseApacheLine(row)
         if log==None:
             #logging.info("parse error: %s" % row)
             continue
 
         weekDate = dateToWeek(log.time)
 
         # skip if it's a bot
         if isBot(log.agent):
             #botCounts[a.agent]+=1
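Note on the apacheToTab() hunk: Python 2's csv module raises _csv.Error("line contains NULL byte") as soon as a line contains a NUL character, which does show up in log files truncated by a server crash; the generator strips NULs before the reader ever sees them (the technique from the linked stackoverflow answer). The trade-off is that ifh.read() pulls one day's uncompressed log into memory at once. A standalone sketch of the pattern, with a placeholder file name:

    # Python 2 sketch; "access_log.20130317.gz" is a hypothetical input file
    import csv, gzip
    ifh = gzip.open("access_log.20130317.gz")
    # strip NUL bytes so csv.reader does not die with "line contains NULL byte"
    lines = (line.replace('\0', '') for line in ifh.read().splitlines())
    for row in csv.reader(lines, delimiter=" ", escapechar="\\"):
        pass   # each row is one space-separated, backslash-escaped apache log line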