8796e12036760665e5e7ae83ad30cbbb48d302c9
max
  Thu Aug 18 11:16:56 2022 -0700
updating apacheLogParse, no redmine

diff --git src/utils/apacheLogParse src/utils/apacheLogParse
index 09a6f74..32b9988 100755
--- src/utils/apacheLogParse
+++ src/utils/apacheLogParse
@@ -8,33 +8,30 @@
 from os.path import basename, join, abspath, isfile, dirname, isdir
 from os import listdir
 import optparse, logging, sys, string
 from itertools import imap
 from operator import itemgetter
 import heapq
 
 #TEMPDIR = "/dev/shm"
 
 # filename to delete on exit
 removeTmpName = None
 
 # where do we store the raw apache logfiles
 baseLogDir = "/hive/data/inside/wwwstats/RR"
 
-# years to analyze
-years = ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017", "2018", "2019", "2020"]
-
 # list of servers to analyze
 # change these for debugging or if need to focus on single servers
 servNames = ["hgw1", "hgw2", "hgw3", "hgw4", "hgw5", "hgw6", "hgw7", "hgw8"]
 
 # directory for cluster job output
 jobOutDir = "/hive/data/inside/wwwstats/apacheLogParse"
 
 # directory to store csv files for C3 plot
 htmlDir = "/cluster/home/max/public_html/logParse/plot"
 
 # a struct for a cleaned apache log line
 apacheFields = ['ip', 'time', 'status', "reqType", 'speed', "filePath", "cgiParams", "agent", "referer"]
 ApacheLine = namedtuple("ApacheLine", apacheFields)
 
 # when writing the tab file, we need to replace some characters from the apache log file
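
For orientation, a minimal sketch (Python 2, matching the script) of how the ApacheLine struct above carries one cleaned log line; all field values here are invented:

    from collections import namedtuple

    apacheFields = ['ip', 'time', 'status', "reqType", 'speed', "filePath", "cgiParams", "agent", "referer"]
    ApacheLine = namedtuple("ApacheLine", apacheFields)

    # a hypothetical, already-cleaned log line
    a = ApacheLine("128.114.0.1", "2013-03-17 10:00:00", "200", "GET", "1200",
                   "/cgi-bin/hgTracks", "hgsid=1233|db=hg19", "Mozilla/5.0", "-")
    print a.ip, a.filePath  # namedtuple fields are accessed by name
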
@@ -106,44 +103,49 @@
             speed = fields[10]
         else:
             speed = "-1" # there was a time back in 2003 when we didn't have the speed field in the logs
     except IndexError:
         logging.warn("index error %s" % fields)
         return None
 
     if len(req)>10000:
         logging.warn("HTTP request from %s with more than 10000 chars: %s" % (agent, "".join(req[:200])))
         return None
 
     # parse http request: typically something like
     # GET cgi-bin/hgTracks?hgsid=xxxx&xxx HTTP1.1
     reqRest = req.split(" ", 1)
     if len(reqRest)!=2:
-        logging.warn("no space in request field %s" % fields)
+        logging.warn("not two fields: %s" % fields)
         return None
 
     reqType, rest = reqRest
     if not rest.endswith("HTTP/1.1") and not rest.endswith("HTTP/1.0"):
         logging.warn("request does not end with HTTP/1.1 %s, GET params probably too long" % fields)
         return None
     
     reqUrl = rest.replace(" HTTP/1.1", "")
     reqUrl = reqUrl.replace(" HTTP/1.0", "")
     #reqType, reqUrl, httpVer = reqFields
 
     # split into cgi as a string and the params as a dict, e.g. "cgi-bin/hgXXX" and {'hgsid':'1233'}
+    try:
-    filePath, paramStr = urlparse.urlsplit(reqUrl)[2:4]
+        filePath, paramStr = urlparse.urlsplit(reqUrl)[2:4]
+    except ValueError:
+        logging.error("Invalid reqUrl %s" % reqUrl)
+        return None
+
     cgi = basename(filePath)
     params = urlparse.parse_qs(paramStr)
     paramList = []
     for key, val in params.iteritems():
         val = val[0]
         # get rid of the = and | characters in these strings
         key = key.translate(keyValReplTable)
         val = val.translate(keyValReplTable)
         kvStr = "%s=%s" % (key, val)
         if kvStr=="":
             continue
         paramList.append(kvStr)
     paramStr = "|".join(paramList)
 
     # we want to put this into a tab-sep file, so remove these chars
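
For reference, a minimal sketch of the urlsplit/parse_qs decomposition performed above (Python 2; the request URL is made up):

    import urlparse
    from os.path import basename

    reqUrl = "/cgi-bin/hgTracks?hgsid=1233&db=hg19"
    # urlsplit returns (scheme, netloc, path, query, fragment); [2:4] keeps path and query
    filePath, paramStr = urlparse.urlsplit(reqUrl)[2:4]
    cgi = basename(filePath)              # "hgTracks"
    params = urlparse.parse_qs(paramStr)  # {'hgsid': ['1233'], 'db': ['hg19']}
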
@@ -255,30 +257,35 @@
         ofh.close()
 
         outFname = join(outDir, date+".tab.gz")
         inOutFnames.append( (fileListFname, outFname) )
 
     logging.info("Found %d input files, assigned to %d output files " % (len(inFnames), len(inOutFnames)))
     return inOutFnames
         
 
 def inOutTab(outDir):
     " prep in/out file list for parasol log tab-sep reformatting, return list of tuples  (inFname, outFname)"
     if not os.path.isdir(outDir):
         os.makedirs(outDir)
 
     fnames = []
+
+    # determine the years to analyze by listing the subdirectories of baseLogDir
+    years = os.listdir(baseLogDir)
+    years = [x for x in years if x.isdigit()]
+
     for year in years:
         yearDir = join(baseLogDir, year)
         for servName in servNames:
             servDir = join(yearDir, servName)
             inFnames = glob.glob(join(servDir, "access_log.*.gz"))
             for inFname in inFnames:
                 # access_log.20130317.gz
                 day = os.path.basename(inFname).split(".")[1]
                 outFname = "%s_%s.botIps.txt" % (day, servName) # this is also used as a flag file; the job writes most data to <day>_<server>_<realWeekDate>.tab.gz
                 outFname = join(outDir, outFname)
                 fnames.append ( (inFname, outFname) )
 
     print "Found %d logfiles in %s" % (len(fnames), yearDir)
     return fnames
 
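
To illustrate the naming scheme that inOutTab builds above, assuming a logfile in the layout described (the path and output directory here are hypothetical):

    import os
    from os.path import join

    inFname = "/hive/data/inside/wwwstats/RR/2013/hgw4/access_log.20130317.gz"
    day = os.path.basename(inFname).split(".")[1]  # "20130317"
    outFname = join("/tmp/out", "%s_%s.botIps.txt" % (day, "hgw4"))
    # -> "/tmp/out/20130317_hgw4.botIps.txt"
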
@@ -323,32 +329,32 @@
         runReduce(jobOutDir, True)
         sys.exit(0)
 
     # create list of cmd lines to run
     jobLines = []
     skipCount = 0
     for inFname, outFname in fnames:
         if not OVERWRITE and isfile(outFname):
             logging.debug("Skipping parasol job, output file %s already exists" % outFname)
             skipCount += 1
             continue
         outPath = join(jobOutDir, outFname)
         cmdLine = "%s %s %s %s {check out exists %s}\n" % \
                 (sys.executable, abspath(__file__), job, inFname, outPath)
         jobLines.append(cmdLine)
-    logging.info("%d logfiles, skipped %d (already converted), %d files to convert" % \
-        (len(fnames), skipCount, len(jobLines)))
+    logging.info("%d logfiles, skipped %d (already converted), %d files to convert, outdir %s" % \
+        (len(fnames), skipCount, len(jobLines), jobOutDir))
 
     if len(jobLines)==0:
         logging.info("No new logfiles to convert")
         return
 
     # create joblist file
     jlName = join(jobOutDir, "jobList")
     jlf = open(jlName, "w")
     for cmdLine in jobLines:
         jlf.write(cmdLine)
     jlf.close()
 
     # submitting joblist
     print("Running jobs in dir %s" % jobOutDir)
     cmd = "ssh %s 'cd %s; para freeBatch; para resetCounts; para clearSickNodes; para make jobList'" % \
@@ -715,33 +721,36 @@
 
         ofh.close()
         logging.info("Wrote %s" % outName)
 
 def apacheToTab(inFname, outBotIpsFname):
     " parse apache log file to tab file. outBotIpsFname is something like 20130420_hgw4.botIps, write output to 20130420_hgw4_<realDate>.tab.gz "
     logging.info("Parsing %s" % inFname)
     if inFname.endswith(".gz"):
         ifh = gzip.open(inFname)
     else:
         ifh = open(inFname)
 
     baseOut = outBotIpsFname.replace(".botIps.txt","")
     fileHandles = {} # cache the file handles
 
+    # strip NUL bytes, which crash csv.reader, see https://stackoverflow.com/a/27146123/233871
+    lines = (line.replace('\0','') for line in ifh.read().splitlines())
+
     count = 0
     botIps = set()
-    for row in csv.reader(ifh, delimiter = " ", escapechar='\\'):
+    for row in csv.reader(lines, delimiter = " ", escapechar='\\'):
         # parse apache log line
         count += 1
         if count % 20000 == 0:
             print "parsed %d rows" % count
 
         log = parseApacheLine(row)
         if log==None:
             #logging.info("parse error: %s" % row)
             continue
 
         weekDate = dateToWeek(log.time)
 
         # skip if it's a bot
         if isBot(log.agent):
             #botCounts[a.agent]+=1