a700c66c67b692c50b3d1e69eadc593c14b5fe1e max Fri Aug 19 02:53:47 2022 -0700 updating apache log parse, no redmine
diff --git src/utils/apacheLogParse src/utils/apacheLogParse
index 32b9988..8d8807b 100755
--- src/utils/apacheLogParse
+++ src/utils/apacheLogParse
@@ -260,44 +260,49 @@
         inOutFnames.append( (fileListFname, outFname) )
 
     logging.info("Found %d input files, assigned to %d output files " % (len(inFnames), len(inOutFnames)))
     return inOutFnames
 
 def inOutTab(outDir):
     " prep in/out file list for parasol log tab-sep reformatting, return list of tuples (inFname, outFname)"
     if not os.path.isdir(outDir):
         os.makedirs(outDir)
     fnames = []
     years = os.listdir(baseLogDir)
     years = [x for x in years if x.isdigit()]
+    years.sort()
+    logging.info("Processing years %s" % years)
     for year in years:
         yearDir = join(baseLogDir, year)
         for servName in servNames:
             servDir = join(yearDir, servName)
             inFnames = glob.glob(join(servDir, "access_log.*.gz"))
+            logging.info("Found %d infiles in %s" % (len(inFnames), servDir))
             for inFname in inFnames:
                 # access_log.20130317.gz
                 day =os.path.basename(inFname).split(".")[1]
-                outFname = "%s_%s.botIps.txt" % (day, servName) # this is also used a flag file. job writes most data to <day>_<server>_<realWeekDate>.tab.gz
+                outFname = "%s_%s.botIps.txt" % (day, servName) # this has data, but is also used as a flag file.
+                # the job writes most data to <day>_<server>_<realWeekDate>.tab.gz, not to this file
                 outFname = join(outDir, outFname)
                 fnames.append ( (inFname, outFname) )
+            logging.info("Found %d logfiles in %s" % (len(inFnames), yearDir))
 
-        print "Found %d logfiles in %s" % (len(fnames), yearDir)
+    logging.info("Found %d input/output file pairs" % len(fnames))
 
     return fnames
 
 def inOutCount(catDir, outDir):
     """ prepare (inFname, outFname) tuples for the count step
     outfiles go into one subDir per counting type, e.g. hgcClicks/2013-04-04.tab.gz"""
     if not os.path.isdir(outDir):
         os.makedirs(outDir)
     fnames = []
     inFnames = glob.glob(join(catDir, "*.tab.gz"))
     for inFname in inFnames:
         fBase =os.path.basename(inFname)
         outPath = join(outDir, "hgcClicks", fBase)
         fnames.append ( (inFname, outPath) )
@@ -734,31 +739,30 @@
     fileHandles = {} # cache the file handles
 
     # https://stackoverflow.com/a/27146123/233871
     lines = (line.replace('\0','') for line in ifh.read().splitlines())
 
     count = 0
     botIps = set()
     for row in csv.reader(lines, delimiter = " ", escapechar='\\'):
         # parse apache log line
         count += 1
         if count % 20000 == 0:
             print "parsed %d rows" % count
 
         log = parseApacheLine(row)
         if log==None:
-            #logging.info("parse error: %s" % row)
             continue
 
         weekDate = dateToWeek(log.time)
 
         # skip if it's a bot
         if isBot(log.agent):
             #botCounts[a.agent]+=1
             botIps.add(log.ip)
             logging.debug("%s is a bot" % log.agent)
             continue
 
         outFname = baseOut+"_"+weekDate+".tab.gz"
 
         if outFname in fileHandles:
             ofh = fileHandles[outFname]
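
The first hunk is about inOutTab(): it pairs each access_log.<day>.gz file found under <baseLogDir>/<year>/<server>/ with a <day>_<server>.botIps.txt flag/output file in outDir. The snippet below is a minimal sketch of that mapping only; baseLogDir, servNames and outDir are placeholder values here (those globals live elsewhere in apacheLogParse and are not part of this diff):

from os.path import join, basename

baseLogDir = "/data/apachelogs"          # assumption, not from the diff
servNames = ["hgw1", "hgw2"]             # assumption, not from the diff
outDir = "/tmp/apacheLogParse/tab"

servName = servNames[0]
inFname = join(baseLogDir, "2013", servName, "access_log.20130317.gz")
day = basename(inFname).split(".")[1]    # -> "20130317"
# flag/summary file name as built in inOutTab(); per the new comment, the bulk
# of the data goes to <day>_<server>_<realWeekDate>.tab.gz, not to this file
outFname = join(outDir, "%s_%s.botIps.txt" % (day, servName))
print(outFname)                          # .../20130317_hgw1.botIps.txt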
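
The second hunk touches the per-line parsing loop: NUL bytes are stripped first (see the linked Stack Overflow answer), then csv.reader splits the combined-format line on spaces while keeping the quoted request/referer/agent fields as single tokens. The sketch below only demonstrates that splitting behaviour; the sample log line and field indices are illustrative, since parseApacheLine() itself is not shown in this diff:

import csv

sample = '123.45.67.89 - - [17/Mar/2013:06:25:17 -0700] "GET /cgi-bin/hgTracks?db=hg19 HTTP/1.1" 200 512 "-" "Mozilla/5.0 (compatible; examplebot/1.0)"'

# remove NUL bytes first, as the script does, so csv.reader does not choke on them
lines = (line.replace('\0', '') for line in [sample])

for row in csv.reader(lines, delimiter=" ", escapechar="\\"):
    ip = row[0]
    # the bracketed timestamp is split into two tokens by the space inside [...]
    timeStr = row[3].lstrip("[") + " " + row[4].rstrip("]")
    request = row[5]    # quoted "GET ... HTTP/1.1" comes back as one token
    agent = row[-1]     # user-agent string, also one token
    print("%s | %s | %s | %s" % (ip, timeStr, request, agent))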