b22d8bfdf6dd0e96f0aab149fa505dfa4392ee08 max Wed Mar 29 17:38:53 2017 -0700 writing out bot IP addresses for Matt's new log analysis scripts, no redmine

diff --git src/utils/apacheLogParse src/utils/apacheLogParse
index e72c278..8260e17 100755
--- src/utils/apacheLogParse
+++ src/utils/apacheLogParse
@@ -9,31 +9,31 @@
 from os import listdir
 import optparse, logging, sys, string
 from itertools import imap
 from operator import itemgetter
 import heapq
 
 #TEMPDIR = "/dev/shm"
 
 # filename to delete on exit
 removeTmpName = None
 
 # where do we store the raw apache logfiles
 baseLogDir = "/hive/data/inside/wwwstats/RR"
 
 # years to analyze
-years = ["2009", "2010", "2011", "2012", "2013", "2014"]
+years = ["2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]
 
 # list of servers to analyze
 # change these for debugging or if need to focus on single servers
 servNames = ["hgw1", "hgw2", "hgw3", "hgw4", "hgw5", "hgw6", "hgw7", "hgw8"]
 
 # directory for cluster job output
 jobOutDir = "/hive/data/inside/wwwstats/apacheLogParse"
 
 # directory to store csv files for C3 plot
 htmlDir = "/cluster/home/max/public_html/logParse/plot"
 
 # a struct for a cleaned apache log line
 apacheFields = ['ip', 'time', 'status', "reqType", 'speed', "filePath", "cgiParams", "agent", "referer"]
 ApacheLine = namedtuple("ApacheLine", apacheFields)
@@ -150,34 +150,37 @@
     filePath = filePath.translate ( tabSepReplTable)
     agent = agent.translate ( tabSepReplTable)
     referer = referer.translate ( tabSepReplTable)
 
     # put everything into a struct
     a = ApacheLine(ip, timeStr, status, reqType, speed, filePath, paramStr, agent, referer)
     return a
 
 def printBest(ofh, counts, topX=100000):
     " print top 10000 names and their counts "
     #ofh.write("- %s\n" % title)
     for name, count in counts:
         ofh.write("%s\t%s\n" % (name, count))
 
 def isBot(agent):
+    " you can find most of these by looking for accesses to robots.txt "
     a = agent.lower()
     if "google." in a or "yahoo." in a or "bing." in a or "baidu." in a \
        or "bot" in a or "spider" in a or "slurp" in a or "crawler" in a or \
        "Commons-HttpClient" in a or "HTTPGrab" in a or "internal dummy" in a or \
+       "daum" in a or "ltx71" in a or "python-request" in a or \
+       "scout" in a or "riddler" in a or "wget" in a or \
        a.startswith("google") or a.startswith("yahoo") or a.startswith("bing"):
         return True
     else:
         return False
 
 def paramStrToDict(inStr):
     " convert string like hgsid=12343|key=val|key2=val2 to dictionary and return "
     d = {}
     if inStr=="":
         return d
     parts = inStr.split("|")
     for p in parts:
         k, v = p.split("=")
         d[k] = v
@@ -260,31 +263,31 @@
 def inOutTab(outDir):
     " prep in/out file list for parasol log tab-sep reformatting, return list of tuples (inFname, outFname)"
     if not os.path.isdir(outDir):
         os.makedirs(outDir)
 
     fnames = []
     for year in years:
         yearDir = join(baseLogDir, year)
         for servName in servNames:
             servDir = join(yearDir, servName)
             inFnames = glob.glob(join(servDir, "access_log.*.gz"))
             for inFname in inFnames:
                 # access_log.20130317.gz
                 day =os.path.basename(inFname).split(".")[1]
-                outFname = "%s_%s.flag" % (day, servName) # this is just an empty flag file, job writes to <day>_<server>_<week>.tab.gz
+                outFname = "%s_%s.botIps.txt" % (day, servName) # this is also used as a flag file; the job writes most data to <day>_<server>_<week>.tab.gz
                 outFname = join(outDir, outFname)
                 fnames.append ( (inFname, outFname) )
 
     print "Found %d logfiles in %s" % (len(fnames), yearDir)
     return fnames
 
 def inOutCount(catDir, outDir):
     """ prepare (inFname, outFname) tuples for the count step
     outfiles go into one subDir per counting type, e.g.
     hgcClicks/2013-04-04.tab.gz"""
     if not os.path.isdir(outDir):
         os.makedirs(outDir)
 
     fnames = []
     inFnames = glob.glob(join(catDir, "*.tab.gz"))
     for inFname in inFnames:
@@ -701,75 +704,79 @@
     ofh.write(",".join(allTracks))
     ofh.write("\n")
 
     for date in dates:
         dateTracks = dateTrackCounts[date]
         row = [date]
         for track in allTracks:
             count = dateTracks.get(track, 0)
             row.append(str(count))
         ofh.write(",".join(row))
         ofh.write("\n")
 
     ofh.close()
     logging.info("Wrote %s" % outName)
 
-def apacheToTab(inFname, outFlagFname):
-    " parse apache log file to tab file. outFlagFname is something like 20130420_hgw4.flag, write output to 20130420_hgw4_<week>.tab.gz "
+def apacheToTab(inFname, outBotIpsFname):
+    " parse apache log file to tab file. outBotIpsFname is something like 20130420_hgw4.botIps.txt, write output to 20130420_hgw4_<week>.tab.gz "
     logging.info("Parsing %s" % inFname)
     if inFname.endswith(".gz"):
         ifh = gzip.open(inFname)
     else:
         ifh = open(inFname)
 
-    baseOut = outFlagFname.replace(".flag","")
+    baseOut = outBotIpsFname.replace(".botIps.txt","")
 
     fileHandles = {} # cache the file handles
 
     count = 0
+    botIps = set()
     for row in csv.reader(ifh, delimiter = " ", escapechar='\\'):
         # parse apache log line
         count += 1
         if count % 20000 == 0:
             print "parsed %d rows" % count
 
         log = parseApacheLine(row)
         if log==None:
             #logging.info("parse error: %s" % row)
             continue
 
         weekDate = dateToWeek(log.time)
 
         # skip if it's a bot
         if isBot(log.agent):
             #botCounts[a.agent]+=1
+            botIps.add(log.ip)
             logging.debug("%s is a bot" % log.agent)
             continue
 
         outFname = baseOut+"_"+weekDate+".tab.gz"
         if outFname in fileHandles:
             ofh = fileHandles[outFname]
         else:
             ofh = gzip.open(outFname, "w")
             ofh.write("#"+"\t".join(apacheFields)+"\n")
             fileHandles[outFname] = ofh
 
         ofh.write("\t".join(log))
         ofh.write("\n")
 
-
-    open(outFlagFname, "w") # just create an empty flag file
+    ofh = open(outBotIpsFname, "w")
+    for ip in botIps:
+        ofh.write("%s\n" % ip)
+    ofh.close()
 
 def catFiles(inFnames, outFname):
     " cat all inFnames to outFname, taking care of header lines "
     ofh = gzip.open(outFname, "w")
     headerWritten = False
     for inFname in inFnames:
         ifh = gzip.open(inFname)
         headerLine = ifh.readline()
         if not headerWritten:
             ofh.write(headerLine)
             headerWritten=True
         ofh.write(ifh.read())
 
 def countJob(inFname, outFnameParasol):
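
Note: the new patterns in isBot() are substring tests against the lowercased agent string, so mixed-case user agents like "Wget/1.14" or "Daum/4.1" are caught as well. A minimal sanity check; the sample agent strings below are made up for illustration, not taken from real logs:

    # quick sanity check for isBot(); sample agents are illustrative only
    tests = [
        ("Wget/1.14 (linux-gnu)", True),                          # caught via lowercased "wget"
        ("Daum/4.1; http://cs.daum.net", True),                   # caught via lowercased "daum"
        ("Mozilla/5.0 (compatible; Googlebot/2.1)", True),        # caught via "bot"
        ("python-requests/2.4.3", True),                          # caught via "python-request"
        ("Mozilla/5.0 (X11; Linux x86_64) Firefox/45.0", False),  # a real browser passes through
    ]
    for agent, expected in tests:
        assert isBot(agent)==expected, agent
    print "isBot sanity check OK"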
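
The .botIps.txt files are plain one-IP-per-line lists, one per day and server, so a downstream consumer only has to union them. A rough sketch of how they could be merged into a single deduplicated list; Matt's actual scripts are not part of this commit, and the input directory and output filename here are assumptions:

    import glob
    from os.path import join

    def mergeBotIps(parseDir, outFname):
        " union all per-day, per-server .botIps.txt files into one sorted, deduplicated list "
        ips = set()
        for fname in glob.glob(join(parseDir, "*.botIps.txt")):
            for line in open(fname):
                line = line.strip()
                if line!="":
                    ips.add(line)
        ofh = open(outFname, "w")
        for ip in sorted(ips):
            ofh.write("%s\n" % ip)
        ofh.close()
        print "wrote %d unique bot IPs to %s" % (len(ips), outFname)

    # hypothetical paths: a "tab" step output dir under jobOutDir, merged list in cwd
    mergeBotIps(join(jobOutDir, "tab"), "allBotIps.txt")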