src/utils/qa/catchBot.csh 1.19

1.19 2009/04/11 00:37:01 rhead
Sourced new qaConfig file at the top. Changed -h hgwbeta lines to look for sql host stored in a variable, specified in the new qaConfig file.
Index: src/utils/qa/catchBot.csh
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/utils/qa/catchBot.csh,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 1000000 -r1.18 -r1.19
--- src/utils/qa/catchBot.csh	21 Apr 2008 22:46:04 -0000	1.18
+++ src/utils/qa/catchBot.csh	11 Apr 2009 00:37:01 -0000	1.19
@@ -1,169 +1,170 @@
 #!/bin/tcsh
+source `which qaConfig.csh`
 
 ####################
 #  07-09-04 Bob Kuhn
 #
 #  Script to catch bots.
 #
 ####################
 
 # Defaults and scratch-file names used by the rest of the script.
 # NOTE: threshhold is always replaced by the first command-line argument
 # (the script prints its usage and exits when none is given), so the
 # value assigned here is never the one that gets used.
 set threshhold=10000  # the number of hits from a user to use as the alarm level
 set size=5            # the number of users to check
 # db is not referenced anywhere else in this script
 set db=""
 # name of the scratch report file; it is created in the current
 # directory and removed again at the end of the script
 set output="xxResultsxx"
 # alarm level for the number of hits from one user within the last hour
 set hourLimit = 1000
 
 # Argument handling: exactly one or two arguments are accepted.
 # Any other count prints the usage text and stops the script.
 if ($#argv != 1 && $#argv != 2) then
   echo
   echo "  script to catch bots."
   echo "  monitors the IP addresses of most common users."
   echo "    and squawks when someone gets too high."
   echo "  also checks the last hour."
   echo
   echo "    usage:  threshhold [report size] "
   echo "      where threshhold is number of hits from single source"
   echo "      report size is number of highest hitters listed (defaults to 5)"
   echo
   exit
 endif
 
 # first argument: alarm level for the total hits from one source
 set threshhold=$argv[1]
 
 # optional second argument: number of top hitters to list.
 # The block form of if is needed here: a one-line if would expand
 # $argv[2] even when only one argument was given.
 if ($#argv == 2) then
   set size=$argv[2]
 endif
 
 # set debug to true to print the intermediate values computed below
 # and inside the report loop
 set debug="false"
 # get some times
 # lastHit / firstHit: newest and oldest time_stamp in access_log
 #   (database apachelog on host genome-log).  The debug output below
 #   labels them as seconds.
 # NOTE(review): -N presumably suppresses the column-name header, as the
 #   same flag does in the mysql client - confirm against hgsql.
 set lastHit=`hgsql -N -h genome-log -e 'SELECT MAX(time_stamp) FROM access_log' apachelog`
 set firstHit=`hgsql -N -h genome-log -e 'SELECT MIN(time_stamp) FROM access_log' apachelog`
 # hitSpan: time covered by the log, in hours with one decimal
 set hitSpan=`echo $lastHit $firstHit | gawk '{printf "%.1f", ($1 - $2) / 3600}'`
 # hourAgo: 3600 seconds before the newest logged hit (measured from the
 # log itself, not from the current clock time)
 set hourAgo=`echo $lastHit | gawk '{printf "%.0f", $1 - 3600}'`
 
 if ($debug == "true") then
   echo "lastHit  = $lastHit (seconds)"
   echo "firstHit = $firstHit (seconds)"
   echo "hitSpan  = $hitSpan (hours)"
   echo "hourAgo  = $hourAgo (sec) (number of epochseconds an hour ago)"
 endif
 
 # Dump the remote_host value of every access_log row into a scratch
 # file: one line per hit.
 hgsql -N -h genome-log -e "SELECT remote_host FROM access_log" apachelog > remote_host
 
 # one line per hit, so the line count is the total number of hits
 set totalHits=`wc -l < remote_host`
 
 # tally the hits per host, busiest host first (lines of: count host)
 sort remote_host | uniq -c | sort -nr > xxUserCountxx
 
 # state for the report loop that follows:
 #   checked = number of xxUserCountxx rows already reported
 #   max     = largest hit count seen among those rows
 set checked=0
 set max=0
 
 # write header to file
 # Start from a clean file; every later write to the report appends.
 rm -f $output
 # NOTE(review): the text below says apachelogs, while every query in this
 # script uses the database apachelog - confirm which spelling is intended.
 # NOTE(review): the leading \n presumably prints as a blank line; that
 # depends on how echo is configured in tcsh - confirm.
 echo "\nfrom apachelogs.access_log covering the last $hitSpan hours" >> $output
 echo "\nUsage limit: 1 hit every 15 seconds & no more than 5,000 hits per day" >> $output
 echo "\ntotal hits: $totalHits" >> $output
 echo "users with the most hits:" >> $output
 # column titles; they line up with the widths used when the data rows
 # are printed in the report loop
 echo "                                                          user     per    per" >> $output
 echo "   hits                                   remote_host    hours    hour    min" >> $output
 echo "-----------------------------------------------------------------------------" >> $output
 
 # Build one report row for each of the $size busiest hosts.
 # xxUserCountxx holds lines of the form: count host, highest count first.
 # $max ends up as the largest hit count seen; it feeds the alarm test
 # at the end of the script.
 while ($checked < $size)
 
   # fetch row number checked+1 of the tally
   @ row = $checked + 1
   set line=`sed -n "${row}p" xxUserCountxx`
 
   # Fewer distinct hosts than requested rows: stop here.  An empty count
   # would turn the numeric tests below into an expression error.
   if ($#line == 0) break
 
   # get timeSpan stats
   #   num       = number of hits from this host
   #   timeSpan  = seconds between its first and its last logged hit
   #   timeHours = the same span in hours, one decimal (for display)
   set num=`echo $line | gawk '{print $1}'`
   set host=`echo $line |gawk '{print $2}'`
   set timeSpan=`hgsql -N -h genome-log -e 'SELECT MAX(time_stamp) - MIN(time_stamp) \
       AS timeSpan FROM access_log WHERE remote_host = "'$host'"' apachelog`
   set timeHours=`echo $timeSpan | gawk '{printf  "%.1f", $1/3600}'`
 
   # Hourly rate, computed from the unrounded span in seconds.
   # Dividing by the rounded timeHours failed for bursts shorter than
   # about three minutes: timeHours is then 0.0, gawk stops on a division
   # by zero and the report row loses a column.  A span that is zero or
   # not a number gives a rate of 0; the + 0 forces a numeric comparison.
   set hitsPerHr=`echo $num $timeSpan | gawk '{if ($2 + 0 > 0) printf "%.0f", $1 * 3600 / $2; else printf "0"}'`
   set hitsPerMin=`echo $hitsPerHr | gawk '{printf  "%.1f", $1/60}'`
 
   if ($debug == "true") then
     echo
     echo "host = $host"
     echo "hits = $num"
     echo "timeSpan = $timeSpan"
     echo "timeHours = $timeHours"
     echo "hitsPerHr = $hitsPerHr"
     echo "hitsPerMin = $hitsPerMin"
   endif
 
   # remember the highest hit count for the final alarm test
   if ($num > $max) then
     set max=$num
   endif
 
   # write a line in output file for this record
   # s is string, d is decimal
   echo "$num $host $timeHours $hitsPerHr $hitsPerMin " \
        | gawk '{printf("%7s %45s %8s %7s %6s\n", \
        $1, $2, $3, $4, $5)}' >> $output
   @ checked ++
 end
 echo >> $output
 
 # get IPs of largest hitter and fastest hitter
 # Both are found by sorting the report written so far: field 1 is the
 # hit count and field 5 the per-minute rate.  Header text counts as 0 in
 # a numeric sort, so a data row with a positive value comes out first.
 # Field 5 is selected with -k; the older +4 spelling is no longer
 # accepted by current versions of sort, which read it as a file name.
 set largest=`sort -nr $output | head -1 | gawk '{print $2}'`
 set fastest=`sort -nr -k5,5 $output | head -1 | gawk '{print $2}'`
 
 # Append the first 10 lines of the ipw lookup for each of the two hosts.
 # The same length is used for all ipw excerpts in this script.
 echo "largest hitter = $largest" >> $output
 echo >> $output
 ipw $largest | head -10 >>& $output
 echo >> $output
 
 echo "fastest hitter = $fastest" >> $output
 echo >> $output
 ipw $fastest | head -10 >>& $output
 echo >> $output
 
 # ----------------------------------------------
 # check last hour
 # The hour is measured back from the newest logged hit (see hourAgo).
 
 hgsql -N -h genome-log -e "SELECT remote_host FROM access_log \
    WHERE time_stamp > $hourAgo" apachelog > xxLastHoursHostsxx
 # tally the hits per host and keep the five busiest
 sort xxLastHoursHostsxx | uniq -c | sort -nr | head -5 > xxHoursHitsxx
 echo "\n-------------------------------------------------------" >> $output
 echo " the busiest users in the last hour: \n" >> $output
 cat xxHoursHitsxx  >> $output
 echo >> $output
 
 # Hit count of the busiest host in that hour.  Fall back to 0 when the
 # tally is empty, so that the numeric comparisons here and in the final
 # alarm test remain valid expressions.
 set maxInHour=`head -1 xxHoursHitsxx | gawk '{print $1}'`
 if ("$maxInHour" == "") set maxInHour=0
 
 if ($maxInHour > $hourLimit) then
   set maxHourUser=`head -1 xxHoursHitsxx | gawk '{print $2}'`
   # Both rates are derived straight from the hourly count.  Taking the
   # per-second figure from the already rounded per-minute one could
   # shift it by a tenth.
   set maxHrUserPerMin=`echo $maxInHour | gawk '{printf  "%.0f", $1/60}'`
   set maxHrUserPerSec=`echo $maxInHour | gawk '{printf  "%.1f", $1/3600}'`
 
   echo " this user is above $hourLimit in the last hour: $maxHourUser" >> $output
   echo " $maxHrUserPerMin per min, $maxHrUserPerSec per sec\n" >> $output
   ipw $maxHourUser | head -10  >>& $output
   echo >> $output
 endif
 
 # ----------------------------------------------
 # Decide what to print: a short all-clear when neither alarm level was
 # exceeded, otherwise the full report collected in $output.
 
 if ($max <= $threshhold && $maxInHour <= $hourLimit) then
   echo "\n  no user  > $threshhold hits in last $hitSpan hours"
   echo "  and none >  $hourLimit in last hour\n"
 else
   cat $output
 endif
 
 # remove the scratch files, one at a time
 foreach scratch (remote_host xxUserCountxx $output xxLastHoursHostsxx xxHoursHitsxx)
   rm $scratch
 end
 exit