9461632185d991b472fb061651d21c358c5690ba
chmalee
  Mon Feb 10 10:52:53 2025 -0800
Make track log trim script use hgwdev as a cluster instead of ku, refs Lou email

diff --git src/hg/logCrawl/trimLogs/runTrimLogs.sh src/hg/logCrawl/trimLogs/runTrimLogs.sh
index 8372a73d7f9..c36332c5a5c 100755
--- src/hg/logCrawl/trimLogs/runTrimLogs.sh
+++ src/hg/logCrawl/trimLogs/runTrimLogs.sh
@@ -1,222 +1,210 @@
 #!/bin/bash
 
 ############################
 # meant to be run via cronjob
 ############################
 
 # -b: report terminated background jobs immediately, -e: exit on error,
 # -E: functions inherit the ERR trap, -u: unset variables are errors,
 # pipefail: a pipeline fails if any of its stages fails
 set -beEu -o pipefail
 
 # NOTE(review): EMAIL and GENSUB are not referenced in the visible part of this script
 WORKDIR="/hive/users/chmalee/logs/trimmedLogs"
-EMAIL="chmalee@ucsc.edu"
+EMAIL="browserqa-group@ucsc.edu"
 GENSUB="/cluster/bin/x86_64/gensub2"
 
 # name (YYYY-MM-DD) of this run's directory under WORKDIR
 today=`date +%F`
 
 # whether to run the trim step: 0 = run it (set by -t), anything else = skip it
 trimStep=1
 
-# which cluster to use, default to ku but can use hgwdev in a pinch
-cluster="ku"
-
 # whether to re-run on all files: 0 = force a full re-run (set by -f),
 # anything else = only the new and recent ones
 force=1
 
 # Print the command-line help text to stdout.
 # Takes no arguments and does not exit; the caller decides what happens next.
 # The synopsis lists the flags that are described below (-h, -t, -f).
 function usage()
 {
 cat << EOF
 Usage: `basename $0` [-htf]
 
 Optional Arguments:
 -h                  Show this help
 -t                  Trim error logs. Smart enough to only run on most recent additions.
 -f                  Force a re-run on all files, not just the newest ones
--c                  Use hgwdev instead of ku for cluster run
 
 This script is meant to be run via cronjob to check for new error logs and trim them
 down via parasol (the -t option). Checks /hive/data/inside/wwwstats/RR/ for new error_log files
 and trims them via the errorLogTrim script. Run that script with no args for more
 information.
 EOF
 }
 
 # Concatenate the per-file trimmed results into ${WORKDIR}/result/full.gz.
 #   $1 - file with one input log path per line; it is read after changing
 #        into ${WORKDIR}/${today}, so a relative name is resolved there
 # NOTE(review): not called from the visible code. makeJobList checks for
 # result/<mach>/<date>.trimmed.gz, while the path built here keeps the
 # "error_log." prefix — confirm which layout is current.
 function combineTrimmed()
 {
     fileList=$1
     cd ${WORKDIR}/${today}
     for f in $(< ${fileList})
     do
         # keep everything from the first "hgw" onward,
         # e.g. hgw1/error_log.20200105.gz
         fName=$(echo $f | grep -o "hgw.*")
         # drop a trailing .gz
         fName=${fName%.gz}
         cat ${WORKDIR}/result/${fName}.trimmed.gz >> ${WORKDIR}/result/full.gz
     done
 }
 
 # Print the machine name encoded in an error log path.
 #   $1 - path to an error_log file
 # Output (stdout): "euroNode" or "asiaNode" when the path contains that
 # string, otherwise "hgw" plus whatever sits between the last "hgw" and
 # "/error_log", e.g.
 #   /hive/data/inside/wwwstats/RR/2020/hgw1/error_log.20200105.gz -> hgw1
 # The name is echoed so that callers can capture it with $(getMachName path).
 function getMachName()
 {
     local fname=$1
     local mach=""
     if [[ "${fname}" == *"euroNode"* ]]
     then
         mach="euroNode"
     elif [[ "${fname}" == *"asiaNode"* ]]
     then
         mach="asiaNode"
     else
         # the ## strips the longest match from beginning of string,
         # while the % strips the shortest match from the end.
         # Work on the argument itself rather than on a caller's loop
         # variable, so the result does not depend on who is calling.
         mach=${fname##*hgw}
         mach="hgw"${mach%/error_log*}
     fi
     echo $mach
 }
 
 
 # Append one job line for a single log file to ./jobList.
 # Stands in for gensub2, because the asia/euro node files are laid out
 # differently from the RR machine files.
 #   $1 - path to an error_log.<date>.gz file
 makeJobList()
 {
     fname=$1
     # <date> part of the file name: drop everything through "error_log."
     # and the trailing .gz
     root1=${fname##*error_log.}
     root1=${root1%.gz}
     machName=$(getMachName $fname)
     printf '%s\n' "../trimLogs.sh ${fname} {check out exists ../result/${machName}/${root1}.trimmed.gz}" >> jobList
 }
 
 # Build the job list for today's run and submit it with para.
 # Works inside ${WORKDIR}/${today}; does nothing when jobFileList is absent.
 function runPara()
 {
     cd ${WORKDIR}/${today}
     if [ -e jobFileList ]
     then
         # one job line per listed log file, appended to ./jobList
         for f in $(cat jobFileList); do
             makeJobList $f
         done
-        if [[ "${cluster}" == "ku" ]]
-        then
-            ssh ku "cd ${WORKDIR}/${today}; para create jobList; para push; exit"
-        else
         # create and push the batch directly, without ssh-ing to another host
         # NOTE(review): -ram and -maxJob presumably set the per-job memory
         # and the concurrent job cap — confirm against the para usage text
         para create -ram=10g jobList
         para push -maxJob=10
     fi
-    fi
 }
 
 # Work out which log files need (re)trimming and write their paths, one per
 # line, to ${WORKDIR}/${today}/jobFileList.
 # Globals read: WORKDIR, today, force (0 = re-run every file, anything
 #   else = incremental run)
 # Files written in ${WORKDIR}/${today}: jobFileList.prev, allFileList and
 #   jobFileList. The ../result/<mach>/<date>.trimmed.gz file of every log
 #   that is selected for a re-run is removed.
 # Leaves the cwd at ${WORKDIR}/${today} and errexit switched on.
 function getFileList()
 {
     cd ${WORKDIR}
     # find the most recently modified run dir, example %T+ output
     #2019-10-21+09:41:17.5243280000  ./2019-10-21
     oldDir=`find . -maxdepth 1 -type d -name "20*" -printf "%T+\t%p\n" | sort -r | head -1 | cut -f2`
     mkdir -p ${WORKDIR}/${today}
     cd ${WORKDIR}/${today}
     # tolerate failures while the lists are gathered — presumably a missing
     # previous list or year globs that match nothing (TODO confirm)
     set +e
     sort ${WORKDIR}/${oldDir}/jobFileList > jobFileList.prev
     find /hive/data/inside/wwwstats/RR/{2019,202*}/hgw{1,2,3,4,5,6}/error_log.*.gz -print 2>/dev/null | sort > rr.tmp
     find /hive/data/inside/wwwstats/{euroNode,asiaNode}/{2019,202*}/error_log.*.gz -print 2>/dev/null | sort > asiaEuro.tmp
     sort rr.tmp asiaEuro.tmp > allFileList
     rm rr.tmp asiaEuro.tmp
     set -e
     if [[ ${force} -ne 0 ]]
     then
         # fallback list: the most recently checked logs plus the new ones
         cp jobFileList.prev jobFileList.tmp
         comm -13 ../${oldDir}/allFileList allFileList >> jobFileList.tmp
         # every time this script is run, we need to force a re-run on the
         # last 21 days of logs, as they may have only been partially
         # complete when we last ran the cluster job:
         lastWeek=`date -d "21 days ago" +%s`
         for f in `cat allFileList`
         do
             machName=$(getMachName $f)
             r=${f##*error_log.}
             r=${r%.gz}
             range=`date -d "${r}" +%s`
             if [ ${range} -gt ${lastWeek} ]
             then
                 toRemove="../result/${machName}/${r}.trimmed.gz"
                 if [ -e ${toRemove} ]; then
                     rm ${toRemove}
                 fi
                 echo $f >> jobFileList
             # elif rather than a second if: the result of a recent file has
             # just been removed, so an independent "result is missing" test
             # would write the same path to jobFileList a second time
             elif [ ! -e "../result/${machName}/${r}.trimmed.gz" ]
             then
                 # older file that has no trimmed result yet: run it too
                 echo $f >> jobFileList
             fi
         done
         # anything selected by the loop wins; the fallback list is only
         # used when the loop selected nothing
         if [ -e jobFileList ]
         then
             rm jobFileList.tmp
         else
             mv jobFileList.tmp jobFileList
         fi
     else
         # forced run: drop every existing result and queue every file
         for f in `cat allFileList`
         do
             set +e
             machName=$(getMachName $f)
             r=${f##*error_log.}
             r=${r%.gz}
             toRemove="../result/${machName}/${r}.trimmed.gz"
             if [ -e ${toRemove} ]; then
                 rm ${toRemove}
             fi
             set -e
         done
         cp allFileList jobFileList
     fi
 }
 
 # The whole trim step: first choose the files to process (getFileList),
 # then build and submit the jobs for them (runPara).
 doTrimStep()
 {
     getFileList
     runPara
 }
 
 # Parse the command-line flags.
 # NOTE(review): "c" is still in the option string although its case arm was
 # removed, so -c is accepted by getopts and ends up in the ?) arm below —
 # confirm that is intended.
 while getopts "htcf" opt
 do
     case $opt in
         h)
             usage
             exit 0
             ;;
         t)
             # 0 means "run the trim step"
             trimStep=0
             ;;
-        c)
-            cluster="hgwdev"
-            ;;
         f)
             # 0 means "force a re-run on all files"
             force=0
             ;;
         ?)
             # single-character pattern: catches every other value of opt,
             # including the "?" getopts stores for an invalid option;
             # the script keeps running afterwards
             printf "unknown option %s\n" "$opt"
             # NOTE(review): EXIT_STATUS is not read anywhere in the visible script
             EXIT_STATUS=1
             ;;
     esac
 done
 
 # Called without any arguments: print the hint and the help text, and leave
 # both the trim step and the forced re-run switched off (1 = off).
 if (( $# == 0 )) && [[ -n "${trimStep}" ]]; then
     printf "please run with -t\n"
     usage
     trimStep=1
     force=1
 fi
 
 # trimStep is 0 only when -t was given on the command line
 if [[ ${trimStep} -eq 0 ]]
 then
     doTrimStep
 fi
 
 # $? here is the exit status of the if statement just above.
 # NOTE(review): when the trim step is skipped that status is 0, so "Done
 # trimming logs" is printed even though nothing ran; and with errexit on, a
 # failing command inside doTrimStep probably ends the script before this
 # test is reached — confirm whether the else branch is reachable.
 if [[ $? -eq 0 ]]
 then
     echo "Done trimming logs"
 else
     echo "Potential error during log trimming. Check ${WORKDIR}/${today} for more information."
 fi