#!/bin/bash

# Reconstructed from commit 26efd8d58625376a6f5f5304697c167db5870dcb
# (chmalee, Mon Jun 2 14:59:16 2025 -0700):
#   "Adding some missing track counting cronjob scripts to the source tree
#    so qateam can start running them, refs Lou email"
# Path: src/hg/logCrawl/trackCounts/runCountByDate.sh

############################
# meant to be run via cronjob
############################

set -beEu -o pipefail

WORKDIR="/hive/users/chmalee/logs/byDate"
EMAIL="chmalee@ucsc.edu"
GENSUB="/cluster/bin/x86_64/gensub2"

# per-run work dir name, YYYY-MM-DD
today=$(date +%F)

# NOTE: inverted boolean — 1 (default) = incremental run on new files only;
# 0 (set by -f) = force a re-run on all files
force=1

# cluster to use, default to ku
cluster=ku

# Print usage/help text to stdout.
function usage()
{
cat << EOF
Usage: `basename $0` [-hfc]

Optional Arguments:
-h Show this help
-f Force a re-run on all files, not just the newest ones
-c Use hgwdev as a cluster instead of ku

This script is meant to be run via cronjob to check for new trimmed error logs and tally
up track and database usage
EOF
}

# Generate a parasol jobList from jobFileList (if one was produced) and launch
# the cluster run: on ku via ssh, or locally on hgwdev with a job cap.
function runPara()
{
    cd ${WORKDIR}/${today}
    if [ -e jobFileList ]
    then
        ${GENSUB} jobFileList single ../template jobList
        if [[ ${cluster} == "ku" ]]
        then
            ssh ku "cd ${WORKDIR}/${today}; para create -ram=20g jobList; para push; exit"
        else
            /cluster/bin/x86_64/para create -ram=20g jobList
            /cluster/bin/x86_64/para push -maxJob=10
        fi
    fi
}

# Decide which trimmed log files need (re-)processing and write jobFileList in
# today's work dir. Results for recent logs are deleted so the cluster job
# recreates them, since the newest logs may have been only partially complete
# during the previous run.
function getFileList()
{
    cd ${WORKDIR}
    # find the most recent dir, example %T+ output
    #2019-10-21+09:41:17.5243280000 ./2019-10-21
    oldDir=$(find . -maxdepth 1 -type d -name "20*" -printf "%T+\t%p\n" | sort -r | head -1 | cut -f2)
    mkdir -p ${WORKDIR}/${today}
    cd ${WORKDIR}/${today}
    if [ -e ${WORKDIR}/${oldDir}/jobFileList ]
    then
        sort ${WORKDIR}/${oldDir}/jobFileList > jobFileList.prev
    else
        touch jobFileList.prev
    fi
    # tolerate an unmatched glob (find would exit non-zero) without aborting
    set +e
    find /hive/users/chmalee/logs/trimmedLogs/result/{asiaNode,euroNode,hgw{1,2,3,4,5,6}}/*trimmed.gz -print 2>/dev/null | sort > allFileList
    set -e
    if [[ ${force} -ne 0 ]]
    then
        # incremental run
        cp jobFileList.prev jobFileList.tmp
        # the most recently checked logs plus the new ones:
        comm -13 ../${oldDir}/allFileList allFileList >> jobFileList.tmp
        # every time this script is run, we need to force a re-run on the last
        # 50 days of logs, as they may have been only partially complete when
        # we last ran the cluster job:
        lastWeek=$(date -d "50 days ago" +%s)
        for f in $(cat allFileList)
        do
            r=${f%.trimmed.gz}          # strip suffix, path now ends in the date
            r=${r##*/}                  # basename -> YYYY-MM-DD
            machName=${f##*result/}     # e.g. hgw1/2019-10-21.trimmed.gz
            machName=${machName%/*}     # e.g. hgw1
            range=$(date -d "${r}" +%s)
            printf "checking if range %s > lastWeek %s for %s %s %s\n" "$range" "$lastWeek" "$r" "$machName" "$f" >> debug.log
            if [ ${range} -gt ${lastWeek} ]
            then
                printf "is greater\n" >> debug.log
                toRemoveDb="../result/${machName}/${r}.trimmed.dbUsage.gz"
                toRemoveTrack="../result/${machName}/${r}.trimmed.trackUsage.gz"
                printf "should be removing %s\n" "$toRemoveDb" >> debug.log
                printf "should be removing %s\n" "$toRemoveTrack" >> debug.log
                if [ -e ${toRemoveTrack} ]; then
                    printf "removing %s\n" "$toRemoveTrack" >> debug.log
                    rm ${toRemoveTrack}
                fi
                if [ -e ${toRemoveDb} ]; then
                    printf "removing %s\n" "$toRemoveDb" >> debug.log
                    rm ${toRemoveDb}
                fi
                #echo $f >> jobFileList
            fi
            # if it's a brand new file (no dbUsage/trackUsage result yet) we
            # need to run it.  BUGFIX: the original tested
            # [ ! -e "...trimmed.*.gz" ] — a glob inside quotes is not
            # expanded, so -e checked for a file literally named '*', which
            # never exists and made every file look brand new.  compgen -G
            # correctly tests whether the glob matches anything.
            if ! compgen -G "../result/${machName}/${r}.trimmed.*.gz" > /dev/null
            then
                echo $f >> jobFileList
            fi
        done
        # NOTE(review): if any file was appended to jobFileList above, the
        # accumulated jobFileList.tmp (prev + newly-appeared logs) is
        # discarded; only when nothing was appended does .tmp become the job
        # list.  Behavior preserved as-is — confirm this is intended.
        if [ -e jobFileList ]
        then
            rm jobFileList.tmp
        else
            mv jobFileList.tmp jobFileList
        fi
    else
        # forced full re-run (-f): delete all prior results, process everything
        for f in $(cat allFileList)
        do
            set +e
            r=${f%.trimmed.gz}
            r=${r##*/}
            machName=${f##*result/}
            machName=${machName%/*}
            toRemoveDb="../result/${machName}/${r}.trimmed.dbUsage.gz"
            toRemoveTrack="../result/${machName}/${r}.trimmed.trackUsage.gz"
            if [ -e ${toRemoveTrack} ]; then
                # BUGFIX: was logging $toRemoveDb while removing the track file
                printf "removing %s\n" "$toRemoveTrack" >> debug.log
                rm ${toRemoveTrack}
            fi
            if [ -e ${toRemoveDb} ]; then
                printf "removing %s\n" "$toRemoveDb" >> debug.log
                rm ${toRemoveDb}
            fi
            set -e
        done
        cp allFileList jobFileList
    fi
}

# silent-mode getopts (leading ':') so bad options land in $OPTARG
while getopts ":hfc" opt
do
    case $opt in
        h)
            usage
            exit 0
            ;;
        f)
            force=0
            ;;
        c)
            cluster="hgwdev"
            ;;
        ?)
            # BUGFIX: previously set an unused EXIT_STATUS and kept going;
            # report the offending option to stderr and fail.
            printf "unknown option -%s\n" "$OPTARG" >&2
            usage
            exit 1
            ;;
    esac
done

getFileList
runPara

echo "Done tallying usage"