0b7dfc66c9f922d96a37066655a01299a3c70de0
angie
  Tue Nov 2 12:56:59 2021 -0700
Run matOptimize on the cluster.  It uses MPI for job management; reserve cluster nodes with a fake parasol job and specify those nodes for MPI.

diff --git src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
index 2cd634c..d99b041 100755
--- src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
+++ src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
@@ -10,31 +10,30 @@
 
 if [ $# != 1 ]; then
   usage
   exit 1
 fi
 
 today=$1
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
-matOptimize=$usherDir/build/matOptimize
 
 cd $ottoDir/$today
 
 # 16 cluster jobs, 16 threads each --> 1/4 capacity of ku cluster.  (for many hours)
 jobCount=16
 threadCount=16
 
 sampleCount=$(vcfSamples new.masked.vcf.gz | wc -l)
 samplesPerJob=$(( ($sampleCount + ($jobCount-1)) / $jobCount ))
 
 echo $sampleCount samples, $jobCount jobs, $samplesPerJob samples per job
 for ((i=0;  $i < $jobCount;  i++)); do
     startIx=$(( 10 + ($i * $samplesPerJob) ))
     if [ $i == $(( $jobCount - 1 )) ]; then
         endIx=$(( 9 + $sampleCount ))
@@ -81,18 +80,50 @@
 $matUtils merge -T 8 --input-mat-1 preTrim.14.pb --input-mat-2 preTrim.15.pb -o merged.14.15.pb &
 wait
 
 time $matUtils merge -T 16 --input-mat-1 merged.0.1.pb --input-mat-2 merged.2.3.pb -o merged.0.3.pb &
 $matUtils merge -T 16 --input-mat-1 merged.4.5.pb --input-mat-2 merged.6.7.pb -o merged.4.7.pb &
 $matUtils merge -T 16 --input-mat-1 merged.8.9.pb --input-mat-2 merged.10.11.pb -o merged.8.11.pb &
 $matUtils merge -T 16 --input-mat-1 merged.12.13.pb --input-mat-2 merged.14.15.pb -o merged.12.15.pb &
 wait
 
 time $matUtils merge -T 32 --input-mat-1 merged.0.3.pb --input-mat-2 merged.4.7.pb -o merged.0.7.pb &
 $matUtils merge -T 32 --input-mat-1 merged.8.11.pb --input-mat-2 merged.12.15.pb -o merged.8.15.pb &
 wait
 
 time $matUtils merge -T 64 --input-mat-1 merged.0.7.pb --input-mat-2 merged.8.15.pb -o merged.pb
 
-# Optimize
-$matOptimize -T 40 -r 8 -S move_log -i merged.pb -M 6 -o gisaidAndPublic.$today.masked.preTrim.pb \
-    >& matOptimize.merged.log
+# Optimize on the cluster too, with Cheng's MPI-parallelized matOptimize
+# Reserve parasol nodes with a "cluster run" of sleep commands
+mkdir -p reserveKuNodes
+cd reserveKuNodes
+reserveHours=8
+reserveSeconds=$((3600 * $reserveHours))
+for ((i=0;  $i < $jobCount;  i++)); do
+    echo "sleep $reserveSeconds"
+done > jobList
+ssh ku "cd $ottoDir/$today/reserveKuNodes && para create -cpu=$threadCount jobList && para push"
+# Release the ku nodes (stop the parasol batch) however the script exits, even on failure.
+trap "ssh ku 'cd $ottoDir/$today/reserveKuNodes && para stop'" EXIT
+
+# Print nodes of $USER's started sleep jobs only (unstarted jobs have "none" in column 2).
+reservedKuNodes() {
+    ssh ku "parasol list jobs | grep $USER | grep sleep" | awk '$2 != "none" {print $2;}'
+}
+
+# Wait for all jobs to start (i.e. all nodes to be reserved):
+echo "Waiting for parasol to assign ku nodes"
+while (( $(reservedKuNodes | wc -l) < $jobCount )); do
+    sleep 10
+done
+
+# Make hostfile for mpirun
+cd $ottoDir/$today
+reservedKuNodes | awk -v slots="$threadCount" '{print $1 " slots=" slots;}' > hostfile
+
+# mpirun matOptimize on first host in hostfile
+headNode=$(head -1 hostfile | awk '{print $1;}')
+radius=8
+ssh $headNode "cd $ottoDir/$today && \
+               $scriptDir/kuRunMatOptimize.sh -T $threadCount -r $radius \
+                   -S move_log.cluster -i merged.pb -o gisaidAndPublic.$today.masked.preTrim.pb \
+                   >& matOptimize.cluster.log"