665fe7dc2ae7d018811a15f79c42bb29904d8630
angie
  Tue Feb 22 11:12:28 2022 -0800
Add a para push to the wait loop, to help with the rare problem of a node failure.  Clean up intermediate protobuf files-to-be-merged.

diff --git src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
index 6eb0965..d12a558 100755
--- src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
+++ src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
@@ -122,37 +122,43 @@
 cd reserveKuNodes
 jobCount=16
 threadCount=16
 reserveHours=2
 reserveSeconds=$((3600 * $reserveHours))
 cp /dev/null jobList
 for ((i=0;  $i < $jobCount;  i++)); do
     echo "sleep $reserveSeconds" >> jobList
 done
 ssh ku "cd $ottoDir/$today/reserveKuNodes && para create -cpu=$threadCount jobList && para push"
 
 # Wait for all jobs to start (i.e. all nodes to be reserved):
 echo "Waiting for parasol to assign ku nodes"
 while (( $(ssh ku "parasol list jobs | grep $USER | grep sleep" | awk '$2 != "none"' | wc -l) < $jobCount )); do
     sleep 10
+    # Re-run para push on each 10s poll in case a reservation job crashed (we seem to have some borderline nodes):
+    ssh ku "cd $ottoDir/$today/reserveKuNodes && para push"
 done
 
 # Make hostfile for mpirun
 cd $ottoDir/$today
 ssh ku "parasol list jobs | grep $USER | grep sleep" \
 | awk '{print $2;}' \
 | sort \
 | uniq -c \
 | awk '{print $2, "slots=" '$threadCount'*$1;}' \
     > hostfile
 
 # mpirun matOptimize on first host in hostfile
 headNode=$(head -1 hostfile | awk '{print $1;}')
 radius=8
 ssh $headNode "cd $ottoDir/$today && \
                $scriptDir/kuRunMatOptimize.sh -T $threadCount -r $radius -M $reserveHours \
                    -S move_log.cluster -i merged.deltaMasked.pb \
                    -o gisaidAndPublic.$today.masked.preTrim.pb \
                    >& matOptimize.cluster.log"
 
 # Release the ku nodes by stopping the parasol batch
 ssh ku "cd $ottoDir/$today/reserveKuNodes && para stop || true"
+
+# Clean up: remove intermediate preTrim.*.pb and merged.*.*.pb files; make gis*.preTrim*.pb group-writable (mode 664)
+rm -f preTrim.*.pb merged.*.*.pb
+chmod 664 gis*.preTrim*.pb