665fe7dc2ae7d018811a15f79c42bb29904d8630 angie Tue Feb 22 11:12:28 2022 -0800
Add a para push to wait loop, to help with rare problem of a node failure. Clean up intermediate protobuf files-to-be-merged.

diff --git src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
index 6eb0965..d12a558 100755
--- src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
+++ src/hg/utils/otto/sarscov2phylo/usherClusterRun.sh
@@ -122,37 +122,43 @@
 cd reserveKuNodes
 jobCount=16
 threadCount=16
 reserveHours=2
 reserveSeconds=$((3600 * $reserveHours))
 cp /dev/null jobList
 for ((i=0; $i < $jobCount; i++)); do
     echo "sleep $reserveSeconds" >> jobList
 done
 ssh ku "cd $ottoDir/$today/reserveKuNodes && para create -cpu=$threadCount jobList && para push"
 
 # Wait for all jobs to start (i.e. all nodes to be reserved):
 echo "Waiting for parasol to assign ku nodes"
 while (( $(ssh ku "parasol list jobs | grep $USER | grep sleep" | awk '$2 != "none"' | wc -l) < $jobCount )); do
     sleep 10
+    # Just in case there was a crash (we seem to have some borderline nodes):
+    ssh ku "cd $ottoDir/$today/reserveKuNodes && para push"
 done
 
 # Make hostfile for mpirun
 cd $ottoDir/$today
 ssh ku "parasol list jobs | grep $USER | grep sleep" \
     | awk '{print $2;}' \
     | sort \
     | uniq -c \
     | awk '{print $2, "slots=" '$threadCount'*$1;}' \
     > hostfile
 
 # mpirun matOptimize on first host in hostfile
 headNode=$(head -1 hostfile | awk '{print $1;}')
 radius=8
 ssh $headNode "cd $ottoDir/$today && \
     $scriptDir/kuRunMatOptimize.sh -T $threadCount -r $radius -M $reserveHours \
     -S move_log.cluster -i merged.deltaMasked.pb \
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& matOptimize.cluster.log"
 
 # Release the ku nodes by stopping the parasol batch
 ssh ku "cd $ottoDir/$today/reserveKuNodes && para stop || true"
+
+# Clean up
+rm -f preTrim.*.pb merged.*.*.pb
+chmod 664 gis*.preTrim*.pb