7cef6a7083abdbdfb242575ccb0dcb7a73c42197
angie
  Thu Sep 30 09:58:01 2021 -0700
New script usherClusterRun.sh runs usher on ku, then merges & optimizes the results, replacing the single hgwdev job.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 71798f2..8602463 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -24,50 +24,44 @@
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 if [ ! -s new.masked.vcf.gz ]; then
     $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf
 fi
 
-time $usher -u -T 80 \
-    -A \
-    -e 5 \
-    --max-parsimony-per-sample 20 \
-    -v new.masked.vcf.gz \
-    -i prevRenamed.pb \
-    -o gisaidAndPublic.$today.masked.preTrim.pb \
-    >& usher.addNew.log
-mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
+if [ ! -s gisaidAndPublic.$today.masked.pb ]; then
+    $scriptDir/usherClusterRun.sh $today
+    # Prune samples with too many private mutations and internal branches that are too long.
+    $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
+        -b 30 \
+        -O -o gisaidAndPublic.$today.masked.pb
+fi
 
 # Exclude sequences with a very high number of EPPs from future runs
 grep ^Current usher.addNew.log \
 | awk '$16 >= 10 {print $8;}' \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
-    >> ../tooManyEpps.ids
-
-# Prune samples with too many private mutations and internal branches that are too long.
-$matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
-    -b 30 \
-    -O -o gisaidAndPublic.$today.masked.pb
+    > tooManyEpps.ids
+cat tooManyEpps.ids >> ../tooManyEpps.ids
 
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 
 $scriptDir/combineMetadata.sh $prevDate $today
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf