d74f464864e06a125090ce91540e63f46791b4df
angie
  Mon Nov 29 16:30:48 2021 -0800
Filter on --max-parsimony after usher run; increase --max-branch-length to 45 to accommodate highly diverged B.1.1.5289/Omicron and add --max-path-length 100 to filter out sequences with a (currently) ridiculous amount of divergence.  Annotate Nextstrain clades from nextstrain.clade-mutations.tsv only, no sample-name input.  Error out if $prevDate/lineageToName isn't available.  Add Pango lineage from tree to Taxonium protobuf output.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index c643d9d..958a9ff 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -33,87 +33,78 @@
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 if [ ! -s new.masked.vcf.gz ]; then
     $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf $baseProtobuf
 fi
 
 if [ ! -s gisaidAndPublic.$today.masked.pb ]; then
     $scriptDir/usherClusterRun.sh $today
     # Prune samples with too many private mutations and internal branches that are too long.
     $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
-        -b 30 \
+        --max-parsimony 20 \
+        --max-branch-length 45 \
+        --max-path-length 100 \
         -O -o gisaidAndPublic.$today.masked.pb
 fi
 
 # Exclude sequences with a very high number of EPPs from future runs
 grep ^Current usher.addNew.log \
 | awk '$16 >= 10 {print $8;}' \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     > tooManyEpps.ids
 cat tooManyEpps.ids >> ../tooManyEpps.ids
 
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 
 $scriptDir/combineMetadata.sh $prevDate $today
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
-# Add nextclade annotations to protobuf
-if [ -s $ottoDir/$prevDate/cladeToName ]; then
-    # Use yesterday's clade assignments to annotate clades on today's tree
+# Add nextclade annotations to protobuf (completely specified by nextstrain.clade-mutations.tsv)
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -M $scriptDir/nextstrain.clade-mutations.tsv \
     -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade
-else
-    time $matUtils annotate -T 50 \
-        -l \
-        -i gisaidAndPublic.$today.masked.pb \
-        -P ../nextstrain.clade-paths.tsv \
-        -o gisaidAndPublic.$today.masked.nextclade.pb
-fi
 
 # Add pangolin lineage annotations to protobuf.
 if [ -s $ottoDir/$prevDate/lineageToName ]; then
     time $matUtils annotate -T 50 \
         -i gisaidAndPublic.$today.masked.nextclade.pb \
         -M $scriptDir/pango.clade-mutations.tsv \
         -c $ottoDir/$prevDate/lineageToName \
         -f 0.95 \
         -D details.pango \
         -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
         >& annotate.pango
 else
-    time $matUtils annotate -T 50 \
-        -i gisaidAndPublic.$today.masked.nextclade.pb \
-        -P ../pango.clade-paths.tsv \
-        -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb
+    echo "Can't find $ottoDir/$prevDate/lineageToName for assigning lineages!"
+    exit 1
 fi
 
 # Replace protobuf with annotated protobuf.
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # Save clade & lineage annotations for use tomorrow.
 $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades
 tail -n+2 sample-clades \
 | tawk '{print $2, $1;}' \
 | sort > cladeToName
 tail -n+2 sample-clades \
 | tawk '{print $3, $1;}' \
 | sort > lineageToName
 
@@ -142,20 +133,23 @@
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
 done
 
 # Make Taxodium-formatted protobuf for display
 zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz \
 | grep -v '"ORF1a"' > ncbiGenes.gtf
 zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa
 zcat gisaidAndPublic.$today.metadata.tsv.gz > metadata.tmp.tsv
 time $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -f wuhCor1.fa \
     -g ncbiGenes.gtf \
     -M metadata.tmp.tsv \
+    --extra-fields pango_lineage_usher \
     --write-taxodium gisaidAndPublic.$today.masked.taxodium.pb
 rm metadata.tmp.tsv wuhCor1.fa
 gzip -f gisaidAndPublic.$today.masked.taxodium.pb
 
 $scriptDir/extractPublicTree.sh $today
+
+grep skipping annotate*