95c99e66b78b8b8c4e26bb1ea91a0113d64caf04
angie
  Thu Aug 19 11:54:39 2021 -0700
matUtils annotate -P fails too often because of little changes in the tree that affect which mutations are collapsed into the same code (or not).  More sophisticated path-finding logic in matUtils could help with that, but until then, use the previous day's clade assignments as input for matUtils annotate -c for the big tree, and use the big tree's clade assignments as input for -c for the public-only tree.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index fb3411d..2d900ab 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -21,30 +21,31 @@
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
+prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 renaming=oldAndNewNames
 
 if [ ! -s new.masked.vcf.gz ]; then
 
 # Make lists of sequences already in the tree.
 $matUtils extract -i $prevProtobufMasked -u prevNames
 
 # Before updating the tree with new sequences, update the names used in the tree:
 # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
 # * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
@@ -261,34 +262,38 @@
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 # Add masked VCF to previous protobuf
+#*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)...
+#*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites
+#*** masked more gracefully, and update the Problematic Sites track.
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
+| tawk '$2 != 21987' \
 | gzip -c \
     > new.masked.vcf.gz
 
 fi # if [ ! -s new.masked.vcf.gz ]
 
 time $usher -u -T 80 \
     -A \
     -e 5 \
     -v new.masked.vcf.gz \
     -i prevRenamed.pb \
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
 
 # Exclude sequences with a very high number of EPPs from future runs
@@ -385,44 +390,76 @@
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 gzip -f gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
+if [ -s $ottoDir/$prevDate/cladeToName ]; then
+    # Use yesterday's clade assignments to annotate clades on today's tree
+    time $matUtils annotate -T 50 \
+        -l \
+        -i gisaidAndPublic.$today.masked.pb \
+        -c $ottoDir/$prevDate/cladeToName \
+        -f 0.95 \
+        -D details.nextclade \
+        -o gisaidAndPublic.$today.masked.nextclade.pb \
+        >& annnotate.nextclade
+else
     time $matUtils annotate -T 50 \
         -l \
         -i gisaidAndPublic.$today.masked.pb \
         -P ../nextstrain.clade-paths.tsv \
         -o gisaidAndPublic.$today.masked.nextclade.pb
+fi
 
 # Add pangolin lineage annotations to protobuf.
+if [ -s $ottoDir/$prevDate/lineageToName ]; then
+    time $matUtils annotate -T 50 \
+        -i gisaidAndPublic.$today.masked.nextclade.pb \
+        -c $ottoDir/$prevDate/lineageToName \
+        -f 0.95 \
+        -D details.pango \
+        -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
+        >& annotate.pango
+else
     time $matUtils annotate -T 50 \
         -i gisaidAndPublic.$today.masked.nextclade.pb \
         -P ../pango.clade-paths.tsv \
         -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb
+fi
 
+# Replace protobuf with annotated protobuf.
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
+# Save clade & lineage annotations for use tomorrow.
+$matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades
+tail -n+2 sample-clades \
+| tawk '{print $2, $1;}' \
+| sort > cladeToName
+tail -n+2 sample-clades \
+| tawk '{print $3, $1;}' \
+| sort > lineageToName
+
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
 done
 
 $scriptDir/extractPublicTree.sh $today