092adf5dfa56fee387d3836b3493ac46ee2c0281
angie
  Fri Feb 26 14:48:50 2021 -0800
Add 'matUtils annotate' calls to add Nextclade and Pangolin assignments to protobuf for hgPhyloPlace.

diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
index 7e26bbe..8adb374 100755
--- src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
@@ -292,30 +292,56 @@
 cogUkDate=$(ls -l $cogUkDir | sed -re 's/.*cogUk\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 if [ $ncbiDate == $cogUkDate ]; then
     echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $ncbiDate; CNCB sequences downloaded $cncbDate" \
         > version.txt
 else
     echo "sarscov2phylo release 13-11-20; NCBI sequences downloaded $ncbiDate; COG-UK sequences downloaded $cogUkDate; CNCB sequences downloaded $cncbDate" \
         > version.txt
 fi
 
 sampleCountComma=$(echo $sampleCount \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
+cp -p public-$today.all.masked.pb{,.bak}
+
+# Add Nextclade clade annotations (metadata column 8) to protobuf
+zcat public-$today.metadata.tsv.gz \
+| tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
+| sed -re 's/^20E \(EU1\)/20E.EU1/;' \
+    > cladeToPublicName
+time ~/github/usher/build/matUtils annotate -T 50 \
+    -i public-$today.all.masked.pb \
+    -c cladeToPublicName \
+    -o public-$today.all.masked.nextclade.pb \
+    >& annotate.nextclade.out
+
+# Add Pangolin lineage annotations (metadata column 9) to protobuf
+zcat public-$today.metadata.tsv.gz \
+| tail -n+2 | tawk '$9 != "" {print $9, $1;}' \
+    > lineageToPublicName
+time ~/github/usher/build/matUtils annotate -T 50 \
+    -i public-$today.all.masked.nextclade.pb \
+    -c lineageToPublicName \
+    -o public-$today.all.masked.nextclade.pangolin.pb \
+    >& annotate.pangolin.out
+
+# Not all Pangolin lineages can be assigned to nodes, so for now use only the Nextclade-annotated protobuf.
+cp -p public-$today.all.masked.nextclade.pb public-$today.all.masked.pb
+
 # Update gbdb links -- not every day, too much churn for getting releases out and the
 # tracks are getting unmanageably large for VCF.
 if false; then
 for f in public-$today.all{,.minAf*}.vcf.gz ; do
     t=$(echo $f | sed -re "s/-$today//;")
     ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t
     ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi
 done
 ln -sf `pwd`/public-$today.all.nwk /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nwk
 ln -sf `pwd`/public-$today.all.parsimony.bw \
     /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.parsimony.bw
 ln -sf `pwd`/public-$today.lineageColors.gz \
     /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.lineageColors.gz
 ln -sf `pwd`/public-$today.nextstrainColors.gz \
     /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nextstrainColors.gz