df04c1f66cdd6895471998c6d9c70b1a413f7d4e
angie
  Sat Mar 12 20:33:25 2022 -0800
Use new --include-nt option when generating taxonium protobufs.  Also watch out for EPI_ISL as part of GenBank name.

diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
index 11bca43..5396ad4 100755
--- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
+++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
@@ -15,31 +15,31 @@
 fi
 
 today=$1
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 cncbDir=$ottoDir/cncb.latest
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 
 cd $ottoDir/$today
 
 # Extract public samples from tree
-grep -v EPI_ISL_ samples.$today > newPublicNames
+grep -v \|EPI_ISL_ samples.$today > newPublicNames
 # Dunno why, but when I tried using -s together with the filtering params, it ran for 3 hours
 # and I killed it -- stuck in a loop?  Run two commands:
 $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -s newPublicNames \
     -o public-$today.all.masked.preTrim.pb
 $matUtils extract -i public-$today.all.masked.preTrim.pb \
     --max-parsimony 20 \
     --max-branch-length 45 \
     --max-path-length 150 \
     -O -o public-$today.all.masked.pb
 
 # Add nextclade annotations to protobuf (completely specified by nextstrain.clade-mutations.tsv)
 grep -v EPI_ISL cladeToName > cladeToPublicName
 time $matUtils annotate -T 50 \
     -l \
@@ -81,30 +81,31 @@
 sampleCountComma=$(echo $(wc -l < samples.public.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
 # Make Taxodium-formatted protobuf for display
 zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz \
 | grep -v '"ORF1a"' > ncbiGenes.gtf
 zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa
 zcat public-$today.metadata.tsv.gz > metadata.tmp.tsv
 time $matUtils extract -i public-$today.all.masked.pb \
     -f wuhCor1.fa \
     -g ncbiGenes.gtf \
     -M metadata.tmp.tsv \
     --extra-fields pango_lineage_usher \
+    --include-nt \
     --write-taxodium public-$today.all.masked.taxodium.pb
 rm metadata.tmp.tsv wuhCor1.fa
 gzip -f public-$today.all.masked.taxodium.pb
 
 # Link to public trees download directory hierarchy
 archiveRoot=/hive/users/angie/publicTrees
 read y m d < <(echo $today | sed -re 's/-/ /g')
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
 ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
 gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
 ln -f `pwd`/public-$today.metadata.tsv.gz $archive/
 gzip -c public-$today.all.masked.nextclade.pangolin.pb \
     > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz