d936f6666770ec1a86f10344a828dceb213ee6f8
angie
  Wed Mar 8 10:41:48 2023 -0800
Add public tree download limited to 6M sequences for ShUShER so it doesn't run out of browser memory.

diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
index 0c09743..50a84d3 100755
--- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
+++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
@@ -94,55 +94,62 @@
 sampleCountComma=$(echo $(wc -l < samples.public.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
 # Make Taxonium V2 .jsonl.gz protobuf for display
 usher_to_taxonium --input public-$today.all.masked.pb \
     --metadata public-$today.metadata.tsv.gz \
     --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \
     --columns genbank_accession,country,date,pangolin_lineage,pango_lineage_usher \
     --clade_types=nextstrain,pango \
     --name_internal_nodes \
     --title "$today tree with sequences from GISAID, INSDC, COG-UK and CNCB" \
     --output public-$today.all.masked.taxonium.jsonl.gz
 
+# Make a size-limited public tree for ShUShER so it doesn't exceed browser memory limits
+$matUtils extract -i public-$today.all.masked.pb --set-size 6000000 \
+    -o public-$today.all.masked.ShUShER.pb.gz
+
 # Link to public trees download directory hierarchy
 archiveRoot=/hive/users/angie/publicTrees
 read y m d < <(echo $today | sed -re 's/-/ /g')
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
 ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
 gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
 ln -f `pwd`/public-$today.metadata.tsv.gz $archive/
 gzip -c public-$today.all.masked.nextclade.pangolin.pb \
     > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
 gzip -c lineageToPublicName > $archive/lineageToPublicName.tsv.gz
 gzip -c cladeToPublicName > $archive/cladeToPublicName.tsv.gz
 ln -f `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
 ln -f `pwd`/public-$today.all.masked.taxonium.jsonl.gz $archive/
+ln -f `pwd`/public-$today.all.masked.ShUShER.pb.gz $archive/
 
 # Update 'latest' in $archiveRoot
 ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
 ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
 ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
 ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
 ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
 ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
 ln -f $archive/public-$today.all.masked.taxonium.jsonl.gz \
     $archiveRoot/public-latest.all.masked.taxonium.jsonl.gz
+ln -f $archive/public-$today.all.masked.ShUShER.pb.gz \
+    $archiveRoot/public-latest.all.masked.ShUShER.pb.gz
 
 # Update hgdownload-test link for archive
 mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 
 # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories
 pigz -p 8 -c samples.public.$today > samples.public.$today.gz
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb
     ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt
     ln -sf `pwd`/samples.public.$today.gz $dir/public-latest.names.gz
 done
 
 # Update MSA and make tree version with only MSA sequences