879c1d08bc8edc582bf06681567834590824ad4c
angie
  Fri Aug 26 11:51:10 2022 -0700
Update branch length filter, annotated clades & lineages using previous day's paths for speed, add Taxonium v2 output, add MSA of public sequences.

diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
index 5396ad4..4c0becb 100755
--- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
+++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
@@ -1,136 +1,190 @@
 #!/bin/bash
 set -beEu -x -o pipefail
 
 #	Do not modify this script, modify the source tree copy:
 #	kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
 
 usage() {
-    echo "usage: $0 date"
+    echo "usage: $0 date prevDate"
     echo "This assumes that \$date/gisaidAndPublic.\$date.masked.pb has been generated."
 }
 
-if [ $# != 1 ]; then
+if [ $# != 2 ]; then
   usage
   exit 1
 fi
 
 today=$1
+prevDate=$2
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 cncbDir=$ottoDir/cncb.latest
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 
 cd $ottoDir/$today
 
 # Extract public samples from tree
 grep -v \|EPI_ISL_ samples.$today > newPublicNames
 # Dunno why, but when I tried using -s together with the filtering params, it ran for 3 hours
 # and I killed it -- stuck in a loop?  Run two commands:
 $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -s newPublicNames \
     -o public-$today.all.masked.preTrim.pb
 $matUtils extract -i public-$today.all.masked.preTrim.pb \
     --max-parsimony 20 \
-    --max-branch-length 45 \
+    --max-branch-length 50 \
     --max-path-length 150 \
     -O -o public-$today.all.masked.pb
 
 # Add nextclade annotations to protobuf (completely specified by nextstrain.clade-mutations.tsv)
-grep -v EPI_ISL cladeToName > cladeToPublicName
+grep -v \|EPI_ISL cladeToName > cladeToPublicName
 time $matUtils annotate -T 50 \
     -l \
     -i public-$today.all.masked.pb \
+    -P $ottoDir/$prevDate/cladeToPath.public \
     -M $scriptDir/nextstrain.clade-mutations.tsv \
     -D details.nextclade.public \
     -o public-$today.all.masked.nextclade.pb \
     >& annotate.nextclade.public
 
 # Add pangolin lineage annotations to public protobuf
-grep -v EPI_ISL lineageToName > lineageToPublicName
+grep -v \|EPI_ISL lineageToName > lineageToPublicName
 time $matUtils annotate -T 50 \
     -i public-$today.all.masked.nextclade.pb \
+    -P $ottoDir/$prevDate/lineageToPath.public \
     -M $scriptDir/pango.clade-mutations.tsv \
     -c lineageToPublicName \
     -f 0.95 \
     -D details.pango.public \
     -o public-$today.all.masked.nextclade.pangolin.pb \
     >& annotate.pango.public
 
 # Extract Newick and VCF from public-only tree
 time $matUtils extract -i public-$today.all.masked.pb \
     -t public-$today.all.nwk \
     -v public-$today.all.masked.vcf
 time pigz -p 8 -f public-$today.all.masked.vcf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
-| grep -v EPI_ISL_ \
+| grep -v \|EPI_ISL_ \
 | pigz -p 8 \
     > public-$today.metadata.tsv.gz
 
 rm public-$today.all.masked.pb
 ln -f public-$today.all.masked.nextclade.pangolin.pb public-$today.all.masked.pb
 
+# Save paths for use tomorrow.
+$matUtils extract -i public-$today.all.masked.pb -C clade-paths.public
+tail -n+2 clade-paths.public \
+| grep -E '^[12]' \
+| cut -f 1,3 > cladeToPath.public
+tail -n+2 clade-paths.public \
+| grep -E '^[A-Za-z]' \
+| cut -f 1,3 > lineageToPath.public
+
+
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.txt
 
 $matUtils extract -i public-$today.all.masked.pb -u samples.public.$today
 sampleCountComma=$(echo $(wc -l < samples.public.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
-# Make Taxodium-formatted protobuf for display
+# Make Taxonium V1-formatted protobuf for display
 zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz \
 | grep -v '"ORF1a"' > ncbiGenes.gtf
 zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa
 zcat public-$today.metadata.tsv.gz > metadata.tmp.tsv
 time $matUtils extract -i public-$today.all.masked.pb \
     -f wuhCor1.fa \
     -g ncbiGenes.gtf \
     -M metadata.tmp.tsv \
     --extra-fields pango_lineage_usher \
     --include-nt \
     --write-taxodium public-$today.all.masked.taxodium.pb
 rm metadata.tmp.tsv wuhCor1.fa
 gzip -f public-$today.all.masked.taxodium.pb
 
+# Make Taxonium V2 .jsonl.gz protobuf for display
+usher_to_taxonium --input public-$today.all.masked.pb \
+    --metadata public-$today.metadata.tsv.gz \
+    --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \
+    --columns genbank_accession,country,date,pangolin_lineage,pango_lineage_usher \
+    --output public-$today.all.masked.taxonium.jsonl.gz
+
 # Link to public trees download directory hierarchy
 archiveRoot=/hive/users/angie/publicTrees
 read y m d < <(echo $today | sed -re 's/-/ /g')
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
 ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
 gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
 ln -f `pwd`/public-$today.metadata.tsv.gz $archive/
 gzip -c public-$today.all.masked.nextclade.pangolin.pb \
     > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
 gzip -c lineageToPublicName > $archive/lineageToPublicName.tsv.gz
 gzip -c cladeToPublicName > $archive/cladeToPublicName.tsv.gz
 ln -f `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
 ln -f `pwd`/public-$today.all.masked.taxodium.pb.gz $archive/
+ln -f `pwd`/public-$today.all.masked.taxonium.jsonl.gz $archive/
 
 # Update 'latest' in $archiveRoot
 ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
 ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
 ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
 ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
 ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
 ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
 ln -f $archive/public-$today.all.masked.taxodium.pb.gz \
     $archiveRoot/public-latest.all.masked.taxodium.pb.gz
+ln -f $archive/public-$today.all.masked.taxonium.jsonl.gz \
+    $archiveRoot/public-latest.all.masked.taxonium.jsonl.gz
 
 # Update hgdownload-test link for archive
 mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 
 # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb
     ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt
 done
+
+# Update MSA and make tree version with only MSA sequences
+awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' samples.public.$today \
+| sort > idToName.public
+time cat <(faSomeRecords <(xzcat $ottoDir/$prevDate/public-$prevDate.all.aligned.fa.xz) \
+                         <(cut -f 1 idToName.public) stdout) \
+         <(faSomeRecords new.aligned.fa <(cut -f 1 idToName.public) stdout) \
+| xz -T 30 \
+    > public-$today.all.aligned.fa.xz
+time faRenameRecords <(xzcat public-$today.all.aligned.fa.xz) idToName.public stdout \
+| faUniqify stdin stdout \
+| xz -T 30 \
+    > public-$today.all.msa.fa.xz
+fastaNames public-$today.all.msa.fa.xz | sort > msaFaNames
+wc -l msaFaNames
+$matUtils extract -i public-$today.all.masked.pb -s msaFaNames \
+    -o public-$today.all.masked.msa.pb
+$matUtils extract -i public-$today.all.masked.msa.pb -u samples.public.msa
+cmp msaFaNames <(sort samples.public.msa)
+pigz -f -p 8 public-$today.all.masked.msa.pb
+$matUtils extract -i public-$today.all.masked.msa.pb.gz -t public-$today.all.masked.msa.nwk
+pigz -f -p 8 public-$today.all.masked.msa.nwk
+archiveRoot=/hive/users/angie/publicTrees
+read y m d < <(echo $today | sed -re 's/-/ /g')
+archive=$archiveRoot/$y/$m/$d
+ln -f $(pwd)/public-$today.all.masked.msa.pb.gz $archive/
+ln -f $(pwd)/public-$today.all.msa.fa.xz $archive/
+ln -f $(pwd)/public-$today.all.masked.msa.nwk.gz $archive/
+ln -f $(pwd)/public-$today.all.masked.msa.pb.gz $archiveRoot/public-latest.all.masked.msa.pb.gz
+ln -f $(pwd)/public-$today.all.msa.fa.xz $archiveRoot/public-latest.all.msa.fa.xz
+ln -f $(pwd)/public-$today.all.masked.msa.nwk.gz $archiveRoot/public-latest.masked.msa.nwk.gz