src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh ea4eb88f661eb1d288f61981093d402830496603

ea4eb88f661eb1d288f61981093d402830496603
angie
  Mon Mar 1 01:01:43 2021 -0800
matToVcf has been replaced by 'matUtils convert'.  Also, use 'matUtils annotate's new -l option to clear old clade annotations before reassigning them on the updated protobuf.

diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
index 8adb374..b108dfe 100755
--- src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
@@ -26,31 +26,31 @@
 echo "prevProtobufMasked=$prevProtobufMasked"
 echo "prevProtobufUnmasked=$prevProtobufUnmasked"
 echo "prevMeta=$prevMeta"
 echo "problematicSitesVcf=$problematicSitesVcf"
 
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 today=$(date +%F)
 
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
-matToVcf=$usherDir/build/matToVcf
+matUtils=$usherDir/build/matUtils
 find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 
 source $scriptDir/util.sh
 
 # Before we get started, make sure cog_metadata has the columns we're expecting:
 expectedHeaderStart='sequence_name,country,adm1,pillar_2,sample_date,epi_week,lineage,'
 cogUkMetHeader=$(head -1 $cogUkDir/cog_metadata.csv | grep ^$expectedHeaderStart)
 # The grep will fail if they change one of the first 7 fields.
 
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
 # Disregard pipefail just for this pipe because head prevents the cat from completing:
@@ -136,51 +136,51 @@
 wc -l $renaming
 
 # Make VCF -- first without masking, for hgTracks
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | gzip -c \
     > new.vcf.gz
 # Use usher to add new public sequences to tree of mapped subset and infer missing/ambig bases.
 time $usher -u -T 50 \
     -v new.vcf.gz \
     -i $prevProtobufUnmasked \
     -o public-$today.all.notMasked.pb \
     >& usher.addNewUnmasked.log
 #~10 hours for 56k seqs, ~72m for 6k
-$matToVcf -i public-$today.all.notMasked.pb -v public-$today.all.vcf
+$matUtils convert -i public-$today.all.notMasked.pb -v public-$today.all.vcf
 ls -l public-$today.all.vcf
 wc -l public-$today.all.vcf
 bgzip -f public-$today.all.vcf
 tabix -p vcf public-$today.all.vcf.gz
 
 # Then with masking to collapse the tree & make protobuf for usher/hgPhyloPlace:
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 time vcfFilter -excludeVcf=mask.vcf new.vcf.gz \
 | gzip -c \
     > new.masked.vcf.gz
 time $usher -u -T 50 \
     -v new.masked.vcf.gz \
     -i $prevProtobufMasked \
     -o public-$today.all.masked.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh public-$today.all.nwk
 # Masked VCF that goes with the masked protobuf, for public distribution for folks who want
 # to run UShER and also have the VCF.
-$matToVcf -i public-$today.all.masked.pb -v public-$today.all.masked.vcf
+$matUtils convert -i public-$today.all.masked.pb -v public-$today.all.masked.vcf
 gzip public-$today.all.masked.vcf
 
 # Make allele-frequency-filtered versions
 # Disregard pipefail just for this pipe because head prevents the cat from completing:
 set +o pipefail
 sampleCount=$(zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' \
               | tail -n+10 | wc -l)
 set -o pipefail
 minAc001=$(( (($sampleCount + 999) / 1000) ))
 vcfFilter -minAc=$minAc001 -rename public-$today.all.vcf.gz \
     > public-$today.all.minAf.001.vcf
 wc -l public-$today.all.minAf.001.vcf
 bgzip -f public-$today.all.minAf.001.vcf
 tabix -p vcf public-$today.all.minAf.001.vcf.gz
 
@@ -299,41 +299,42 @@
         > version.txt
 fi
 
 sampleCountComma=$(echo $sampleCount \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
 cp -p public-$today.all.masked.pb{,.bak}
 
 # Add nextclade annotations to protobuf
 zcat public-$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
 | sed -re 's/^20E \(EU1\)/20E.EU1/;' \
     > cladeToPublicName
-time ~/github/usher/build/matUtils annotate -T 50 \
+time $matUtils annotate -T 50 \
+    -l \
     -i public-$today.all.masked.pb \
     -c cladeToPublicName \
     -o public-$today.all.masked.nextclade.pb \
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf
 zcat public-$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$9 != "" {print $9, $1;}' \
     > lineageToPublicName
-time ~/github/usher/build/matUtils annotate -T 50 \
+time $matUtils annotate -T 50 \
     -i public-$today.all.masked.nextclade.pb \
     -c lineageToPublicName \
     -o public-$today.all.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 
 # Not all the Pangolin lineages can be assigned nodes so for now just use nextclade
 cp -p public-$today.all.masked.nextclade.pb public-$today.all.masked.pb
 
 # Update gbdb links -- not every day, too much churn for getting releases out and the
 # tracks are getting unmanageably large for VCF.
 if false; then
 for f in public-$today.all{,.minAf*}.vcf.gz ; do
     t=$(echo $f | sed -re "s/-$today//;")
     ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t
     ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi