src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 4efa4aebadc39cb64a256c3489035b00bd2abdc6

4efa4aebadc39cb64a256c3489035b00bd2abdc6
angie
  Wed Sep 8 12:06:50 2021 -0700
Remove dependency on reference.fa from aligning new sequences; add tree-annotated Nextstrain clade and Pango lineage to metadata.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index fee47a9..f867a2c 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -1,47 +1,47 @@
 #!/bin/bash
 set -beEu -x -o pipefail
 
 #	Do not modify this script, modify the source tree copy:
 #	kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
 
 usage() {
-    echo "usage: $0 prevDate problematicSitesVcf"
+    echo "usage: $0 prevDate today problematicSitesVcf"
     echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
 }
 
-if [ $# != 2 ]; then
+if [ $# != 3 ]; then
   usage
   exit 1
 fi
 
 prevDate=$1
-problematicSitesVcf=$2
+today=$2
+problematicSitesVcf=$3
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
-today=$(date +%F)
-
+mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
 prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 renaming=oldAndNewNames
 
 if [ ! -s new.masked.vcf.gz ]; then
 
 # Make lists of sequences already in the tree.
 $matUtils extract -i $prevProtobufMasked -u prevNames
@@ -298,38 +298,39 @@
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
 
 # Exclude sequences with a very high number of EPPs from future runs
 grep ^Current usher.addNew.log \
 | awk '$16 >= 10 {print $8;}' \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     >> ../tooManyEpps.ids
 
 # Prune samples with too many private mutations and internal branches that are too long.
 $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
     -b 30 \
     -O -o gisaidAndPublic.$today.masked.pb
 
+$matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
+
 # Metadata for hgPhyloPlace:
 # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
 echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
     > gisaidAndPublic.$today.metadata.tsv
 # It's not always possible to recreate both old and new names correctly from metadata,
 # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
-cut -f 2 $renaming \
-| awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
+awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' samples.$today \
 | sort \
     > idToName
 # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
            s@Northern Ireland/@NorthernIreland/@;' \
     > tmp
 # NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
 grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
 | sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \
 | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
 | cleanGenbank \
 | sort tmp - > gb.metadata
@@ -375,41 +376,40 @@
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 # CNCB metadata:
 tail -n+2 $cncbDir/cncb.metadata.tsv \
 | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
             print $2, "", $10, $11, $9, $5, $6} }' \
 | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 zcat $gisaidDir/metadata_batch_$today.tsv.gz \
-| grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \
+| grep -Fwf <(grep EPI_ISL samples.$today | cut -d\| -f 2) \
 | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 gzip -f gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
-$matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 if [ -s $ottoDir/$prevDate/cladeToName ]; then
     # Use yesterday's clade assignments to annotate clades on today's tree
     time $matUtils annotate -T 50 \
         -l \
         -i gisaidAndPublic.$today.masked.pb \
         -c $ottoDir/$prevDate/cladeToName \
         -f 0.95 \
         -D details.nextclade \
         -o gisaidAndPublic.$today.masked.nextclade.pb \
@@ -439,40 +439,55 @@
 fi
 
 # Replace protobuf with annotated protobuf.
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # Save clade & lineage annotations for use tomorrow.
 $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades
 tail -n+2 sample-clades \
 | tawk '{print $2, $1;}' \
 | sort > cladeToName
 tail -n+2 sample-clades \
 | tawk '{print $3, $1;}' \
 | sort > lineageToName
 
+# Add clade & lineage from tree to metadata.
+zcat gisaidAndPublic.$today.metadata.tsv.gz \
+| tail -n+2 \
+| sort > tmp1
+tail -n+2 sample-clades \
+| sort > tmp2
+paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | head -1) \
+      <(echo -e "Nextstrain_clade_usher\tpango_lineage_usher") \
+    > gisaidAndPublic.$today.metadata.tsv
+join -t$'\t' tmp1 tmp2 \
+    >> gisaidAndPublic.$today.metadata.tsv
+gzip -f gisaidAndPublic.$today.metadata.tsv
+rm tmp1 tmp2
+
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
 done
 
 # Make Taxodium-formatted protobuf for display
 zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz > ncbiGenes.gtf
+zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa
 zcat gisaidAndPublic.$today.metadata.tsv.gz > metadata.tmp.tsv
 time $matUtils extract -i gisaidAndPublic.$today.masked.pb \
-    -f reference.fa \
+    -f wuhCor1.fa \
     -g ncbiGenes.gtf \
     -M metadata.tmp.tsv \
     --write-taxodium gisaidAndPublic.$today.masked.taxodium.pb
-rm metadata.tmp.tsv
+rm metadata.tmp.tsv wuhCor1.fa
 gzip gisaidAndPublic.$today.masked.taxodium.pb
 
 $scriptDir/extractPublicTree.sh $today