src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 88c5c33485279c9687d7be15a85da2665c16e91d

88c5c33485279c9687d7be15a85da2665c16e91d
angie
  Wed Jun 16 17:52:44 2021 -0700
Lots of work from the past month.  De-dup COG-UK & GenBank; generate public tree from combined gisaidAndPublic tree.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 64b644c..acd9fa0 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -17,69 +17,176 @@
 prevDate=$1
 problematicSitesVcf=$2
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
+y=$(date +%Y)
+m=$(date +%m)
+d=$(date +%d)
+
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 # Make lists of sequences already in the tree.
-# matUtils creates full paths for output files, defaulting to current directory, so can't
-# output to named pipe because it becomes $cwd//dev/fd/63 ...
-$matUtils summary -i $prevProtobufMasked -s prevSamples
-tail -n+2 prevSamples | cut -f 1 > prevNames
+$matUtils extract -i $prevProtobufMasked -u prevNames
+
+# Before updating the tree with new sequences, update the names used in the tree:
+# * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
+# * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
+#   them -- no need to match country & year which are sometimes incorrect at first.
+grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
+| sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
+| sort \
+    > cogUkInGenBankIsolateToNewName
+fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names
+grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \
+    > cogUkInGenBank
+set +o pipefail
+# From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt
+# to check.
+grep COG-UK/ prevNames \
+| awk -F\| '{print $1 "\t" $0;}' \
+| sed -re 's@COG-UK/@@;' \
+| sort \
+    > cogUkInGenBankIsolateToPrevName
+set -o pipefail
+join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
+| cut -f 2,3 \
+    > prevTree.renaming
+# Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed.
+# Unfortunately for now we are not getting those from $epiToPublic.
+grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \
+| awk -F\| '$3 == ""' \
+| awk -F/ '{print $2 "\t" $0;}' \
+| sort \
+    > cogUkInGenBankIsolateToPrevName
+join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
+| cut -f 2,3 \
+    >> prevTree.renaming
+# Look for names with EPI_IDs that have just today been mapped to public sequences.
+grep EPI_ISL_ prevNames \
+| awk -F\| '{print $2 "\t" $0;}' \
+| sort \
+    > epiToPrevName
+set +o pipefail
+grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \
+| grep -v COG-UK/ \
+| tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \
+| sort \
+    > epiToNewName
+set -o pipefail
+# Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the
+# tree both with and without EPI ID... so renaming makes a name conflict.
+# If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't
+# cause conflict.
+comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree
+join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \
+| cut -f 2,3 \
+    >> prevTree.renaming
+cp /dev/null dupsToRemove
+if [ -s epiToNewNameAlreadyInTree ]; then
+    grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \
+    | grep EPI_ISL \
+        >> dupsToRemove
+fi
+# And finally, sometimes there are duplicates due to country or date being corrected in COG-UK
+# metadata.
+set +o pipefail
+grep -vFwf dupsToRemove prevTree.renaming \
+| cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \
+| cut -d/ -f 2 \
+    > cogUkIsolateStillDup
+grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \
+| awk -F, '{print $1 "|" $5;}' \
+    > cogDupsLatestMetadata
+grep -Fwf cogUkIsolateStillDup prevNames \
+| grep -vFwf dupsToRemove \
+| grep -vFwf cogDupsLatestMetadata \
+| cat \
+    >> dupsToRemove
+set -o pipefail
+startingProtobuf=$prevProtobufMasked
+if [ -s dupsToRemove ]; then
+    startingProtobuf=prevDupTrimmed.pb
+    $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf
+fi
+
+if [ -s prevTree.renaming ]; then
+    $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out
+    $matUtils extract -i prevRenamed.pb -u prevNames
+else
+    ln -sf $startingProtobuf prevRenamed.pb
+fi
+
+# OK, now that the tree names are updated, figure out which seqs are already in there and
+# which need to be added.
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
-awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
-| grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk
+grep -E '^(England|Northern|Scotland|Wales)' prevNames \
+| cut -d\| -f1 > prevCogUk
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^EPI_ISL_' > prevGisaid
 # Add public sequences that have been mapped to GISAID sequences to prevGisaid.
 grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
 grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
 wc -l prev*
 
 # Get new GenBank sequences with at least $minReal non-N bases.
-#*** TODO: exclude seqs already in tree in their GISAID forms
+# Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic.
+set +o pipefail
+egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
+| grep -Fwf prevGisaid - \
+| grep -vFwf prevGbAcc \
+| cat \
+    >> prevGbAcc
+set -o pipefail
 xzcat $ncbiDir/genbank.fa.xz \
 | faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
 faSize -veryDetailed newGenBank.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gbTooSmall
 # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
+set +o pipefail
 fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
+set -o pipefail
 faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
 faSize newGenBank.filtered.fa
 
 # Get new COG-UK sequences with at least $minReal non-N bases.
+# Also exclude cog_all.fasta sequences not found in cog_metadata.csv.
+comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \
+    <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \
+    > cogFaNotMeta
+# Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above).
 xzcat $cogUkDir/cog_all.fasta.xz \
-| faSomeRecords -exclude stdin prevCogUk newCogUk.fa
+| faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa
 faSize -veryDetailed newCogUk.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > cogUkTooSmall
 faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
 faSize newCogUk.filtered.fa
 
 # Get new GISAID sequences with at least $minReal non-N bases.
 xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
 | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
 | faSomeRecords -exclude stdin prevGisaid newGisaid.fa
 faSize -veryDetailed newGisaid.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gisaidTooSmall
 faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
 faSize newGisaid.filtered.fa
@@ -99,166 +206,328 @@
 cat new*.filtered.fa > new.fa
 faSize new.fa
 
 # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
 # as a multifasta alignment (from which we can extract VCF with SNVs):
 #conda install -c bioconda mafft
 alignedFa=new.aligned.fa
 rm -f $alignedFa
 export TMPDIR=/dev/shm
 time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
   -i new.fa \
   -o $alignedFa \
   -t 50
 faSize $alignedFa
 
-#*** TODO: if EPIs have recently been mapped to public sequences, we should convert their
-#*** tree names from EPI to public.
-
-# Now make a renaming that keeps all the prevNames and adds full names (with shortened dates)
-# for the new seqs.
+# Now make a renaming that keeps all the prevNames and adds full names for the new seqs.
 renaming=oldAndNewNames
 tawk '{print $1, $1;}' prevNames > $renaming
 if [ -s newCogUk.filtered.fa ]; then
     # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained.
     set +o pipefail
     fastaNames newCogUk.filtered.fa \
     | grep -Fwf - $cogUkDir/cog_metadata.csv \
     | awk -F, '{print $1 "\t" $1 "|" $5;}' \
-    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGenBank.filtered.fa ]; then
+    # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year
+    set +o pipefail
+    fastaNames newGenBank.filtered.fa \
+    | grep COG-UK/ \
+    | sed -re 's/[ |].*//' \
+    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
+    | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
+        >> $renaming
     fastaNames newGenBank.filtered.fa \
+    | grep -v COG-UK/ \
     | sed -re 's/[ |].*//' \
     | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | tawk '{ if ($3 == "") { $3 = "?"; }
               if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
               else { print $1 "\t" $1 "|" $3; } }' \
     | cleanGenbank \
     | sed -re 's/ /_/g' \
-    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
         >> $renaming
+    set -o pipefail
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
-# Remove old tree's GISAID items that have been mapped to public sequences so we don't have dups
-set +o pipefail
-cut -f 1 $epiToPublic \
-| grep -Fwf - prevNames \
-| cat \
-    > mappedGisaidToRemove
-set -o pipefail
-wc -l mappedGisaidToRemove
-
-if [ -s mappedGisaidToRemove ]; then
-    $matUtils extract -i $prevProtobufMasked \
-        -s mappedGisaidToRemove \
-        -p \
-        -o prevTrimmed.pb
-else
-    cp -p $prevProtobufMasked prevTrimmed.pb
-fi
-
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 # Add masked VCF to previous protobuf
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
 | gzip -c \
     > new.masked.vcf.gz
 time $usher -u -T 50 \
+    -A \
+    -e 5 \
     -v new.masked.vcf.gz \
-    -i prevTrimmed.pb \
-    -o gisaidAndPublic.$today.masked.pb \
+    -i prevRenamed.pb \
+    -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
-mv uncondensed-final-tree.nh gisaidAndPublic.$today.nwk
+mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
+
+# Prune samples with too many private mutations and internal branches that are too long.
+$matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
+    -a 20 \
+    -b 30 \
+    -O -o gisaidAndPublic.$today.masked.pb
 
-# Metadata for hgPhyloPlace: for now, start with already-built public metadata.
-zcat public-$today.metadata.tsv.gz > gisaidAndPublic.$today.metadata.tsv
+# Metadata for hgPhyloPlace:
+# Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
+echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
+    > gisaidAndPublic.$today.metadata.tsv
+# It's not always possible to recreate both old and new names correctly from metadata,
+# so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
+cut -f 2 $renaming \
+| awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
+| sort \
+    > idToName
+# NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
+grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \
+| sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
+           s@Northern Ireland/@NorthernIreland/@;' \
+    > tmp
+# NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
+grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
+| sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \
+| perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
+| cleanGenbank \
+| sort tmp - > gb.metadata
+if [ -e $ncbiDir/lineage_report.csv ]; then
+    echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
+    tail -n+2  $ncbiDir/lineage_report.csv \
+    | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
+    | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
+    | sort \
+        > gbToLineage
+else
+    echo Getting GenBank Pangolin lineages from $prevMeta
+    zcat $prevMeta \
+    | tail -n+2 \
+    | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
+    | sort \
+        > gbToLineage
+fi
+wc -l gbToLineage
+if [ -e $ncbiDir/nextclade.tsv ]; then
+    sort $ncbiDir/nextclade.tsv > gbToNextclade
+else
+    touch gbToNextclade
+fi
+wc -l gbToNextclade
+join -t$'\t' -a 1 gb.metadata gbToNextclade \
+| join -t$'\t' -a 1 - gbToLineage \
+| tawk '{ if ($2 == "") { $2 = "?"; }
+          print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \
+| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+    >> gisaidAndPublic.$today.metadata.tsv
+# COG-UK metadata:
+if [ -e $cogUkDir/nextclade.tsv ]; then
+    sort $cogUkDir/nextclade.tsv > cogUkToNextclade
+else
+    touch cogUkToNextclade
+fi
+#*** Could also add sequence length to metadata from faSizes output...
+tail -n+2 $cogUkDir/cog_metadata.csv \
+| awk -F, -v 'OFS=\t' '{print $1, "", $5, $2, "", "", "", $7; }' \
+| sort \
+| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
+| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+    >> gisaidAndPublic.$today.metadata.tsv
+# CNCB metadata:
+tail -n+2 $cncbDir/cncb.metadata.tsv \
+| tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
+            print $2, "", $10, $11, $9, $5, $6} }' \
+| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
+| sort \
+| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
+| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
+| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+    >> gisaidAndPublic.$today.metadata.tsv
+wc -l gisaidAndPublic.$today.metadata.tsv
 zcat $gisaidDir/metadata_batch_$today.tsv.gz \
 | grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \
 | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 gzip gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
 | sed -re 's/^20E \(EU1\)/20E.EU1/;' \
     > cladeToName
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -c cladeToName \
-    -m mutations.nextclade \
+    -u mutations.nextclade \
     -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
 # convert EPI IDs to public to match tree IDs.
-tail -n+2 ~angie/github/pangoLEARN/pangoLEARN/data/lineages.metadata.csv \
-| awk -F, '{print $7 "\t" $2;}' \
+tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \
+| grep -vFwf $ottoDir/clades.blackList \
+| awk -F, '{print $9 "\t" $2;}' \
 | sort > epiExemplarToLineage
 subColumn -miss=/dev/null 1 epiExemplarToLineage \
-    <(cut -f 1,2 $gisaidDir/epiToPublicAndDate.$today) stdout \
+    <(cut -f 1,2 $epiToPublic) stdout \
 | sort > idExemplarToLineage
-cut -f 2 $renaming | grep -Fwf <(cut -f 1 idExemplarToLineage) \
+grep -Fwf <(cut -f 1 idExemplarToLineage) samples.$today \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
 | sort > idExemplarToName
 join -t$'\t' idExemplarToName idExemplarToLineage \
 | tawk '{print $3, $2;}' \
 | sort > lineageToName
-linFile=lineageToName
-
-# Until pangoLEARN training is done, use pango-designation file for now.
-#linFile=/hive/users/angie/matLineages/linToPublicPlusGisaidName.2021-03-16
+# $epiToPublic maps some cogUkInGenBank EPI IDs to their COG-UK names not GenBank IDs unfortunately
+# so some of those don't match between idExemplarToLineage and idExemplarToName.  Find missing
+# sequences and try a different means of adding those.
+comm -13 <( cut -f 1 idExemplarToName | sort) <(cut -f 1 idExemplarToLineage| sort) \
+| grep -Fwf - ~angie/github/pango-designation/lineages.metadata.csv \
+| sed -re 's/Northern_/Northern/;' \
+| awk -F, '{print $1 "\t" $2;}' \
+| sort > exemplarNameNotFoundToLineage
+grep -Fwf <(cut -f 1 exemplarNameNotFoundToLineage) samples.$today \
+| awk -F\| '{print $1 "\t" $0;}' \
+| sort > exemplarNameNotFoundToFullName
+join -t$'\t' exemplarNameNotFoundToLineage exemplarNameNotFoundToFullName \
+| cut -f 2,3 \
+| sort -u lineageToName - ../lineageToName.newLineages > tmp
+mv tmp lineageToName
 
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
-    -c $linFile \
-    -m mutations.pangolin \
+    -c lineageToName \
+    -u mutations.pangolin \
     -D details.pangolin \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
-# Extract public samples from tree
-$matUtils summary -i gisaidAndPublic.$today.masked.pb -s newSamples
-tail -n+2 newSamples | cut -f 1 > newNames
-grep -v EPI_ISL_ newNames > newPublicNames
-$matUtils extract -i gisaidAndPublic.$today.masked.pb \
-    -s newPublicNames \
-    -O -o public.$today.masked.pb
+# EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
+# public names & IDs, we can match them.
+cut -f 1,3 $epiToPublic > epiToPublic.latest
 
+# Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
+    ln -sf `pwd`/epiToPublic.latest $dir/
 done
 
+# Extract public samples from tree
+$matUtils extract -i gisaidAndPublic.$today.masked.pb -u newNames
+grep -v EPI_ISL_ newNames > newPublicNames
+$matUtils extract -i gisaidAndPublic.$today.masked.pb \
+    -s newPublicNames \
+    -O -o public-$today.all.masked.pb
+# Extract Newick and VCF from public-only tree
+$matUtils extract -i public-$today.all.masked.pb \
+    -t public-$today.all.nwk \
+    -v public-$today.all.masked.vcf
+gzip -f public-$today.all.masked.vcf
+zcat gisaidAndPublic.$today.metadata.tsv.gz \
+| grep -v EPI_ISL_ \
+| gzip -c \
+    > public-$today.metadata.tsv.gz
+
+grep -v EPI_ISL_ cladeToName > cladeToPublicName
+grep -v EPI_ISL_ lineageToName > lineageToPublicName
+
+# Add nextclade annotations to public protobuf
+time $matUtils annotate -T 50 \
+    -l \
+    -i public-$today.all.masked.pb \
+    -c cladeToPublicName \
+    -u mutations.nextclade.public \
+    -D details.nextclade.public \
+    -o public-$today.all.masked.nextclade.pb \
+    >& annotate.nextclade.public.out
+
+# Add pangolin lineage annotations to public protobuf
+time $matUtils annotate -T 50 \
+    -i public-$today.all.masked.nextclade.pb \
+    -c lineageToPublicName \
+    -u mutations.pangolin.public \
+    -D details.pangolin.public \
+    -o public-$today.all.masked.nextclade.pangolin.pb \
+    >& annotate.pangolin.public.out
+
+# Not all the Pangolin lineages can be assigned nodes so for now just use nextclade
+rm public-$today.all.masked.pb
+ln public-$today.all.masked.nextclade.pb public-$today.all.masked.pb
+
+cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
+echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
+    > version.txt
+
+$matUtils extract -i public-$today.all.masked.pb -u samples.public.$today
+sampleCountComma=$(echo $(wc -l < samples.public.$today) \
+                   | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
+echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
+    > hgPhyloPlace.description.txt
+
+
+# Link to public trees download directory hierarchy
+archiveRoot=/hive/users/angie/publicTrees
+archive=$archiveRoot/$y/$m/$d
+mkdir -p $archive
+gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
+ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
+gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
+ln `pwd`/public-$today.metadata.tsv.gz $archive/
+gzip -c public-$today.all.masked.nextclade.pangolin.pb \
+    > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
+gzip -c cladeToPublicName > $archive/cladeToPublicName.gz
+gzip -c lineageToPublicName > $archive/lineageToPublicName.gz
+ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
 
+# Update 'latest' in $archiveRoot
+ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
+ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
+ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
+ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
+ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
+ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
+
+# Update hgdownload-test link for archive
+mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
+ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
+
+# Update links to latest public protobuf and metadata in hgwdev cgi-bin directories
+for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
+    ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb
+    ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz
+    ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt
+done