src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 83f9e814d5441d0ba86d5fa735aed67d264a771f

83f9e814d5441d0ba86d5fa735aed67d264a771f
angie
  Sun Jun 27 20:27:52 2021 -0700
Skip building new.masked.vcf.gz if it already exists (go straight to usher).  More prevention of empty-grep crashes.  Use faToVcf new -maxDiff option to prevent memory usage blowup for sequences that are so diverged/erroneous that we won't keep them anyway.  Exclude sequences with 10 or more equally parsimonious placements so we don't keep trying in vain (and wasting time).  Handle new nextstrain clade names with spaces etc.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index acd9fa0..63bbdc5 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -1,533 +1,554 @@
 #!/bin/bash
 set -beEu -x -o pipefail
 
 #	Do not modify this script, modify the source tree copy:
 #	kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
 
 usage() {
     echo "usage: $0 prevDate problematicSitesVcf"
     echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
 }
 
 if [ $# != 2 ]; then
   usage
   exit 1
 fi
 
 prevDate=$1
 problematicSitesVcf=$2
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 y=$(date +%Y)
 m=$(date +%m)
 d=$(date +%d)
 
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
+renaming=oldAndNewNames
+
+if [ ! -s new.masked.vcf.gz ]; then
+
 # Make lists of sequences already in the tree.
 $matUtils extract -i $prevProtobufMasked -u prevNames
 
 # Before updating the tree with new sequences, update the names used in the tree:
 # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
 # * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
 #   them -- no need to match country & year which are sometimes incorrect at first.
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
 | sort \
     > cogUkInGenBankIsolateToNewName
 fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names
 grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \
     > cogUkInGenBank
 set +o pipefail
 # From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt
 # to check.
 grep COG-UK/ prevNames \
 | awk -F\| '{print $1 "\t" $0;}' \
 | sed -re 's@COG-UK/@@;' \
 | sort \
     > cogUkInGenBankIsolateToPrevName
 set -o pipefail
 join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
 | cut -f 2,3 \
     > prevTree.renaming
 # Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed.
 # Unfortunately for now we are not getting those from $epiToPublic.
 grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \
 | awk -F\| '$3 == ""' \
 | awk -F/ '{print $2 "\t" $0;}' \
 | sort \
     > cogUkInGenBankIsolateToPrevName
 join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
 | cut -f 2,3 \
     >> prevTree.renaming
 # Look for names with EPI_IDs that have just today been mapped to public sequences.
 grep EPI_ISL_ prevNames \
 | awk -F\| '{print $2 "\t" $0;}' \
 | sort \
     > epiToPrevName
 set +o pipefail
 grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \
 | grep -v COG-UK/ \
 | tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \
 | sort \
     > epiToNewName
 set -o pipefail
 # Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the
 # tree both with and without EPI ID... so renaming makes a name conflict.
 # If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't
 # cause conflict.
 comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree
 join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \
 | cut -f 2,3 \
     >> prevTree.renaming
 cp /dev/null dupsToRemove
 if [ -s epiToNewNameAlreadyInTree ]; then
+    set +o pipefail
     grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \
     | grep EPI_ISL \
+    | cat \
         >> dupsToRemove
+    set -o pipefail
 fi
+# Don't rename if the final name is already in the tree; remove the dup with the old name.
+set +o pipefail
+cut -f 2 prevTree.renaming \
+| grep -Fwf - prevNames | cat \
+    > alreadyThere
+grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \
+    >> dupsToRemove
 # And finally, sometimes there are duplicates due to country or date being corrected in COG-UK
 # metadata.
-set +o pipefail
 grep -vFwf dupsToRemove prevTree.renaming \
 | cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \
 | cut -d/ -f 2 \
     > cogUkIsolateStillDup
 grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \
 | awk -F, '{print $1 "|" $5;}' \
     > cogDupsLatestMetadata
 grep -Fwf cogUkIsolateStillDup prevNames \
 | grep -vFwf dupsToRemove \
 | grep -vFwf cogDupsLatestMetadata \
 | cat \
     >> dupsToRemove
 set -o pipefail
 startingProtobuf=$prevProtobufMasked
 if [ -s dupsToRemove ]; then
     startingProtobuf=prevDupTrimmed.pb
     $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf
 fi
 
 if [ -s prevTree.renaming ]; then
     $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out
     $matUtils extract -i prevRenamed.pb -u prevNames
 else
     ln -sf $startingProtobuf prevRenamed.pb
 fi
 
 # OK, now that the tree names are updated, figure out which seqs are already in there and
 # which need to be added.
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
 grep -E '^(England|Northern|Scotland|Wales)' prevNames \
 | cut -d\| -f1 > prevCogUk
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^EPI_ISL_' > prevGisaid
 # Add public sequences that have been mapped to GISAID sequences to prevGisaid.
 grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
 grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
 wc -l prev*
 
 # Get new GenBank sequences with at least $minReal non-N bases.
 # Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic.
 set +o pipefail
 egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
 | grep -Fwf prevGisaid - \
 | grep -vFwf prevGbAcc \
 | cat \
     >> prevGbAcc
 set -o pipefail
 xzcat $ncbiDir/genbank.fa.xz \
 | faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
 faSize -veryDetailed newGenBank.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gbTooSmall
 # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
 set +o pipefail
 fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
 set -o pipefail
 faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
 faSize newGenBank.filtered.fa
 
 # Get new COG-UK sequences with at least $minReal non-N bases.
 # Also exclude cog_all.fasta sequences not found in cog_metadata.csv.
 comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \
     <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \
     > cogFaNotMeta
 # Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above).
 xzcat $cogUkDir/cog_all.fasta.xz \
 | faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa
 faSize -veryDetailed newCogUk.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > cogUkTooSmall
 faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
 faSize newCogUk.filtered.fa
 
 # Get new GISAID sequences with at least $minReal non-N bases.
 xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
 | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
 | faSomeRecords -exclude stdin prevGisaid newGisaid.fa
 faSize -veryDetailed newGisaid.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gisaidTooSmall
 faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
 faSize newGisaid.filtered.fa
 
 # Exclude public-mapped sequences from newGisaid:
 set +o pipefail
 fastaNames newGisaid.filtered.fa \
 | grep -Fwf - $epiToPublic \
 | cut -f 1 \
     > newGisaid.public.names
 set -o pipefail
 if [ -s newGisaid.public.names ]; then
     faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp
     mv tmp newGisaid.filtered.fa
     faSize newGisaid.filtered.fa
 fi
 cat new*.filtered.fa > new.fa
 faSize new.fa
 
 # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
 # as a multifasta alignment (from which we can extract VCF with SNVs):
 #conda install -c bioconda mafft
 alignedFa=new.aligned.fa
 rm -f $alignedFa
 export TMPDIR=/dev/shm
 time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
   -i new.fa \
   -o $alignedFa \
   -t 50
 faSize $alignedFa
 
 # Now make a renaming that keeps all the prevNames and adds full names for the new seqs.
-renaming=oldAndNewNames
 tawk '{print $1, $1;}' prevNames > $renaming
 if [ -s newCogUk.filtered.fa ]; then
     # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained.
     set +o pipefail
     fastaNames newCogUk.filtered.fa \
     | grep -Fwf - $cogUkDir/cog_metadata.csv \
     | awk -F, '{print $1 "\t" $1 "|" $5;}' \
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGenBank.filtered.fa ]; then
     # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year
     set +o pipefail
     fastaNames newGenBank.filtered.fa \
     | grep COG-UK/ \
     | sed -re 's/[ |].*//' \
     | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
     | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
         >> $renaming
     fastaNames newGenBank.filtered.fa \
     | grep -v COG-UK/ \
     | sed -re 's/[ |].*//' \
     | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | tawk '{ if ($3 == "") { $3 = "?"; }
               if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
               else { print $1 "\t" $1 "|" $3; } }' \
     | cleanGenbank \
     | sed -re 's/ /_/g' \
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 # Add masked VCF to previous protobuf
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
-| faToVcf stdin stdout \
+| faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
 | gzip -c \
     > new.masked.vcf.gz
-time $usher -u -T 50 \
+
+fi # if [ ! -s new.masked.vcf.gz ]
+
+time $usher -u -T 80 \
     -A \
     -e 5 \
     -v new.masked.vcf.gz \
     -i prevRenamed.pb \
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
 
+# Exclude sequences with a very high number of EPPs from future runs
+grep ^Current usher.addNew.log \
+| awk '$16 >= 10 {print $8;}' \
+| awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
+    >> ../tooManyEpps.ids
+
 # Prune samples with too many private mutations and internal branches that are too long.
 $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
     -a 20 \
     -b 30 \
     -O -o gisaidAndPublic.$today.masked.pb
 
 # Metadata for hgPhyloPlace:
 # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
 echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
     > gisaidAndPublic.$today.metadata.tsv
 # It's not always possible to recreate both old and new names correctly from metadata,
 # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
 cut -f 2 $renaming \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
 | sort \
     > idToName
 # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
            s@Northern Ireland/@NorthernIreland/@;' \
     > tmp
 # NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
 grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
 | sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \
 | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
 | cleanGenbank \
 | sort tmp - > gb.metadata
 if [ -e $ncbiDir/lineage_report.csv ]; then
     echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
     tail -n+2  $ncbiDir/lineage_report.csv \
     | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
     | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
     | sort \
         > gbToLineage
 else
     echo Getting GenBank Pangolin lineages from $prevMeta
     zcat $prevMeta \
     | tail -n+2 \
     | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
     | sort \
         > gbToLineage
 fi
 wc -l gbToLineage
 if [ -e $ncbiDir/nextclade.tsv ]; then
     sort $ncbiDir/nextclade.tsv > gbToNextclade
 else
     touch gbToNextclade
 fi
 wc -l gbToNextclade
 join -t$'\t' -a 1 gb.metadata gbToNextclade \
 | join -t$'\t' -a 1 - gbToLineage \
 | tawk '{ if ($2 == "") { $2 = "?"; }
           print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 # COG-UK metadata:
 if [ -e $cogUkDir/nextclade.tsv ]; then
     sort $cogUkDir/nextclade.tsv > cogUkToNextclade
 else
     touch cogUkToNextclade
 fi
 #*** Could also add sequence length to metadata from faSizes output...
 tail -n+2 $cogUkDir/cog_metadata.csv \
 | awk -F, -v 'OFS=\t' '{print $1, "", $5, $2, "", "", "", $7; }' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 # CNCB metadata:
 tail -n+2 $cncbDir/cncb.metadata.tsv \
 | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
             print $2, "", $10, $11, $9, $5, $6} }' \
 | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 zcat $gisaidDir/metadata_batch_$today.tsv.gz \
 | grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \
 | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 gzip gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
-| sed -re 's/^20E \(EU1\)/20E.EU1/;' \
-    > cladeToName
+| subColumn -miss=/dev/null 1 stdin ../nextcladeToShort cladeToName
+
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -c cladeToName \
     -u mutations.nextclade \
     -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
 # convert EPI IDs to public to match tree IDs.
 tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \
 | grep -vFwf $ottoDir/clades.blackList \
 | awk -F, '{print $9 "\t" $2;}' \
 | sort > epiExemplarToLineage
 subColumn -miss=/dev/null 1 epiExemplarToLineage \
     <(cut -f 1,2 $epiToPublic) stdout \
 | sort > idExemplarToLineage
 grep -Fwf <(cut -f 1 idExemplarToLineage) samples.$today \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
 | sort > idExemplarToName
 join -t$'\t' idExemplarToName idExemplarToLineage \
 | tawk '{print $3, $2;}' \
 | sort > lineageToName
 # $epiToPublic maps some cogUkInGenBank EPI IDs to their COG-UK names not GenBank IDs unfortunately
 # so some of those don't match between idExemplarToLineage and idExemplarToName.  Find missing
 # sequences and try a different means of adding those.
 comm -13 <( cut -f 1 idExemplarToName | sort) <(cut -f 1 idExemplarToLineage| sort) \
 | grep -Fwf - ~angie/github/pango-designation/lineages.metadata.csv \
 | sed -re 's/Northern_/Northern/;' \
 | awk -F, '{print $1 "\t" $2;}' \
 | sort > exemplarNameNotFoundToLineage
 grep -Fwf <(cut -f 1 exemplarNameNotFoundToLineage) samples.$today \
 | awk -F\| '{print $1 "\t" $0;}' \
 | sort > exemplarNameNotFoundToFullName
 join -t$'\t' exemplarNameNotFoundToLineage exemplarNameNotFoundToFullName \
 | cut -f 2,3 \
 | sort -u lineageToName - ../lineageToName.newLineages > tmp
 mv tmp lineageToName
 
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
     -c lineageToName \
     -u mutations.pangolin \
     -D details.pangolin \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
 done
 
 # Extract public samples from tree
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u newNames
 grep -v EPI_ISL_ newNames > newPublicNames
 $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -s newPublicNames \
     -O -o public-$today.all.masked.pb
 # Extract Newick and VCF from public-only tree
 $matUtils extract -i public-$today.all.masked.pb \
     -t public-$today.all.nwk \
     -v public-$today.all.masked.vcf
 gzip -f public-$today.all.masked.vcf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | grep -v EPI_ISL_ \
 | gzip -c \
     > public-$today.metadata.tsv.gz
 
 grep -v EPI_ISL_ cladeToName > cladeToPublicName
 grep -v EPI_ISL_ lineageToName > lineageToPublicName
 
 # Add nextclade annotations to public protobuf
 time $matUtils annotate -T 50 \
     -l \
     -i public-$today.all.masked.pb \
     -c cladeToPublicName \
     -u mutations.nextclade.public \
     -D details.nextclade.public \
     -o public-$today.all.masked.nextclade.pb \
     >& annotate.nextclade.public.out
 
 # Add pangolin lineage annotations to public protobuf
 time $matUtils annotate -T 50 \
     -i public-$today.all.masked.nextclade.pb \
     -c lineageToPublicName \
     -u mutations.pangolin.public \
     -D details.pangolin.public \
     -o public-$today.all.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.public.out
 
 # Not all the Pangolin lineages can be assigned nodes so for now just use nextclade
 rm public-$today.all.masked.pb
 ln public-$today.all.masked.nextclade.pb public-$today.all.masked.pb
 
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.txt
 
 $matUtils extract -i public-$today.all.masked.pb -u samples.public.$today
 sampleCountComma=$(echo $(wc -l < samples.public.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.description.txt
 
 
 # Link to public trees download directory hierarchy
 archiveRoot=/hive/users/angie/publicTrees
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
 ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
 gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
 ln `pwd`/public-$today.metadata.tsv.gz $archive/
 gzip -c public-$today.all.masked.nextclade.pangolin.pb \
     > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
 gzip -c cladeToPublicName > $archive/cladeToPublicName.gz
 gzip -c lineageToPublicName > $archive/lineageToPublicName.gz
 ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
 
 # Update 'latest' in $archiveRoot
 ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
 ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
 ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
 ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
 ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
 ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
 
 # Update hgdownload-test link for archive
 mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
 
 # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb
     ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt
 done