src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 83f9e814d5441d0ba86d5fa735aed67d264a771f

83f9e814d5441d0ba86d5fa735aed67d264a771f
angie
  Sun Jun 27 20:27:52 2021 -0700
Skip building new.masked.vcf.gz if it already exists (go straight to usher).  More prevention of empty-grep crashes.  Use faToVcf new -maxDiff option to prevent memory usage blowup for sequences that are so diverged/erroneous that we won't keep them anyway.  Exclude sequences with 10 or more equally parsimonious placements so we don't keep trying in vain (and wasting time).  Handle new nextstrain clade names with spaces etc.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index acd9fa0..63bbdc5 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -29,30 +29,34 @@
 source $scriptDir/util.sh
 
 today=$(date +%F)
 y=$(date +%Y)
 m=$(date +%m)
 d=$(date +%d)
 
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
+renaming=oldAndNewNames
+
+if [ ! -s new.masked.vcf.gz ]; then
+
 # Make lists of sequences already in the tree.
 $matUtils extract -i $prevProtobufMasked -u prevNames
 
 # Before updating the tree with new sequences, update the names used in the tree:
 # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
 # * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
 #   them -- no need to match country & year which are sometimes incorrect at first.
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
 | sort \
     > cogUkInGenBankIsolateToNewName
 fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names
 grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \
     > cogUkInGenBank
@@ -88,37 +92,46 @@
 | grep -v COG-UK/ \
 | tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \
 | sort \
     > epiToNewName
 set -o pipefail
 # Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the
 # tree both with and without EPI ID... so renaming makes a name conflict.
 # If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't
 # cause conflict.
 comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree
 join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \
 | cut -f 2,3 \
     >> prevTree.renaming
 cp /dev/null dupsToRemove
 if [ -s epiToNewNameAlreadyInTree ]; then
+    set +o pipefail
     grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \
     | grep EPI_ISL \
+    | cat \
         >> dupsToRemove
+    set -o pipefail
 fi
+# Don't rename if the final name is already in the tree; remove the dup with the old name.
+set +o pipefail
+cut -f 2 prevTree.renaming \
+| grep -Fwf - prevNames | cat \
+    > alreadyThere
+grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \
+    >> dupsToRemove
 # And finally, sometimes there are duplicates due to country or date being corrected in COG-UK
 # metadata.
-set +o pipefail
 grep -vFwf dupsToRemove prevTree.renaming \
 | cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \
 | cut -d/ -f 2 \
     > cogUkIsolateStillDup
 grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \
 | awk -F, '{print $1 "|" $5;}' \
     > cogDupsLatestMetadata
 grep -Fwf cogUkIsolateStillDup prevNames \
 | grep -vFwf dupsToRemove \
 | grep -vFwf cogDupsLatestMetadata \
 | cat \
     >> dupsToRemove
 set -o pipefail
 startingProtobuf=$prevProtobufMasked
 if [ -s dupsToRemove ]; then
@@ -207,31 +220,30 @@
 faSize new.fa
 
 # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
 # as a multifasta alignment (from which we can extract VCF with SNVs):
 #conda install -c bioconda mafft
 alignedFa=new.aligned.fa
 rm -f $alignedFa
 export TMPDIR=/dev/shm
 time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
   -i new.fa \
   -o $alignedFa \
   -t 50
 faSize $alignedFa
 
 # Now make a renaming that keeps all the prevNames and adds full names for the new seqs.
-renaming=oldAndNewNames
 tawk '{print $1, $1;}' prevNames > $renaming
 if [ -s newCogUk.filtered.fa ]; then
     # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained.
     set +o pipefail
     fastaNames newCogUk.filtered.fa \
     | grep -Fwf - $cogUkDir/cog_metadata.csv \
     | awk -F, '{print $1 "\t" $1 "|" $5;}' \
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGenBank.filtered.fa ]; then
     # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year
     set +o pipefail
     fastaNames newGenBank.filtered.fa \
     | grep COG-UK/ \
@@ -253,44 +265,53 @@
     set -o pipefail
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 # Add masked VCF to previous protobuf
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
-| faToVcf stdin stdout \
+| faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
 | gzip -c \
     > new.masked.vcf.gz
-time $usher -u -T 50 \
+
+fi # if [ ! -s new.masked.vcf.gz ]
+
+time $usher -u -T 80 \
     -A \
     -e 5 \
     -v new.masked.vcf.gz \
     -i prevRenamed.pb \
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
 
+# Exclude sequences with a very high number of EPPs from future runs
+grep ^Current usher.addNew.log \
+| awk '$16 >= 10 {print $8;}' \
+| awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
+    >> ../tooManyEpps.ids
+
 # Prune samples with too many private mutations and internal branches that are too long.
 $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
     -a 20 \
     -b 30 \
     -O -o gisaidAndPublic.$today.masked.pb
 
 # Metadata for hgPhyloPlace:
 # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
 echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
     > gisaidAndPublic.$today.metadata.tsv
 # It's not always possible to recreate both old and new names correctly from metadata,
 # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
 cut -f 2 $renaming \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
 | sort \
@@ -368,32 +389,32 @@
 gzip gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
-| sed -re 's/^20E \(EU1\)/20E.EU1/;' \
-    > cladeToName
+| subColumn -miss=/dev/null 1 stdin ../nextcladeToShort cladeToName
+
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -c cladeToName \
     -u mutations.nextclade \
     -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
 # convert EPI IDs to public to match tree IDs.
 tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \
 | grep -vFwf $ottoDir/clades.blackList \
 | awk -F, '{print $9 "\t" $2;}' \
 | sort > epiExemplarToLineage