30ebf52c1ccbd592b814c18d3a68c30624a0f85d
angie
  Wed Mar 17 11:55:37 2021 -0700
Adding new updateCombinedTree.sh, after updatePublicTree.sh, for the combined GISAID+public tree.  Eventually, the public tree will be derived from the combined tree.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
new file mode 100755
index 0000000..8e6cf6e
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -0,0 +1,252 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+
+usage() {
+    echo "usage: $0 prevDate problematicSitesVcf"
+    echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
+}
+
+if [ $# != 2 ]; then
+  usage
+  exit 1
+fi
+
+prevDate=$1
+problematicSitesVcf=$2
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+ncbiDir=$ottoDir/ncbi.latest
+cogUkDir=$ottoDir/cogUk.latest
+cncbDir=$ottoDir/cncb.latest
+gisaidDir=/hive/users/angie/gisaid
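+# Minimum number of non-N bases required to include a sequence.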
+minReal=20000
+ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
+epiToPublic=$gisaidDir/epiToPublicAndDate.latest
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+source $scriptDir/util.sh
+
+today=$(date +%F)
+cd $ottoDir/$today
+
+prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
+
+# Use Yatish's latest usher and matUtils
+usherDir=~angie/github/yatish_usher
+usher=$usherDir/build/usher
+matUtils=$usherDir/build/matUtils
+
+# Make lists of sequences already in the tree.
+$matUtils summary -i $prevProtobufMasked -s >(tail -n+2 | cut -f 1 > prevNames)
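+# Names in the tree may be pipe-delimited (e.g. isolate|ID|date); extract the ID (or name) field.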
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^EPI_ISL_' > prevGisaid
+# Add public sequences that have been mapped to GISAID sequences to prevGisaid.
+grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
+grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
+wc -l prev*
+
+# Get new GenBank sequences with at least $minReal non-N bases.
+xzcat $ncbiDir/genbank.fa.xz \
+| faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
+faSize -veryDetailed newGenBank.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > gbTooSmall
+# NCBI also includes NC_045512 in the download, but that is our reference sequence, so exclude it too.
+fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
+faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
+faSize newGenBank.filtered.fa
+
+# Get new COG-UK sequences with at least $minReal non-N bases.
+xzcat $cogUkDir/cog_all.fasta.xz \
+| faSomeRecords -exclude stdin prevCogUk newCogUk.fa
+faSize -veryDetailed newCogUk.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > cogUkTooSmall
+faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
+faSize newCogUk.filtered.fa
+
+# Get new GISAID sequences with at least $minReal non-N bases.
+xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
+| sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
+| faSomeRecords -exclude stdin prevGisaid newGisaid.fa
+faSize -veryDetailed newGisaid.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > gisaidTooSmall
+faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
+faSize newGisaid.filtered.fa
+
+# Exclude public-mapped sequences from newGisaid:
+set +o pipefail
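+# (grep exits nonzero when it finds no matches; with pipefail that would abort the script)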
+fastaNames newGisaid.filtered.fa \
+| grep -Fwf - $epiToPublic \
+| cut -f 1 \
+    > newGisaid.public.names
+set -o pipefail
+if [ -s newGisaid.public.names ]; then
+    faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp
+    mv tmp newGisaid.filtered.fa
+    faSize newGisaid.filtered.fa
+fi
+cat new*.filtered.fa > new.fa
+faSize new.fa
+
+# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
+# as a multifasta alignment (from which we can extract VCF with SNVs):
+#conda install -c bioconda mafft
+alignedFa=new.aligned.fa
+rm -f $alignedFa
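+# Put alignment temporary files in RAM-backed /dev/shm.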
+export TMPDIR=/dev/shm
+time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
+  -i new.fa \
+  -o $alignedFa \
+  -t 50
+faSize $alignedFa
+
+#*** TODO: if EPIs have recently been mapped to public sequences, we should convert their
+#*** tree names from EPI to public.
+
+# Now make a renaming that keeps all the prevNames and adds full names (with shortened dates)
+# for the new seqs.
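+# (GenBank and GISAID names become isolate|ID|date; COG-UK names become name|date.)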
+renaming=oldAndNewNames
+tawk '{print $1, $1;}' prevNames > $renaming
+if [ -s newCogUk.filtered.fa ]; then
+    # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv (this has been reported), so don't fail if grep finds nothing.
+    set +o pipefail
+    fastaNames newCogUk.filtered.fa \
+    | grep -Fwf - $cogUkDir/cog_metadata.csv \
+    | awk -F, '{print $1 "\t" $1 "|" $5;}' \
+    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
+        >> $renaming
+    set -o pipefail
+fi
+if [ -s newGenBank.filtered.fa ]; then
+    fastaNames newGenBank.filtered.fa \
+    | sed -re 's/[ |].*//' \
+    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | tawk '{ if ($3 == "") { $3 = "?"; }
+              if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
+              else { print $1 "\t" $1 "|" $3; } }' \
+    | cleanGenbank \
+    | sed -re 's/ /_/g' \
+    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
+        >> $renaming
+fi
+if [ -s newGisaid.filtered.fa ]; then
+    zcat $gisaidDir/metadata_batch_$today.tsv.gz \
+    | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
+    | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
+        >> $renaming
+fi
+wc -l $renaming
+
+# Remove the old tree's GISAID items that have been mapped to public sequences so we don't have duplicates.
+set +o pipefail
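+# (piping through cat keeps the exit status 0 even when grep finds no matches)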
+cut -f 1 $epiToPublic \
+| grep -Fwf - prevNames \
+| cat \
+    > mappedGisaidToRemove
+set -o pipefail
+wc -l mappedGisaidToRemove
+
+if [ -s mappedGisaidToRemove ]; then
+    $matUtils extract -i $prevProtobufMasked \
+        -s mappedGisaidToRemove \
+        -p \
+        -o prevTrimmed.pb
+else
+    cp -p $prevProtobufMasked prevTrimmed.pb
+fi
+
+# Make masked VCF
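+# Keep the VCF header lines and the sites flagged "mask", renaming the chrom to NC_045512v2.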
+tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
+    $problematicSitesVcf > mask.vcf
+# Add masked VCF to previous protobuf
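+# (prepend the reference so faToVcf can call variants against it; vcfRenameAndPrune applies $renaming; vcfFilter excludes the masked sites)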
+time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
+| faToVcf stdin stdout \
+| vcfRenameAndPrune stdin $renaming stdout \
+| vcfFilter -excludeVcf=mask.vcf stdin \
+| gzip -c \
+    > new.masked.vcf.gz
+time $usher -u -T 50 \
+    -v new.masked.vcf.gz \
+    -i prevTrimmed.pb \
+    -o gisaidAndPublic.$today.masked.pb \
+    >& usher.addNew.log
+mv uncondensed-final-tree.nh gisaidAndPublic.$today.nwk
+
+# Metadata for hgPhyloPlace: for now, start with already-built public metadata.
+zcat public-$today.metadata.tsv.gz > gisaidAndPublic.$today.metadata.tsv
+zcat $gisaidDir/metadata_batch_$today.tsv.gz \
+| grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \
+| tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
+    >> gisaidAndPublic.$today.metadata.tsv
+wc -l gisaidAndPublic.$today.metadata.tsv
+gzip gisaidAndPublic.$today.metadata.tsv
+
+# version/description files
+cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
+echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
+    > version.plusGisaid.txt
+sampleCountComma=$(echo $(wc -l < $renaming) \
+                   | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
+echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
+    > hgPhyloPlace.plusGisaid.description.txt
+
+# Add nextclade annotations to protobuf
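+# cladeToName: clade (metadata column 8) to sample name, as input for matUtils annotate.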
+zcat gisaidAndPublic.$today.metadata.tsv.gz \
+| tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
+| sed -re 's/^20E \(EU1\)/20E.EU1/;' \
+    > cladeToName
+time $matUtils annotate -T 50 \
+    -l \
+    -i gisaidAndPublic.$today.masked.pb \
+    -c cladeToName \
+    -o gisaidAndPublic.$today.masked.nextclade.pb \
+    >& annotate.nextclade.out
+
+# Add pangolin lineage annotations to protobuf.  Use pangoLEARN training samples;
+# convert EPI IDs to public to match tree IDs.
+tail -n+2 ~angie/github/pangoLEARN/pangoLEARN/data/lineages.metadata.csv \
+| awk -F, '{print $7 "\t" $2;}' \
+| sort > epiExemplarToLineage
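+# epiExemplarToLineage: GISAID EPI ID of each training sample -> pangolin lineage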
+subColumn -miss=/dev/null 1 epiExemplarToLineage \
+    <(cut -f 1,2 $gisaidDir/epiToPublicAndDate.$today) stdout \
+| sort > idExemplarToLineage
+cut -f 2 $renaming | grep -Fwf <(cut -f 1 idExemplarToLineage) \
+| awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
+| sort > idExemplarToName
+join -t$'\t' idExemplarToName idExemplarToLineage \
+| tawk '{print $3, $2;}' \
+| sort > lineageToName
+time $matUtils annotate -T 50 \
+    -i gisaidAndPublic.$today.masked.nextclade.pb \
+    -c lineageToName \
+    -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
+    >& annotate.pangolin.out
+
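+# Keep the unannotated tree, and hard-link the fully annotated tree as the main masked.pb.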
+mv gisaidAndPublic.$today.masked{,.unannotated}.pb
+ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
+
+# Extract public samples from tree
+$matUtils summary -i gisaidAndPublic.$today.masked.pb -s >(tail -n+2 | cut -f 1 > newNames)
+grep -v EPI_ISL_ newNames > newPublicNames
+$matUtils extract -i gisaidAndPublic.$today.masked.pb \
+    -s newPublicNames \
+    -O -o public.$today.masked.pb
+
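+# Point the hgPhyloPlace "latest" links at today's combined tree files.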
+for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
+    ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
+    ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
+        $dir/public.plusGisaid.latest.metadata.tsv.gz
+    ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
+done
+
+# Clean up
+nice xz -f new*fa &
+