src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh acd9854d3c6537ff237c597534abd5b9afbb7a75

acd9854d3c6537ff237c597534abd5b9afbb7a75
angie
  Thu Feb 25 10:40:45 2021 -0800
Adding cron scripts for nightly update of public tree, VCFs, metadata and protobufs for hgPhyloPlace.

diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
new file mode 100755
index 0000000..ffbf6b3
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
@@ -0,0 +1,329 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
+
+usage() {
+    echo "usage: $0 prevDate problematicSitesVcf"
+    echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
+}
+
+if [ $# != 2 ]; then
+  usage
+  exit 1
+fi
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+prevDate=$1
+problematicSitesVcf=$2
+prevVcf=$ottoDir/$prevDate/public-$prevDate.all.vcf.gz
+prevProtobufMasked=$ottoDir/$prevDate/public-$prevDate.all.masked.pb
+prevProtobufUnmasked=$ottoDir/$prevDate/public-$prevDate.all.notMasked.pb
+prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz
+
+echo "prevVcf=$prevVcf"
+echo "prevProtobufMasked=$prevProtobufMasked"
+echo "prevProtobufUnmasked=$prevProtobufUnmasked"
+echo "prevMeta=$prevMeta"
+echo "problematicSitesVcf=$problematicSitesVcf"
+
+ncbiDir=$ottoDir/ncbi.latest
+cogUkDir=$ottoDir/cogUk.latest
+cncbDir=$ottoDir/cncb.latest
+today=$(date +%F)
+
+minReal=20000
+ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
+
+usherDir=~angie/github/usher
+usher=$usherDir/build/usher
+matToVcf=$usherDir/build/matToVcf
+find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments
+
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+
+source $scriptDir/util.sh
+
+# Before we get started, make sure cog_metadata has the columns we're expecting:
+expectedHeaderStart='sequence_name,country,adm1,pillar_2,sample_date,epi_week,lineage,'
+cogUkMetHeader=$(head -1 $cogUkDir/cog_metadata.csv | grep ^$expectedHeaderStart)
+# The grep will fail if they change one of the first 7 fields.
+
+mkdir -p $ottoDir/$today
+cd $ottoDir/$today
+
+# Disregard pipefail just for this pipe because head prevents the cat from completing:
+set +o pipefail
+xcat $prevVcf | head | grep ^#CHROM | sed -re 's/\t/\n/g;' | tail -n+10 > prevNames
+set -o pipefail
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk
+
+# Get new GenBank sequences with at least $minReal non-N bases.
+xzcat $ncbiDir/genbank.fa.xz \
+| faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
+faSize -veryDetailed newGenBank.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > gbTooSmall
+# NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
+fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
+faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
+faSize newGenBank.filtered.fa
+
+# Get new COG-UK sequences with at least $minReal non-N bases.
+xzcat $cogUkDir/cog_all.fasta.xz \
+| faSomeRecords -exclude stdin prevCogUk newCogUk.fa
+faSize -veryDetailed newCogUk.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > cogUkTooSmall
+faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
+faSize newCogUk.filtered.fa
+
+cat newGenBank.filtered.fa newCogUk.filtered.fa > new.fa
+
+# Some days there are no new public sequences; just link to yesterday's results.
+# Temporarily disable pipefail because fastaNames uses grep which fails when there are no sequences.
+set +o pipefail
+newSeqCount=$(fastaNames new.fa | wc -l)
+set -o pipefail
+if [ $newSeqCount == 0 ] ; then
+    ln -sf $prevVcf public-$today.all.vcf.gz
+    ln -sf $prevProtobufMasked public-$today.all.masked.pb
+    ln -sf $prevProtobufUnmasked public-$today.all.notMasked.pb
+    ln -sf $prevMeta public-$today.metadata.tsv.gz
+    echo "No new sequences; linking to yesterday's results."
+    exit 0
+fi
+
+# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
+# as a multifasta alignment (from which we can extract VCF with SNVs):
+#conda install -c bioconda mafft
+alignedFa=new.aligned.fa
+rm -f $alignedFa
+export TMPDIR=/dev/shm
+time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
+  -i new.fa \
+  -o $alignedFa \
+  -t 50
+faSize $alignedFa
+
+# Now make a renaming that keeps all the prevNames and adds full names (with shortened dates)
+# for the new seqs.
+renaming=oldAndNewNames
+tawk '{print $1, $1;}' prevNames > $renaming
+if (( $(fastaNames newCogUk.filtered.fa | wc -l) > 0 )) ; then
+    fastaNames newCogUk.filtered.fa \
+    | grep -Fwf - $cogUkDir/cog_metadata.csv \
+    | awk -F, '{print $1 "\t" $1 "|" $5;}' \
+    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
+        >> $renaming
+fi
+if (( $(fastaNames newGenBank.filtered.fa | wc -l) > 0 )) ; then
+    fastaNames newGenBank.filtered.fa \
+    | sed -re 's/[ |].*//' \
+    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | tawk '{ if ($3 == "") { $3 = "?"; }
+              if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
+              else { print $1 "\t" $1 "|" $3; } }' \
+    | cleanGenbank \
+    | sed -re 's/ /_/g' \
+    | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
+        >> $renaming
+fi
+wc -l $renaming
+
+# Make VCF -- first without masking, for hgTracks
+time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
+| faToVcf stdin stdout \
+| vcfRenameAndPrune stdin $renaming stdout \
+| gzip -c \
+    > new.vcf.gz
+# Use usher to add new public sequences to tree of mapped subset and infer missing/ambig bases.
+time $usher -u -T 50 \
+    -v new.vcf.gz \
+    -i $prevProtobufUnmasked \
+    -o public-$today.all.notMasked.pb \
+    >& usher.addNewUnmasked.log
+#~10 hours for 56k seqs, ~72m for 6k
+$matToVcf -i public-$today.all.notMasked.pb -v public-$today.all.vcf
+ls -l public-$today.all.vcf
+wc -l public-$today.all.vcf
+bgzip -f public-$today.all.vcf
+tabix -p vcf public-$today.all.vcf.gz
+
+# Then with masking to collapse the tree & make protobuf for usher/hgPhyloPlace:
+tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
+    $problematicSitesVcf > mask.vcf
+time vcfFilter -excludeVcf=mask.vcf new.vcf.gz \
+| gzip -c \
+    > new.masked.vcf.gz
+time $usher -u -T 50 \
+    -v new.masked.vcf.gz \
+    -i $prevProtobufMasked \
+    -o public-$today.all.masked.pb \
+    >& usher.addNew.log
+mv uncondensed-final-tree.nh public-$today.all.nwk
+# Masked VCF that goes with the masked protobuf, for public distribution for folks who want
+# to run UShER and also have the VCF.
+$matToVcf -i public-$today.all.masked.pb -v public-$today.all.masked.vcf
+gzip public-$today.all.masked.vcf
+
+# Make allele-frequency-filtered versions
+# Disregard pipefail just for this pipe because head prevents the cat from completing:
+set +o pipefail
+sampleCount=$(zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' \
+              | tail -n+10 | wc -l)
+set -o pipefail
+minAc001=$(( (($sampleCount + 999) / 1000) ))
+vcfFilter -minAc=$minAc001 -rename public-$today.all.vcf.gz \
+    > public-$today.all.minAf.001.vcf
+wc -l public-$today.all.minAf.001.vcf
+bgzip -f public-$today.all.minAf.001.vcf
+tabix -p vcf public-$today.all.minAf.001.vcf.gz
+
+minAc01=$(( (($sampleCount + 99) / 100) ))
+vcfFilter -minAc=$minAc01 -rename public-$today.all.minAf.001.vcf.gz \
+    > public-$today.all.minAf.01.vcf
+wc -l public-$today.all.minAf.01.vcf
+bgzip -f public-$today.all.minAf.01.vcf
+tabix -p vcf public-$today.all.minAf.01.vcf.gz
+
+# Parsimony scores on collapsed tree
+time $find_parsimonious_assignments --tree public-$today.all.nwk \
+    --vcf <(gunzip -c public-$today.all.vcf.gz) \
+> fpa.out
+tail -n+2 fpa.out \
+| sed -re 's/^[A-Z]([0-9]+)[A-Z,]+.*parsimony_score=([0-9]+).*/\1\t\2/;' \
+| tawk '{print "NC_045512v2", $1-1, $1, $2;}' \
+| sort -k2n,2n \
+    > public-$today.all.parsimony.bg
+bedGraphToBigWig public-$today.all.parsimony.bg /hive/data/genomes/wuhCor1/chrom.sizes \
+    public-$today.all.parsimony.bw
+
+# Metadata for hgPhyloPlace:
+# Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
+echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
+    > public-$today.metadata.tsv
+# It's not always possible to recreate both old and new names correctly from metadata,
+# so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
+cut -f 2 $renaming \
+| awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
+| sort \
+    > idToName
+# NCBI metadata (strip colon-separated location after country if present):
+tawk '$8 >= 20000 { print $1, $3, $4, $5, $6, $8; }' \
+    $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \
+| perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
+| cleanGenbank \
+| sort > gb.metadata
+if [ -e $ncbiDir/lineage_report.csv ]; then
+    echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
+    tail -n+2  $ncbiDir/lineage_report.csv \
+    | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
+    | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
+    | sort \
+        > gbToLineage
+else
+    echo Getting GenBank Pangolin lineages from $prevMeta
+    zcat $prevMeta \
+    | tail -n+2 \
+    | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
+    | sort \
+        > gbToLineage
+fi
+wc -l gbToLineage
+if [ -e $ncbiDir/nextclade.tsv ]; then
+    sort $ncbiDir/nextclade.tsv > gbToNextclade
+else
+    touch gbToNextclade
+fi
+wc -l gbToNextclade
+join -t$'\t' -a 1 gb.metadata gbToNextclade \
+| join -t$'\t' -a 1 - gbToLineage \
+| tawk '{ if ($2 == "") { $2 = "?"; }
+          print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \
+| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+    >> public-$today.metadata.tsv
+# COG-UK metadata:
+if [ -e $cogUkDir/nextclade.tsv ]; then
+    sort $cogUkDir/nextclade.tsv > cogUkToNextclade
+else
+    touch cogUkToNextclade
+fi
+#*** Could also add sequence length to metadata from faSizes output...
+tail -n+2 $cogUkDir/cog_metadata.csv \
+| awk -F, -v 'OFS=\t' '{print $1, "", $5, $2, "", "", "", $7; }' \
+| sort \
+| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
+| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+    >> public-$today.metadata.tsv
+# CNCB metadata:
+tail -n+2 $cncbDir/cncb.metadata.tsv \
+| tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
+            print $2, "", $10, $11, $9, $5, $6} }' \
+| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
+| sort \
+| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
+| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
+| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+    >> public-$today.metadata.tsv
+wc -l public-$today.metadata.tsv
+gzip -f public-$today.metadata.tsv
+
+# Lineage colors:
+zcat public-$today.metadata.tsv.gz \
+| tail -n+2 \
+| tawk '$8 != "" || $9 != "" { print $1, $8, $9, ""; }' \
+| sort -u > sampleToLineage
+wc -l sampleToLineage
+~/kent/src/hg/utils/otto/sarscov2phylo/cladeLineageColors.pl sampleToLineage
+rm gisaidColors
+mv lineageColors public-$today.lineageColors
+mv nextstrainColors public-$today.nextstrainColors
+gzip -f public-$today.lineageColors public-$today.nextstrainColors
+
+#***TODO: make acknowledgements.tsv.gz! (see publicCredits.sh)
+
+ncbiDate=$(ls -l $ncbiDir | sed -re 's/.*ncbi\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
+cogUkDate=$(ls -l $cogUkDir | sed -re 's/.*cogUk\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
+cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
+if [ $ncbiDate == $cogUkDate ]; then
+    echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $ncbiDate; CNCB sequences downloaded $cncbDate" \
+        > version.txt
+else
+    echo "sarscov2phylo release 13-11-20; NCBI sequences downloaded $ncbiDate; COG-UK sequences downloaded $cogUkDate; CNCB sequences downloaded $cncbDate" \
+        > version.txt
+fi
+
+# Update gbdb links -- not every day, too much churn for getting releases out and the
+# tracks are getting unmanageably large for VCF.
+if false; then
+for f in public-$today.all{,.minAf*}.vcf.gz ; do
+    t=$(echo $f | sed -re "s/-$today//;")
+    ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t
+    ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi
+done
+ln -sf `pwd`/public-$today.all.nwk /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nwk
+ln -sf `pwd`/public-$today.all.parsimony.bw \
+    /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.parsimony.bw
+ln -sf `pwd`/public-$today.lineageColors.gz \
+    /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.lineageColors.gz
+ln -sf `pwd`/public-$today.nextstrainColors.gz \
+    /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nextstrainColors.gz
+ln -sf `pwd`/version.txt /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.version.txt
+fi
+
+# Link to public trees download directory hierarchy
+y=$(date +%Y)
+m=$(date +%m)
+d=$(date +%d)
+archive=/hive/users/angie/publicTrees/$y/$m/$d
+mkdir -p $archive
+ln `pwd`/public-$today.all.nwk $archive/
+ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
+ln `pwd`/public-$today.metadata.tsv.gz $archive/
+