src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh 6068fc9cbd3f8aa5f70f0535a5086423cf0c8d4c

6068fc9cbd3f8aa5f70f0535a5086423cf0c8d4c
angie
  Wed Sep 22 20:42:55 2021 -0700
Move the construction of new.masked.vcf.gz out of updateCombinedTree.sh into makeNewMaskedVcf.sh and move out the metadata stuff into combineMetadata.sh.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
new file mode 100755
index 0000000..bb426e6
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -0,0 +1,287 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+
+usage() {
+    echo "usage: $0 prevDate today problematicSitesVcf"
+    echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
+}
+
+if [ $# != 3 ]; then
+  usage
+  exit 1
+fi
+
+prevDate=$1
+today=$2
+problematicSitesVcf=$3
+
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+ncbiDir=$ottoDir/ncbi.latest
+cogUkDir=$ottoDir/cogUk.latest
+cncbDir=$ottoDir/cncb.latest
+gisaidDir=/hive/users/angie/gisaid
+minReal=20000
+ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
+epiToPublic=$gisaidDir/epiToPublicAndDate.latest
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+source $scriptDir/util.sh
+
+mkdir -p $ottoDir/$today
+cd $ottoDir/$today
+
+prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
+prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz
+
+usherDir=~angie/github/usher
+usher=$usherDir/build/usher
+matUtils=$usherDir/build/matUtils
+
+renaming=oldAndNewNames
+
+# Make lists of sequences already in the tree.
+$matUtils extract -i $prevProtobufMasked -u prevNames
+
+# Before updating the tree with new sequences, update the names used in the tree:
+# * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
+# * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
+#   them -- no need to match country & year which are sometimes incorrect at first.
+grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
+| sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
+| sort \
+    > cogUkInGenBankIsolateToNewName
+fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names
+grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \
+    > cogUkInGenBank
+set +o pipefail
+# From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt
+# to check.
+grep COG-UK/ prevNames \
+| awk -F\| '{print $1 "\t" $0;}' \
+| sed -re 's@COG-UK/@@;' \
+| sort \
+    > cogUkInGenBankIsolateToPrevName
+set -o pipefail
+join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
+| cut -f 2,3 \
+    > prevTree.renaming
+# Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed.
+# Unfortunately for now we are not getting those from $epiToPublic.
+grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \
+| awk -F\| '$3 == ""' \
+| awk -F/ '{print $2 "\t" $0;}' \
+| sort \
+    > cogUkInGenBankIsolateToPrevName
+join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
+| cut -f 2,3 \
+    >> prevTree.renaming
+# Look for names with EPI_IDs that have just today been mapped to public sequences.
+grep EPI_ISL_ prevNames \
+| awk -F\| '{print $2 "\t" $0;}' \
+| sort \
+    > epiToPrevName
+set +o pipefail
+grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \
+| grep -v COG-UK/ \
+| tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \
+| sort \
+    > epiToNewName
+set -o pipefail
+# Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the
+# tree both with and without EPI ID... so renaming makes a name conflict.
+# If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't
+# cause conflict.
+comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree
+join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \
+| cut -f 2,3 \
+    >> prevTree.renaming
+cp /dev/null dupsToRemove
+if [ -s epiToNewNameAlreadyInTree ]; then
+    set +o pipefail
+    grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \
+    | grep EPI_ISL \
+    | cat \
+        >> dupsToRemove
+    set -o pipefail
+fi
+# Don't rename if the final name is already in the tree; remove the dup with the old name.
+set +o pipefail
+cut -f 2 prevTree.renaming \
+| grep -Fwf - prevNames | cat \
+    > alreadyThere
+grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \
+    >> dupsToRemove
+# And finally, sometimes there are duplicates due to country or date being corrected in COG-UK
+# metadata.
+grep -vFwf dupsToRemove prevTree.renaming \
+| cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \
+| cut -d/ -f 2 \
+    > cogUkIsolateStillDup
+grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \
+| awk -F, '{print $1 "|" $5;}' \
+    > cogDupsLatestMetadata
+grep -Fwf cogUkIsolateStillDup prevNames \
+| grep -vFwf dupsToRemove \
+| grep -vFwf cogDupsLatestMetadata \
+| cat \
+    >> dupsToRemove
+set -o pipefail
+startingProtobuf=$prevProtobufMasked
+if [ -s dupsToRemove ]; then
+    startingProtobuf=prevDupTrimmed.pb
+    $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf
+fi
+
+if [ -s prevTree.renaming ]; then
+    $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out
+    $matUtils extract -i prevRenamed.pb -u prevNames
+else
+    ln -sf $startingProtobuf prevRenamed.pb
+fi
+
+# OK, now that the tree names are updated, figure out which seqs are already in there and
+# which need to be added.
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
+grep -E '^(England|Northern|Scotland|Wales)' prevNames \
+| cut -d\| -f1 > prevCogUk
+awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
+| grep -E '^EPI_ISL_' > prevGisaid
+# Add public sequences that have been mapped to GISAID sequences to prevGisaid.
+grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
+grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
+wc -l prev*
+
+# Get new GenBank sequences with at least $minReal non-N bases.
+# Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic.
+set +o pipefail
+egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
+| grep -Fwf prevGisaid - \
+| grep -vFwf prevGbAcc \
+| cat \
+    >> prevGbAcc
+set -o pipefail
+xzcat $ncbiDir/genbank.fa.xz \
+| faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
+faSize -veryDetailed newGenBank.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > gbTooSmall
+# NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
+set +o pipefail
+fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
+set -o pipefail
+faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
+faSize newGenBank.filtered.fa
+
+# Get new COG-UK sequences with at least $minReal non-N bases.
+# Also exclude cog_all.fasta sequences not found in cog_metadata.csv.
+comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \
+    <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \
+    > cogFaNotMeta
+# Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above).
+xzcat $cogUkDir/cog_all.fasta.xz \
+| faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa
+faSize -veryDetailed newCogUk.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > cogUkTooSmall
+faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
+faSize newCogUk.filtered.fa
+
+# Get new GISAID sequences with at least $minReal non-N bases.
+xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
+| sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
+| faSomeRecords -exclude stdin prevGisaid newGisaid.fa
+faSize -veryDetailed newGisaid.fa \
+| tawk '$4 < '$minReal' {print $1;}' \
+    > gisaidTooSmall
+faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
+faSize newGisaid.filtered.fa
+
+# Exclude public-mapped sequences from newGisaid:
+set +o pipefail
+fastaNames newGisaid.filtered.fa \
+| grep -Fwf - $epiToPublic \
+| cut -f 1 \
+    > newGisaid.public.names
+set -o pipefail
+if [ -s newGisaid.public.names ]; then
+    faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp
+    mv tmp newGisaid.filtered.fa
+    faSize newGisaid.filtered.fa
+fi
+cat new*.filtered.fa > new.fa
+faSize new.fa
+
+# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
+# as a multifasta alignment (from which we can extract VCF with SNVs):
+#conda install -c bioconda mafft
+alignedFa=new.aligned.fa
+rm -f $alignedFa
+export TMPDIR=/dev/shm
+time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
+  -i new.fa \
+  -o $alignedFa \
+  -t 50
+faSize $alignedFa
+
+# Now make a renaming that keeps all the prevNames and adds full names for the new seqs.
+tawk '{print $1, $1;}' prevNames > $renaming
+if [ -s newCogUk.filtered.fa ]; then
+    # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained.
+    set +o pipefail
+    fastaNames newCogUk.filtered.fa \
+    | grep -Fwf - $cogUkDir/cog_metadata.csv \
+    | awk -F, '{print $1 "\t" $1 "|" $5;}' \
+        >> $renaming
+    set -o pipefail
+fi
+if [ -s newGenBank.filtered.fa ]; then
+    # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year
+    set +o pipefail
+    fastaNames newGenBank.filtered.fa \
+    | grep COG-UK/ \
+    | sed -re 's/[ |].*//' \
+    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
+    | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
+        >> $renaming
+    fastaNames newGenBank.filtered.fa \
+    | grep -v COG-UK/ \
+    | sed -re 's/[ |].*//' \
+    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+    | tawk '{ if ($3 == "") { $3 = "?"; }
+              if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
+              else { print $1 "\t" $1 "|" $3; } }' \
+    | cleanGenbank \
+    | sed -re 's/ /_/g' \
+        >> $renaming
+    set -o pipefail
+fi
+if [ -s newGisaid.filtered.fa ]; then
+    zcat $gisaidDir/metadata_batch_$today.tsv.gz \
+    | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
+    | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
+        >> $renaming
+fi
+wc -l $renaming
+
+# Make masked VCF
+tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
+    $problematicSitesVcf > mask.vcf
+# Add masked VCF to previous protobuf
+#*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)...
+#*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites
+#*** masked more gracefully, and update the Problematic Sites track.
+time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
+| faToVcf -maxDiff=1000 \
+    -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \
+    -verbose=2 stdin stdout \
+| vcfRenameAndPrune stdin $renaming stdout \
+| vcfFilter -excludeVcf=mask.vcf stdin \
+| tawk '$2 != 21987' \
+| gzip -c \
+    > new.masked.vcf.gz
+