6068fc9cbd3f8aa5f70f0535a5086423cf0c8d4c angie Wed Sep 22 20:42:55 2021 -0700 Move the construction of new.masked.vcf.gz out of updateCombinedTree.sh into makeNewMaskedVcf.sh and move out the metadata stuff into combineMetadata.sh. diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh new file mode 100755 index 0000000..bb426e6 --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh @@ -0,0 +1,287 @@ +#!/bin/bash +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh + +usage() { + echo "usage: $0 prevDate today problematicSitesVcf" + echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated." +} + +if [ $# != 3 ]; then + usage + exit 1 +fi + +prevDate=$1 +today=$2 +problematicSitesVcf=$3 + +ottoDir=/hive/data/outside/otto/sarscov2phylo +ncbiDir=$ottoDir/ncbi.latest +cogUkDir=$ottoDir/cogUk.latest +cncbDir=$ottoDir/cncb.latest +gisaidDir=/hive/users/angie/gisaid +minReal=20000 +ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit +epiToPublic=$gisaidDir/epiToPublicAndDate.latest +scriptDir=$(dirname "${BASH_SOURCE[0]}") +source $scriptDir/util.sh + +mkdir -p $ottoDir/$today +cd $ottoDir/$today + +prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb +prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz + +usherDir=~angie/github/usher +usher=$usherDir/build/usher +matUtils=$usherDir/build/matUtils + +renaming=oldAndNewNames + +# Make lists of sequences already in the tree. +$matUtils extract -i $prevProtobufMasked -u prevNames + +# Before updating the tree with new sequences, update the names used in the tree: +# * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs +# * COG-UK sequences that are in GenBank. Per Sam Nicholls the isolate alone is enough to identify +# them -- no need to match country & year which are sometimes incorrect at first. +grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ +| sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ +| sort \ + > cogUkInGenBankIsolateToNewName +fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names +grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \ + > cogUkInGenBank +set +o pipefail +# From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt +# to check. +grep COG-UK/ prevNames \ +| awk -F\| '{print $1 "\t" $0;}' \ +| sed -re 's@COG-UK/@@;' \ +| sort \ + > cogUkInGenBankIsolateToPrevName +set -o pipefail +join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \ +| cut -f 2,3 \ + > prevTree.renaming +# Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed. +# Unfortunately for now we are not getting those from $epiToPublic. +grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \ +| awk -F\| '$3 == ""' \ +| awk -F/ '{print $2 "\t" $0;}' \ +| sort \ + > cogUkInGenBankIsolateToPrevName +join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \ +| cut -f 2,3 \ + >> prevTree.renaming +# Look for names with EPI_IDs that have just today been mapped to public sequences. +grep EPI_ISL_ prevNames \ +| awk -F\| '{print $2 "\t" $0;}' \ +| sort \ + > epiToPrevName +set +o pipefail +grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \ +| grep -v COG-UK/ \ +| tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \ +| sort \ + > epiToNewName +set -o pipefail +# Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the +# tree both with and without EPI ID... so renaming makes a name conflict. +# If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't +# cause conflict. +comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree +join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \ +| cut -f 2,3 \ + >> prevTree.renaming +cp /dev/null dupsToRemove +if [ -s epiToNewNameAlreadyInTree ]; then + set +o pipefail + grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \ + | grep EPI_ISL \ + | cat \ + >> dupsToRemove + set -o pipefail +fi +# Don't rename if the final name is already in the tree; remove the dup with the old name. +set +o pipefail +cut -f 2 prevTree.renaming \ +| grep -Fwf - prevNames | cat \ + > alreadyThere +grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \ + >> dupsToRemove +# And finally, sometimes there are duplicates due to country or date being corrected in COG-UK +# metadata. +grep -vFwf dupsToRemove prevTree.renaming \ +| cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \ +| cut -d/ -f 2 \ + > cogUkIsolateStillDup +grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \ +| awk -F, '{print $1 "|" $5;}' \ + > cogDupsLatestMetadata +grep -Fwf cogUkIsolateStillDup prevNames \ +| grep -vFwf dupsToRemove \ +| grep -vFwf cogDupsLatestMetadata \ +| cat \ + >> dupsToRemove +set -o pipefail +startingProtobuf=$prevProtobufMasked +if [ -s dupsToRemove ]; then + startingProtobuf=prevDupTrimmed.pb + $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf +fi + +if [ -s prevTree.renaming ]; then + $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out + $matUtils extract -i prevRenamed.pb -u prevNames +else + ln -sf $startingProtobuf prevRenamed.pb +fi + +# OK, now that the tree names are updated, figure out which seqs are already in there and +# which need to be added. +awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ +| grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc +grep -E '^(England|Northern|Scotland|Wales)' prevNames \ +| cut -d\| -f1 > prevCogUk +awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ +| grep -E '^EPI_ISL_' > prevGisaid +# Add public sequences that have been mapped to GISAID sequences to prevGisaid. +grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid +grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid +wc -l prev* + +# Get new GenBank sequences with at least $minReal non-N bases. +# Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic. +set +o pipefail +egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \ +| grep -Fwf prevGisaid - \ +| grep -vFwf prevGbAcc \ +| cat \ + >> prevGbAcc +set -o pipefail +xzcat $ncbiDir/genbank.fa.xz \ +| faSomeRecords -exclude stdin prevGbAcc newGenBank.fa +faSize -veryDetailed newGenBank.fa \ +| tawk '$4 < '$minReal' {print $1;}' \ + > gbTooSmall +# NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too. +set +o pipefail +fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall +set -o pipefail +faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa +faSize newGenBank.filtered.fa + +# Get new COG-UK sequences with at least $minReal non-N bases. +# Also exclude cog_all.fasta sequences not found in cog_metadata.csv. +comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \ + <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \ + > cogFaNotMeta +# Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above). +xzcat $cogUkDir/cog_all.fasta.xz \ +| faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa +faSize -veryDetailed newCogUk.fa \ +| tawk '$4 < '$minReal' {print $1;}' \ + > cogUkTooSmall +faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa +faSize newCogUk.filtered.fa + +# Get new GISAID sequences with at least $minReal non-N bases. +xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \ +| sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \ +| faSomeRecords -exclude stdin prevGisaid newGisaid.fa +faSize -veryDetailed newGisaid.fa \ +| tawk '$4 < '$minReal' {print $1;}' \ + > gisaidTooSmall +faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa +faSize newGisaid.filtered.fa + +# Exclude public-mapped sequences from newGisaid: +set +o pipefail +fastaNames newGisaid.filtered.fa \ +| grep -Fwf - $epiToPublic \ +| cut -f 1 \ + > newGisaid.public.names +set -o pipefail +if [ -s newGisaid.public.names ]; then + faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp + mv tmp newGisaid.filtered.fa + faSize newGisaid.filtered.fa +fi +cat new*.filtered.fa > new.fa +faSize new.fa + +# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results +# as a multifasta alignment (from which we can extract VCF with SNVs): +#conda install -c bioconda mafft +alignedFa=new.aligned.fa +rm -f $alignedFa +export TMPDIR=/dev/shm +time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \ + -i new.fa \ + -o $alignedFa \ + -t 50 +faSize $alignedFa + +# Now make a renaming that keeps all the prevNames and adds full names for the new seqs. +tawk '{print $1, $1;}' prevNames > $renaming +if [ -s newCogUk.filtered.fa ]; then + # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained. + set +o pipefail + fastaNames newCogUk.filtered.fa \ + | grep -Fwf - $cogUkDir/cog_metadata.csv \ + | awk -F, '{print $1 "\t" $1 "|" $5;}' \ + >> $renaming + set -o pipefail +fi +if [ -s newGenBank.filtered.fa ]; then + # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year + set +o pipefail + fastaNames newGenBank.filtered.fa \ + | grep COG-UK/ \ + | sed -re 's/[ |].*//' \ + | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ + | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ + | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ + >> $renaming + fastaNames newGenBank.filtered.fa \ + | grep -v COG-UK/ \ + | sed -re 's/[ |].*//' \ + | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \ + | tawk '{ if ($3 == "") { $3 = "?"; } + if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; } + else { print $1 "\t" $1 "|" $3; } }' \ + | cleanGenbank \ + | sed -re 's/ /_/g' \ + >> $renaming + set -o pipefail +fi +if [ -s newGisaid.filtered.fa ]; then + zcat $gisaidDir/metadata_batch_$today.tsv.gz \ + | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ + | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \ + >> $renaming +fi +wc -l $renaming + +# Make masked VCF +tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ + $problematicSitesVcf > mask.vcf +# Add masked VCF to previous protobuf +#*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)... +#*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites +#*** masked more gracefully, and update the Problematic Sites track. +time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ +| faToVcf -maxDiff=1000 \ + -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \ + -verbose=2 stdin stdout \ +| vcfRenameAndPrune stdin $renaming stdout \ +| vcfFilter -excludeVcf=mask.vcf stdin \ +| tawk '$2 != 21987' \ +| gzip -c \ + > new.masked.vcf.gz +