src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh 6068fc9cbd3f8aa5f70f0535a5086423cf0c8d4c

6068fc9cbd3f8aa5f70f0535a5086423cf0c8d4c
angie
  Wed Sep 22 20:42:55 2021 -0700
Move the construction of new.masked.vcf.gz out of updateCombinedTree.sh into makeNewMaskedVcf.sh and move out the metadata stuff into combineMetadata.sh.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 55347b0..71798f2 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -7,404 +7,71 @@
 usage() {
     echo "usage: $0 prevDate today problematicSitesVcf"
     echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
 }
 
 if [ $# != 3 ]; then
   usage
   exit 1
 fi
 
 prevDate=$1
 today=$2
 problematicSitesVcf=$3
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
-ncbiDir=$ottoDir/ncbi.latest
-cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
-minReal=20000
-ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
-prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
-prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz
-
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
-renaming=oldAndNewNames
-
 if [ ! -s new.masked.vcf.gz ]; then
-
-# Make lists of sequences already in the tree.
-$matUtils extract -i $prevProtobufMasked -u prevNames
-
-# Before updating the tree with new sequences, update the names used in the tree:
-# * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
-# * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
-#   them -- no need to match country & year which are sometimes incorrect at first.
-grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
-| tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
-| sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
-| sort \
-    > cogUkInGenBankIsolateToNewName
-fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names
-grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \
-    > cogUkInGenBank
-set +o pipefail
-# From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt
-# to check.
-grep COG-UK/ prevNames \
-| awk -F\| '{print $1 "\t" $0;}' \
-| sed -re 's@COG-UK/@@;' \
-| sort \
-    > cogUkInGenBankIsolateToPrevName
-set -o pipefail
-join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
-| cut -f 2,3 \
-    > prevTree.renaming
-# Look for COG-UK isolates in prevNames that have just been added to GenBank and need to be renamed.
-# Unfortunately for now we are not getting those from $epiToPublic.
-grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) prevNames \
-| awk -F\| '$3 == ""' \
-| awk -F/ '{print $2 "\t" $0;}' \
-| sort \
-    > cogUkInGenBankIsolateToPrevName
-join -t$'\t' cogUkInGenBankIsolateToPrevName cogUkInGenBankIsolateToNewName \
-| cut -f 2,3 \
-    >> prevTree.renaming
-# Look for names with EPI_IDs that have just today been mapped to public sequences.
-grep EPI_ISL_ prevNames \
-| awk -F\| '{print $2 "\t" $0;}' \
-| sort \
-    > epiToPrevName
-set +o pipefail
-grep -Fwf <(cut -f 1 epiToPrevName) $epiToPublic \
-| grep -v COG-UK/ \
-| tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \
-| sort \
-    > epiToNewName
-set -o pipefail
-# Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the
-# tree both with and without EPI ID... so renaming makes a name conflict.
-# If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't
-# cause conflict.
-comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree
-join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \
-| cut -f 2,3 \
-    >> prevTree.renaming
-cp /dev/null dupsToRemove
-if [ -s epiToNewNameAlreadyInTree ]; then
-    set +o pipefail
-    grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \
-    | grep EPI_ISL \
-    | cat \
-        >> dupsToRemove
-    set -o pipefail
-fi
-# Don't rename if the final name is already in the tree; remove the dup with the old name.
-set +o pipefail
-cut -f 2 prevTree.renaming \
-| grep -Fwf - prevNames | cat \
-    > alreadyThere
-grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \
-    >> dupsToRemove
-# And finally, sometimes there are duplicates due to country or date being corrected in COG-UK
-# metadata.
-grep -vFwf dupsToRemove prevTree.renaming \
-| cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \
-| cut -d/ -f 2 \
-    > cogUkIsolateStillDup
-grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \
-| awk -F, '{print $1 "|" $5;}' \
-    > cogDupsLatestMetadata
-grep -Fwf cogUkIsolateStillDup prevNames \
-| grep -vFwf dupsToRemove \
-| grep -vFwf cogDupsLatestMetadata \
-| cat \
-    >> dupsToRemove
-set -o pipefail
-startingProtobuf=$prevProtobufMasked
-if [ -s dupsToRemove ]; then
-    startingProtobuf=prevDupTrimmed.pb
-    $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf
-fi
-
-if [ -s prevTree.renaming ]; then
-    $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out
-    $matUtils extract -i prevRenamed.pb -u prevNames
-else
-    ln -sf $startingProtobuf prevRenamed.pb
-fi
-
-# OK, now that the tree names are updated, figure out which seqs are already in there and
-# which need to be added.
-awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
-| grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
-grep -E '^(England|Northern|Scotland|Wales)' prevNames \
-| cut -d\| -f1 > prevCogUk
-awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
-| grep -E '^EPI_ISL_' > prevGisaid
-# Add public sequences that have been mapped to GISAID sequences to prevGisaid.
-grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
-grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
-wc -l prev*
-
-# Get new GenBank sequences with at least $minReal non-N bases.
-# Exclude seqs in the tree with EPI IDs that that have been mapped in the very latest $epiToPublic.
-set +o pipefail
-egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
-| grep -Fwf prevGisaid - \
-| grep -vFwf prevGbAcc \
-| cat \
-    >> prevGbAcc
-set -o pipefail
-xzcat $ncbiDir/genbank.fa.xz \
-| faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
-faSize -veryDetailed newGenBank.fa \
-| tawk '$4 < '$minReal' {print $1;}' \
-    > gbTooSmall
-# NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
-set +o pipefail
-fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
-set -o pipefail
-faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
-faSize newGenBank.filtered.fa
-
-# Get new COG-UK sequences with at least $minReal non-N bases.
-# Also exclude cog_all.fasta sequences not found in cog_metadata.csv.
-comm -23 <(fastaNames $cogUkDir/cog_all.fasta.xz | sort) \
-    <(cut -d, -f1 $cogUkDir/cog_metadata.csv | sort) \
-    > cogFaNotMeta
-# Also exclude COG-UK sequences that have been added to GenBank (cogUkInGenBank, see above).
-xzcat $cogUkDir/cog_all.fasta.xz \
-| faSomeRecords -exclude stdin <(cat prevCogUk cogFaNotMeta cogUkInGenBank) newCogUk.fa
-faSize -veryDetailed newCogUk.fa \
-| tawk '$4 < '$minReal' {print $1;}' \
-    > cogUkTooSmall
-faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
-faSize newCogUk.filtered.fa
-
-# Get new GISAID sequences with at least $minReal non-N bases.
-xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
-| sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
-| faSomeRecords -exclude stdin prevGisaid newGisaid.fa
-faSize -veryDetailed newGisaid.fa \
-| tawk '$4 < '$minReal' {print $1;}' \
-    > gisaidTooSmall
-faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
-faSize newGisaid.filtered.fa
-
-# Exclude public-mapped sequences from newGisaid:
-set +o pipefail
-fastaNames newGisaid.filtered.fa \
-| grep -Fwf - $epiToPublic \
-| cut -f 1 \
-    > newGisaid.public.names
-set -o pipefail
-if [ -s newGisaid.public.names ]; then
-    faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp
-    mv tmp newGisaid.filtered.fa
-    faSize newGisaid.filtered.fa
-fi
-cat new*.filtered.fa > new.fa
-faSize new.fa
-
-# Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
-# as a multifasta alignment (from which we can extract VCF with SNVs):
-#conda install -c bioconda mafft
-alignedFa=new.aligned.fa
-rm -f $alignedFa
-export TMPDIR=/dev/shm
-time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
-  -i new.fa \
-  -o $alignedFa \
-  -t 50
-faSize $alignedFa
-
-# Now make a renaming that keeps all the prevNames and adds full names for the new seqs.
-tawk '{print $1, $1;}' prevNames > $renaming
-if [ -s newCogUk.filtered.fa ]; then
-    # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained.
-    set +o pipefail
-    fastaNames newCogUk.filtered.fa \
-    | grep -Fwf - $cogUkDir/cog_metadata.csv \
-    | awk -F, '{print $1 "\t" $1 "|" $5;}' \
-        >> $renaming
-    set -o pipefail
+    $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf
 fi
-if [ -s newGenBank.filtered.fa ]; then
-    # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year
-    set +o pipefail
-    fastaNames newGenBank.filtered.fa \
-    | grep COG-UK/ \
-    | sed -re 's/[ |].*//' \
-    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
-    | tawk '{print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
-    | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
-        >> $renaming
-    fastaNames newGenBank.filtered.fa \
-    | grep -v COG-UK/ \
-    | sed -re 's/[ |].*//' \
-    | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
-    | tawk '{ if ($3 == "") { $3 = "?"; }
-              if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
-              else { print $1 "\t" $1 "|" $3; } }' \
-    | cleanGenbank \
-    | sed -re 's/ /_/g' \
-        >> $renaming
-    set -o pipefail
-fi
-if [ -s newGisaid.filtered.fa ]; then
-    zcat $gisaidDir/metadata_batch_$today.tsv.gz \
-    | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
-    | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
-        >> $renaming
-fi
-wc -l $renaming
-
-# Make masked VCF
-tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
-    $problematicSitesVcf > mask.vcf
-# Add masked VCF to previous protobuf
-#*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)...
-#*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites
-#*** masked more gracefully, and update the Problematic Sites track.
-time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
-| faToVcf -maxDiff=1000 \
-    -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \
-    -verbose=2 stdin stdout \
-| vcfRenameAndPrune stdin $renaming stdout \
-| vcfFilter -excludeVcf=mask.vcf stdin \
-| tawk '$2 != 21987' \
-| gzip -c \
-    > new.masked.vcf.gz
-
-fi # if [ ! -s new.masked.vcf.gz ]
 
 time $usher -u -T 80 \
     -A \
     -e 5 \
     --max-parsimony-per-sample 20 \
     -v new.masked.vcf.gz \
     -i prevRenamed.pb \
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
 
 # Exclude sequences with a very high number of EPPs from future runs
 grep ^Current usher.addNew.log \
 | awk '$16 >= 10 {print $8;}' \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     >> ../tooManyEpps.ids
 
 # Prune samples with too many private mutations and internal branches that are too long.
 $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
     -b 30 \
     -O -o gisaidAndPublic.$today.masked.pb
 
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 
-# Metadata for hgPhyloPlace:
-# Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
-echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
-    > gisaidAndPublic.$today.metadata.tsv
-# It's not always possible to recreate both old and new names correctly from metadata,
-# so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
-awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' samples.$today \
-| sort \
-    > idToName
-# NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
-grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
-| tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \
-| sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
-           s@Northern Ireland/@NorthernIreland/@;' \
-    > tmp
-# NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
-grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
-| tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
-| sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \
-| perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
-| cleanGenbank \
-| sort tmp - > gb.metadata
-if [ -e $ncbiDir/lineage_report.csv ]; then
-    echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
-    tail -n+2  $ncbiDir/lineage_report.csv \
-    | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
-    | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
-    | sort \
-        > gbToLineage
-else
-    echo Getting GenBank Pangolin lineages from $prevMeta
-    zcat $prevMeta \
-    | tail -n+2 \
-    | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
-    | sort \
-        > gbToLineage
-fi
-wc -l gbToLineage
-if [ -e $ncbiDir/nextclade.tsv ]; then
-    sort $ncbiDir/nextclade.tsv > gbToNextclade
-else
-    touch gbToNextclade
-fi
-wc -l gbToNextclade
-join -t$'\t' -a 1 gb.metadata gbToNextclade \
-| join -t$'\t' -a 1 - gbToLineage \
-| tawk '{ if ($2 == "") { $2 = "?"; }
-          print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \
-| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
-    >> gisaidAndPublic.$today.metadata.tsv
-# COG-UK metadata:
-if [ -e $cogUkDir/nextclade.tsv ]; then
-    sort $cogUkDir/nextclade.tsv > cogUkToNextclade
-else
-    touch cogUkToNextclade
-fi
-#*** Could also add sequence length to metadata from faSizes output...
-tail -n+2 $cogUkDir/cog_metadata.csv \
-| awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \
-| sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \
-| sort \
-| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
-| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
-    >> gisaidAndPublic.$today.metadata.tsv
-# CNCB metadata:
-tail -n+2 $cncbDir/cncb.metadata.tsv \
-| tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
-            print $2, "", $10, $11, $9, $5, $6} }' \
-| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
-| sort \
-| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
-| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
-| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
-    >> gisaidAndPublic.$today.metadata.tsv
-wc -l gisaidAndPublic.$today.metadata.tsv
-zcat $gisaidDir/metadata_batch_$today.tsv.gz \
-| grep -Fwf <(grep EPI_ISL samples.$today | cut -d\| -f 2) \
-| tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
-    >> gisaidAndPublic.$today.metadata.tsv
-wc -l gisaidAndPublic.$today.metadata.tsv
-gzip -f gisaidAndPublic.$today.metadata.tsv
+$scriptDir/combineMetadata.sh $prevDate $today
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 if [ -s $ottoDir/$prevDate/cladeToName ]; then
     # Use yesterday's clade assignments to annotate clades on today's tree
     time $matUtils annotate -T 50 \
         -l \