83f9e814d5441d0ba86d5fa735aed67d264a771f angie Sun Jun 27 20:27:52 2021 -0700 Skip building new.masked.vcf.gz if it already exists (go straight to usher). More prevention of empty-grep crashes. Use faToVcf new -maxDiff option to prevent memory usage blowup for sequences that are so diverged/erroneous that we won't keep them anyway. Exclude sequences with 10 or more equally parsimonious placements so we don't keep trying in vain (and wasting time). Handle new nextstrain clade names with spaces etc. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index acd9fa0..63bbdc5 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -29,30 +29,34 @@ source $scriptDir/util.sh today=$(date +%F) y=$(date +%Y) m=$(date +%m) d=$(date +%d) cd $ottoDir/$today prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils +renaming=oldAndNewNames + +if [ ! -s new.masked.vcf.gz ]; then + # Make lists of sequences already in the tree. $matUtils extract -i $prevProtobufMasked -u prevNames # Before updating the tree with new sequences, update the names used in the tree: # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs # * COG-UK sequences that are in GenBank. Per Sam Nicholls the isolate alone is enough to identify # them -- no need to match country & year which are sometimes incorrect at first. grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \ | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \ | sort \ > cogUkInGenBankIsolateToNewName fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \ > cogUkInGenBank @@ -88,37 +92,46 @@ | grep -v COG-UK/ \ | tawk '{if ($4 != "" && $3 == $2) { print $1, $2 "|" $4; } else if ($4 != "") { print $1, $3 "|" $2 "|" $4; }}' \ | sort \ > epiToNewName set -o pipefail # Argh, missing sequences in COG-UK metadata can mean that a sequence may have been added to the # tree both with and without EPI ID... so renaming makes a name conflict. # If there are any of those then prune the sequences with EPI_ID or longer dates, so renaming doesn't # cause conflict. comm -12 <(cut -f 2 epiToNewName | sort) <(sort prevNames) > epiToNewNameAlreadyInTree join -t$'\t' epiToPrevName <(grep -vFwf epiToNewNameAlreadyInTree epiToNewName) \ | cut -f 2,3 \ >> prevTree.renaming cp /dev/null dupsToRemove if [ -s epiToNewNameAlreadyInTree ]; then + set +o pipefail grep -Fwf <(cut -d\| -f 1 epiToNewNameAlreadyInTree) prevNames \ | grep EPI_ISL \ + | cat \ >> dupsToRemove + set -o pipefail fi +# Don't rename if the final name is already in the tree; remove the dup with the old name. +set +o pipefail +cut -f 2 prevTree.renaming \ +| grep -Fwf - prevNames | cat \ + > alreadyThere +grep -Fwf alreadyThere prevTree.renaming | cut -f 1 \ + >> dupsToRemove # And finally, sometimes there are duplicates due to country or date being corrected in COG-UK # metadata. -set +o pipefail grep -vFwf dupsToRemove prevTree.renaming \ | cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \ | cut -d/ -f 2 \ > cogUkIsolateStillDup grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \ | awk -F, '{print $1 "|" $5;}' \ > cogDupsLatestMetadata grep -Fwf cogUkIsolateStillDup prevNames \ | grep -vFwf dupsToRemove \ | grep -vFwf cogDupsLatestMetadata \ | cat \ >> dupsToRemove set -o pipefail startingProtobuf=$prevProtobufMasked if [ -s dupsToRemove ]; then @@ -207,31 +220,30 @@ faSize new.fa # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results # as a multifasta alignment (from which we can extract VCF with SNVs): #conda install -c bioconda mafft alignedFa=new.aligned.fa rm -f $alignedFa export TMPDIR=/dev/shm time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \ -i new.fa \ -o $alignedFa \ -t 50 faSize $alignedFa # Now make a renaming that keeps all the prevNames and adds full names for the new seqs. -renaming=oldAndNewNames tawk '{print $1, $1;}' prevNames > $renaming if [ -s newCogUk.filtered.fa ]; then # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained. set +o pipefail fastaNames newCogUk.filtered.fa \ | grep -Fwf - $cogUkDir/cog_metadata.csv \ | awk -F, '{print $1 "\t" $1 "|" $5;}' \ >> $renaming set -o pipefail fi if [ -s newGenBank.filtered.fa ]; then # Special renaming for COG-UK sequences: strip COG-UK/, add back country and year set +o pipefail fastaNames newGenBank.filtered.fa \ | grep COG-UK/ \ @@ -253,44 +265,53 @@ set -o pipefail fi if [ -s newGisaid.filtered.fa ]; then zcat $gisaidDir/metadata_batch_$today.tsv.gz \ | grep -Fwf <(fastaNames newGisaid.filtered.fa) \ | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \ >> $renaming fi wc -l $renaming # Make masked VCF tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf # Add masked VCF to previous protobuf time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ -| faToVcf stdin stdout \ +| faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | vcfFilter -excludeVcf=mask.vcf stdin \ | gzip -c \ > new.masked.vcf.gz -time $usher -u -T 50 \ + +fi # if [ ! -s new.masked.vcf.gz ] + +time $usher -u -T 80 \ -A \ -e 5 \ -v new.masked.vcf.gz \ -i prevRenamed.pb \ -o gisaidAndPublic.$today.masked.preTrim.pb \ >& usher.addNew.log mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk +# Exclude sequences with a very high number of EPPs from future runs +grep ^Current usher.addNew.log \ +| awk '$16 >= 10 {print $8;}' \ +| awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ + >> ../tooManyEpps.ids + # Prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ -a 20 \ -b 30 \ -O -o gisaidAndPublic.$today.masked.pb # Metadata for hgPhyloPlace: # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them: echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \ > gisaidAndPublic.$today.metadata.tsv # It's not always possible to recreate both old and new names correctly from metadata, # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs. cut -f 2 $renaming \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \ | sort \ @@ -368,32 +389,32 @@ gzip gisaidAndPublic.$today.metadata.tsv # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.plusGisaid.txt $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf zcat gisaidAndPublic.$today.metadata.tsv.gz \ | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \ -| sed -re 's/^20E \(EU1\)/20E.EU1/;' \ - > cladeToName +| subColumn -miss=/dev/null 1 stdin ../nextcladeToShort cladeToName + time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -c cladeToName \ -u mutations.nextclade \ -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ >& annotate.nextclade.out # Add pangolin lineage annotations to protobuf. Use pangoLEARN training samples; # convert EPI IDs to public to match tree IDs. tail -n+2 ~angie/github/pango-designation/lineages.metadata.csv \ | grep -vFwf $ottoDir/clades.blackList \ | awk -F, '{print $9 "\t" $2;}' \ | sort > epiExemplarToLineage