4efa4aebadc39cb64a256c3489035b00bd2abdc6 angie Wed Sep 8 12:06:50 2021 -0700 Remove dependency on reference.fa from aligning new sequences; add tree-annotated Nextstrain clade and Pango lineage to metadata. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index fee47a9..f867a2c 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -1,47 +1,47 @@ #!/bin/bash set -beEu -x -o pipefail # Do not modify this script, modify the source tree copy: # kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh usage() { - echo "usage: $0 prevDate problematicSitesVcf" + echo "usage: $0 prevDate today problematicSitesVcf" echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated." } -if [ $# != 2 ]; then +if [ $# != 3 ]; then usage exit 1 fi prevDate=$1 -problematicSitesVcf=$2 +today=$2 +problematicSitesVcf=$3 ottoDir=/hive/data/outside/otto/sarscov2phylo ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh -today=$(date +%F) - +mkdir -p $ottoDir/$today cd $ottoDir/$today prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils renaming=oldAndNewNames if [ ! -s new.masked.vcf.gz ]; then # Make lists of sequences already in the tree. $matUtils extract -i $prevProtobufMasked -u prevNames @@ -298,38 +298,39 @@ -o gisaidAndPublic.$today.masked.preTrim.pb \ >& usher.addNew.log mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ >> ../tooManyEpps.ids # Prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ -b 30 \ -O -o gisaidAndPublic.$today.masked.pb +$matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today + # Metadata for hgPhyloPlace: # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them: echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \ > gisaidAndPublic.$today.metadata.tsv # It's not always possible to recreate both old and new names correctly from metadata, # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs. -cut -f 2 $renaming \ -| awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \ +awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' samples.$today \ | sort \ > idToName # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \ | sed -re 's@COG-UK/@@g; s/United Kingdom://g; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s@Northern Ireland/@NorthernIreland/@;' \ > tmp # NCBI metadata for non-COG-UK (strip colon-separated location after country if present): grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \ | sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \ | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \ | cleanGenbank \ | sort tmp - > gb.metadata @@ -375,41 +376,40 @@ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv # CNCB metadata: tail -n+2 $cncbDir/cncb.metadata.tsv \ | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") { print $2, "", $10, $11, $9, $5, $6} }' \ | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@; s/Sapiens/sapiens/;' \ | sort \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv wc -l gisaidAndPublic.$today.metadata.tsv zcat $gisaidDir/metadata_batch_$today.tsv.gz \ -| grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \ +| grep -Fwf <(grep EPI_ISL samples.$today | cut -d\| -f 2) \ | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \ >> gisaidAndPublic.$today.metadata.tsv wc -l gisaidAndPublic.$today.metadata.tsv gzip -f gisaidAndPublic.$today.metadata.tsv # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.plusGisaid.txt -$matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf if [ -s $ottoDir/$prevDate/cladeToName ]; then # Use yesterday's clade assignments to annotate clades on today's tree time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -c $ottoDir/$prevDate/cladeToName \ -f 0.95 \ -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ @@ -439,40 +439,55 @@ fi # Replace protobuf with annotated protobuf. mv gisaidAndPublic.$today.masked{,.unannotated}.pb ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb # Save clade & lineage annotations for use tomorrow. $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades tail -n+2 sample-clades \ | tawk '{print $2, $1;}' \ | sort > cladeToName tail -n+2 sample-clades \ | tawk '{print $3, $1;}' \ | sort > lineageToName +# Add clade & lineage from tree to metadata. +zcat gisaidAndPublic.$today.metadata.tsv.gz \ +| tail -n+2 \ +| sort > tmp1 +tail -n+2 sample-clades \ +| sort > tmp2 +paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | head -1) \ + <(echo -e "Nextstrain_clade_usher\tpango_lineage_usher") \ + > gisaidAndPublic.$today.metadata.tsv +join -t$'\t' tmp1 tmp2 \ + >> gisaidAndPublic.$today.metadata.tsv +gzip -f gisaidAndPublic.$today.metadata.tsv +rm tmp1 tmp2 + # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have # public names & IDs, we can match them. cut -f 1,3 $epiToPublic > epiToPublic.latest # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \ $dir/public.plusGisaid.latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt ln -sf `pwd`/epiToPublic.latest $dir/ done # Make Taxodium-formatted protobuf for display zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz > ncbiGenes.gtf +zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa zcat gisaidAndPublic.$today.metadata.tsv.gz > metadata.tmp.tsv time $matUtils extract -i gisaidAndPublic.$today.masked.pb \ - -f reference.fa \ + -f wuhCor1.fa \ -g ncbiGenes.gtf \ -M metadata.tmp.tsv \ --write-taxodium gisaidAndPublic.$today.masked.taxodium.pb -rm metadata.tmp.tsv +rm metadata.tmp.tsv wuhCor1.fa gzip gisaidAndPublic.$today.masked.taxodium.pb $scriptDir/extractPublicTree.sh $today