abdea7a81b3d511adaee6300277721a804f5581f angie Tue Mar 30 11:57:47 2021 -0700 Ongoing fixes for odd characters in sequence names and other pipeline tweaks.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 8e6cf6e..5a8eb09 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -1,252 +1,261 @@
 #!/bin/bash
 set -beEu -x -o pipefail

 # Do not modify this script, modify the source tree copy:
 # kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh

 usage() {
     echo "usage: $0 prevDate problematicSitesVcf"
     echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
 }

 if [ $# != 2 ]; then
     usage
     exit 1
 fi

 prevDate=$1
 problematicSitesVcf=$2
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh

 today=$(date +%F)

 cd $ottoDir/$today

 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb

-# Use Yatish's latest usher and matUtils
 usherDir=~angie/github/yatish_usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils

 # Make lists of sequences already in the tree.
-$matUtils summary -i $prevProtobufMasked -s >(tail -n+2 | cut -f 1 > prevNames)
+# matUtils creates full paths for output files, defaulting to current directory, so can't
+# output to named pipe because it becomes $cwd//dev/fd/63 ...
+$matUtils summary -i $prevProtobufMasked -s prevSamples
+tail -n+2 prevSamples | cut -f 1 > prevNames
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^EPI_ISL_' > prevGisaid
 # Add public sequences that have been mapped to GISAID sequences to prevGisaid.
 grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
 grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
 wc -l prev*

 # Get new GenBank sequences with at least $minReal non-N bases.
+#*** TODO: exclude seqs already in tree in their GISAID forms
 xzcat $ncbiDir/genbank.fa.xz \
 | faSomeRecords -exclude stdin prevGbAcc newGenBank.fa
 faSize -veryDetailed newGenBank.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gbTooSmall
 # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
 fastaNames newGenBank.fa | grep NC_045512 >> gbTooSmall
 faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
 faSize newGenBank.filtered.fa

 # Get new COG-UK sequences with at least $minReal non-N bases.
 xzcat $cogUkDir/cog_all.fasta.xz \
 | faSomeRecords -exclude stdin prevCogUk newCogUk.fa
 faSize -veryDetailed newCogUk.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > cogUkTooSmall
 faSomeRecords -exclude newCogUk.fa cogUkTooSmall newCogUk.filtered.fa
 faSize newCogUk.filtered.fa

 # Get new GISAID sequences with at least $minReal non-N bases.
 xzcat $gisaidDir/gisaid_fullNames_$today.fa.xz \
 | sed -re 's/^>.*\|(EPI_ISL_[0-9]+)\|.*/>\1/' \
 | faSomeRecords -exclude stdin prevGisaid newGisaid.fa
 faSize -veryDetailed newGisaid.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gisaidTooSmall
 faSomeRecords -exclude newGisaid.fa gisaidTooSmall newGisaid.filtered.fa
 faSize newGisaid.filtered.fa

 # Exclude public-mapped sequences from newGisaid:
 set +o pipefail
 fastaNames newGisaid.filtered.fa \
 | grep -Fwf - $epiToPublic \
 | cut -f 1 \
     > newGisaid.public.names
 set -o pipefail
 if [ -s newGisaid.public.names ]; then
     faSomeRecords -exclude newGisaid.filtered.fa newGisaid.public.names tmp
     mv tmp newGisaid.filtered.fa
     faSize newGisaid.filtered.fa
 fi
 cat new*.filtered.fa > new.fa
 faSize new.fa

 # Use Rob's script that aligns each sequence to NC_045512.2 and concatenates the results
 # as a multifasta alignment (from which we can extract VCF with SNVs):
 #conda install -c bioconda mafft
 alignedFa=new.aligned.fa
 rm -f $alignedFa
 export TMPDIR=/dev/shm
 time bash ~angie/github/sarscov2phylo/scripts/global_profile_alignment.sh \
     -i new.fa \
     -o $alignedFa \
     -t 50
 faSize $alignedFa

 #*** TODO: if EPIs have recently been mapped to public sequences, we should convert their
 #*** tree names from EPI to public.

 # Now make a renaming that keeps all the prevNames and adds full names (with shortened dates)
 # for the new seqs.
 renaming=oldAndNewNames
 tawk '{print $1, $1;}' prevNames > $renaming
 if [ -s newCogUk.filtered.fa ]; then
     # Sometimes all of the new COG-UK sequences are missing from cog_metadata.csv -- complained.
     set +o pipefail
     fastaNames newCogUk.filtered.fa \
     | grep -Fwf - $cogUkDir/cog_metadata.csv \
     | awk -F, '{print $1 "\t" $1 "|" $5;}' \
     | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
         >> $renaming
     set -o pipefail
 fi
 if [ -s newGenBank.filtered.fa ]; then
     fastaNames newGenBank.filtered.fa \
     | sed -re 's/[ |].*//' \
     | grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | tawk '{ if ($3 == "") { $3 = "?"; } if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; } else { print $1 "\t" $1 "|" $3; } }' \
     | cleanGenbank \
     | sed -re 's/ /_/g' \
     | sed -re 's/20([0-9][0-9])(-[0-9-]+)?$/\1\2/;' \
         >> $renaming
 fi
 if [ -s newGisaid.filtered.fa ]; then
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming

 # Remove old tree's GISAID items that have been mapped to public sequences so we don't have dups
 set +o pipefail
 cut -f 1 $epiToPublic \
 | grep -Fwf - prevNames \
 | cat \
     > mappedGisaidToRemove
 set -o pipefail
 wc -l mappedGisaidToRemove
 if [ -s mappedGisaidToRemove ]; then
     $matUtils extract -i $prevProtobufMasked \
         -s mappedGisaidToRemove \
         -p \
         -o prevTrimmed.pb
 else
     cp -p $prevProtobufMasked prevTrimmed.pb
 fi

 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf

 # Add masked VCF to previous protobuf
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
 | gzip -c \
     > new.masked.vcf.gz

 time $usher -u -T 50 \
     -v new.masked.vcf.gz \
     -i prevTrimmed.pb \
     -o gisaidAndPublic.$today.masked.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.nwk

 # Metadata for hgPhyloPlace: for now, start with already-built public metadata.
 zcat public-$today.metadata.tsv.gz > gisaidAndPublic.$today.metadata.tsv
 zcat $gisaidDir/metadata_batch_$today.tsv.gz \
 | grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \
 | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 gzip gisaidAndPublic.$today.metadata.tsv

 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 sampleCountComma=$(echo $(wc -l < $renaming) \
     | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt

 # Add nextclade annotations to protobuf
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \
 | sed -re 's/^20E \(EU1\)/20E.EU1/;' \
     > cladeToName
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -c cladeToName \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade.out

 # Add pangolin lineage annotations to protobuf. Use pangoLEARN training samples;
 # convert EPI IDs to public to match tree IDs.
 tail -n+2 ~angie/github/pangoLEARN/pangoLEARN/data/lineages.metadata.csv \
 | awk -F, '{print $7 "\t" $2;}' \
 | sort > epiExemplarToLineage
 subColumn -miss=/dev/null 1 epiExemplarToLineage \
     <(cut -f 1,2 $gisaidDir/epiToPublicAndDate.$today) stdout \
 | sort > idExemplarToLineage
 cut -f 2 $renaming | grep -Fwf <(cut -f 1 idExemplarToLineage) \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \
 | sort > idExemplarToName
 join -t$'\t' idExemplarToName idExemplarToLineage \
 | tawk '{print $3, $2;}' \
 | sort > lineageToName
-time $matUtils annotate -T 50 \
+linFile=lineageToName
+
+# Until pangoLEARN training is done, use pango-designation file for now.
+#linFile=/hive/users/angie/matLineages/linToPublicPlusGisaidName.2021-03-16
+
+#***
+##***time $matUtils annotate -T 50 \
+time ~angie/github/usher/build/matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
-    -c lineageToName \
+    -c $linFile \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb

 # Extract public samples from tree
-$matUtils summary -i gisaidAndPublic.$today.masked.pb -s >(tail -n+2 | cut -f 1 > newNames)
+$matUtils summary -i gisaidAndPublic.$today.masked.pb -s newSamples
+tail -n+2 newSamples | cut -f 1 > newNames
 grep -v EPI_ISL_ newNames > newPublicNames
 $matUtils extract -i gisaidAndPublic.$today.masked.pb \
     -s newPublicNames \
     -O -o public.$today.masked.pb

 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
 done
-# Clean up
-nice xz -f new*fa &
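The central fix in this commit replaces bash process substitution with a real output file around matUtils summary -s. The sketch below is illustrative only, not part of the commit: the prevSamples contents and its header line are made up stand-ins (not matUtils' actual output format); only the >(...) expansion behavior and the tail/cut post-processing mirror the script above.

#!/bin/bash
set -eu

# bash hands >(...) to a command as a /dev/fd path:
echo >(cat)                     # prints something like /dev/fd/63

# A program that prepends its default output directory to that argument
# effectively opens "$PWD//dev/fd/63", which does not exist:
fd=$(echo >(cat))
ls "$PWD/$fd" || true           # "No such file or directory"

# Hence the switch to a plain file plus post-processing (fake two-column
# contents here, just to show the header-stripping step):
printf 'sample\tscore\nUSA/CA-FAKE/2020|MN000000.1|20-03-15\t2\n' > prevSamples
tail -n+2 prevSamples | cut -f 1 > prevNames    # drop header, keep column 1
cat prevNames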