8446663b24ef2f8db0695d970f7c6b4a03a9949f angie Mon Apr 26 15:33:26 2021 -0700 Use matUtils annotate new options -m and -D to get mutations and details files. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index ccc5640..64b644c 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -21,31 +21,31 @@ ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) cd $ottoDir/$today prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb -usherDir=~angie/github/yatish_usher +usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils # Make lists of sequences already in the tree. # matUtils creates full paths for output files, defaulting to current directory, so can't # output to named pipe because it becomes $cwd//dev/fd/63 ... $matUtils summary -i $prevProtobufMasked -s prevSamples tail -n+2 prevSamples | cut -f 1 > prevNames awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^EPI_ISL_' > prevGisaid # Add public sequences that have been mapped to GISAID sequences to prevGisaid. @@ -199,57 +199,59 @@ $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf zcat gisaidAndPublic.$today.metadata.tsv.gz \ | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \ | sed -re 's/^20E \(EU1\)/20E.EU1/;' \ > cladeToName time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -c cladeToName \ + -m mutations.nextclade \ + -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ >& annotate.nextclade.out # Add pangolin lineage annotations to protobuf. Use pangoLEARN training samples; # convert EPI IDs to public to match tree IDs. tail -n+2 ~angie/github/pangoLEARN/pangoLEARN/data/lineages.metadata.csv \ | awk -F, '{print $7 "\t" $2;}' \ | sort > epiExemplarToLineage subColumn -miss=/dev/null 1 epiExemplarToLineage \ <(cut -f 1,2 $gisaidDir/epiToPublicAndDate.$today) stdout \ | sort > idExemplarToLineage cut -f 2 $renaming | grep -Fwf <(cut -f 1 idExemplarToLineage) \ | awk -F\| '{ if ($3 == "") { print $1 "\t" $0 } else { print $2 "\t" $0; } }' \ | sort > idExemplarToName join -t$'\t' idExemplarToName idExemplarToLineage \ | tawk '{print $3, $2;}' \ | sort > lineageToName linFile=lineageToName # Until pangoLEARN training is done, use pango-designation file for now. #linFile=/hive/users/angie/matLineages/linToPublicPlusGisaidName.2021-03-16 -#*** -##***time $matUtils annotate -T 50 \ -time ~angie/github/usher/build/matUtils annotate -T 50 \ +time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ -c $linFile \ + -m mutations.pangolin \ + -D details.pangolin \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \ >& annotate.pangolin.out mv gisaidAndPublic.$today.masked{,.unannotated}.pb ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb # Extract public samples from tree $matUtils summary -i gisaidAndPublic.$today.masked.pb -s newSamples tail -n+2 newSamples | cut -f 1 > newNames grep -v EPI_ISL_ newNames > newPublicNames $matUtils extract -i gisaidAndPublic.$today.masked.pb \ -s newPublicNames \ -O -o public.$today.masked.pb for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do