543b4946d460da38fd7dae2c3bfb3ece9af03218 angie Tue Nov 2 12:53:34 2021 -0700 Add the new -M (--clade-mutations) option to matUtils annotate for precise (yet hopefully more stable than --clade-paths) identification of lineage root nodes. Also clip input columns when tacking on clade & lineage from tree at end, so we can run multiple times without gaining extra columns. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 2770026..a374224 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -63,79 +63,82 @@ # version/description files cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.plusGisaid.txt sampleCountComma=$(echo $(wc -l < samples.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.plusGisaid.description.txt # Add nextclade annotations to protobuf if [ -s $ottoDir/$prevDate/cladeToName ]; then # Use yesterday's clade assignments to annotate clades on today's tree time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ + -M $scriptDir/nextstrain.clade-mutations.tsv \ -c $ottoDir/$prevDate/cladeToName \ -f 0.95 \ -D details.nextclade \ -o gisaidAndPublic.$today.masked.nextclade.pb \ >& annotate.nextclade else time $matUtils annotate -T 50 \ -l \ -i gisaidAndPublic.$today.masked.pb \ -P ../nextstrain.clade-paths.tsv \ -o gisaidAndPublic.$today.masked.nextclade.pb fi # Add pangolin lineage annotations to protobuf. if [ -s $ottoDir/$prevDate/lineageToName ]; then time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ + -M $scriptDir/pango.clade-mutations.tsv \ -c $ottoDir/$prevDate/lineageToName \ -f 0.95 \ -D details.pango \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \ >& annotate.pango else time $matUtils annotate -T 50 \ -i gisaidAndPublic.$today.masked.nextclade.pb \ -P ../pango.clade-paths.tsv \ -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb fi # Replace protobuf with annotated protobuf. mv gisaidAndPublic.$today.masked{,.unannotated}.pb ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb # Save clade & lineage annotations for use tomorrow. $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades tail -n+2 sample-clades \ | tawk '{print $2, $1;}' \ | sort > cladeToName tail -n+2 sample-clades \ | tawk '{print $3, $1;}' \ | sort > lineageToName # Add clade & lineage from tree to metadata. zcat gisaidAndPublic.$today.metadata.tsv.gz \ | tail -n+2 \ +| cut -f 1-9 \ | sort > tmp1 tail -n+2 sample-clades \ | sort > tmp2 -paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | head -1) \ +paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | cut -f 1-9 | head -1) \ <(echo -e "Nextstrain_clade_usher\tpango_lineage_usher") \ > gisaidAndPublic.$today.metadata.tsv join -t$'\t' tmp1 tmp2 \ >> gisaidAndPublic.$today.metadata.tsv gzip -f gisaidAndPublic.$today.metadata.tsv rm tmp1 tmp2 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have # public names & IDs, we can match them. cut -f 1,3 $epiToPublic > epiToPublic.latest # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \