39d3f511f6022a1b86ba79a68b19b3878e50db89 angie Wed Sep 8 12:07:05 2021 -0700 New script for generating lineageTree.pb used by pangolin --usher: re-root big tree to lineage A, generate random-representative subtrees, test & pick least discordant tree. diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh new file mode 100755 index 0000000..afbf71f --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh @@ -0,0 +1,107 @@ +#!/bin/bash +source ~/.bashrc +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh + +usage() { + echo "usage: $0 buildDate" +} + +if [ $# != 1 ]; then + usage + exit 1 +fi + +buildDate=$1 + +ottoDir=/hive/data/outside/otto/sarscov2phylo + +usherDir=~angie/github/usher +matUtils=$usherDir/build/matUtils + +today=$(date +%F) +cd $ottoDir/$buildDate + +# Get full sample name for lineage A sequence used as reference/root by Pangolin: +wh4SampleName=$(grep LR757995 samples.$buildDate) + +# Reroot protobuf: +$matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \ + --reroot "$wh4SampleName" \ + -o gisaidAndPublic.$buildDate.masked.reroot.pb + +# Assign updated lineages on the rerooted tree, pango-only for pangolin: +time $matUtils annotate -T 50 \ + -i gisaidAndPublic.$buildDate.masked.reroot.pb \ + -l \ + -c lineageToName \ + -f 0.95 \ + -u mutations.pangoOnly \ + -D details.pangoOnly \ + -o gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \ + >& annotate.pangoOnly.out + +set +o pipefail +grep 'Could not' annotate.pangoOnly.out | cat +set -o pipefail + +# Make a bunch of smaller trees and see how they do. +mkdir /hive/users/angie/lineageTreeUpdate.$today +cd /hive/users/angie/lineageTreeUpdate.$today +for i in 0 1 2 3 4 5 6 7 8 9; do + echo test.50.$i + $matUtils extract -i $ottoDir/$buildDate/gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \ + -r 50 -o test.50.$i.pb + $matUtils mask -i test.50.$i.pb -S -o test.50.$i.simp.pb +done + +# Run pangolin on each subtree +# 7.5-14.5m each job, ~3.5hrs total: +conda activate pangolin +for i in 0 1 2 3 4 5 6 7 8 9; do + echo test.50.$i + time pangolin -t 50 --usher-tree test.50.$i.pb \ + --skip-designation-hash --no-temp --outdir subset_10000_0.pusher.test.50.$i.out \ + ../pangolin_eval/subset_10000_0.fa + echo test.50.$i.simp + time pangolin -t 50 --usher-tree test.50.$i.simp.pb \ + --skip-designation-hash --no-temp --outdir subset_10000_0.pusher.test.50.$i.simp.out \ + ../pangolin_eval/subset_10000_0.fa +done + +# Summarize results +for i in 0 1 2 3 4 5 6 7 8 9; do + echo test.50.$i + tail -n+2 subset_10000_0.pusher.test.50.$i.out/lineage_report.csv \ + | awk -F, '{print $1 "\t" $2;}' \ + | sort \ + > subset_10000_0.pusher.test.50.$i + join -t$'\t' ../pangolin_eval/subset_10000_0.cogNameToLin subset_10000_0.pusher.test.50.$i \ + | tawk '$2 != $3' \ + > subset_10000_0.pusher.test.50.$i.diff + cut -f 2,3 subset_10000_0.pusher.test.50.$i.diff \ + | sort | uniq -c | sort -nr > subset_10000_0.pusher.test.50.$i.diffrank + tail -n+2 subset_10000_0.pusher.test.50.$i.simp.out/lineage_report.csv \ + | awk -F, '{print $1 "\t" $2;}' \ + | sort \ + > subset_10000_0.pusher.test.50.$i.simp + join -t$'\t' ../pangolin_eval/subset_10000_0.cogNameToLin subset_10000_0.pusher.test.50.$i.simp \ + | tawk '$2 != $3' \ + > subset_10000_0.pusher.test.50.$i.simp.diff + cut -f 2,3 subset_10000_0.pusher.test.50.$i.simp.diff \ + | sort | uniq -c | sort -nr > subset_10000_0.pusher.test.50.$i.simp.diffrank +done + +wc -l subset_10000_0.*.diff | sort -n + +# Pick the subtree with the fewest differences +set +o pipefail +bestTree=$(wc -l subset_10000_0.*.diff | sort -n | head -1 \ + | sed -re 's/.*pusher\.test\.50\.//; s/\.diff//') +set -o pipefail +echo $bestTree + +# Which lineages have the most differences? +head subset_10000_0.pusher.test.50.$bestTree.diffrank