3da34ac26ab54ab67a772579de61222237f99e85
angie
  Tue Jun 3 11:22:08 2025 -0700
Exempt a couple lineages with few representatives, all of which have reversions, from the reversion-pruner.  Update to use .pb.gz instead of .pb.

diff --git src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
index 0e4cdbd1625..9459bda28b0 100755
--- src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
+++ src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
@@ -1,119 +1,156 @@
 #!/bin/bash
 source ~/.bashrc
 set -beEu -x -o pipefail
 
 #	Do not modify this script, modify the source tree copy:
 #	kent/src/hg/utils/otto/sarscov2phylo/updateLineageTreePb.sh
 
 usage() {
-    echo "usage: $0 buildDate"
+    echo "usage: $0 buildDate [tree.pb.gz]"
 }
 
-if [ $# != 1 ]; then
+if (( $# != 1 && $# != 2 )); then
   usage
   exit 1
 fi
 
 buildDate=$1
+if [ $# == 2 ]; then
+    startingTree=$2
+else
+    startingTree=gisaidAndPublic.$buildDate.masked.pb.gz
+fi
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 
 usherDir=~angie/github/usher
 matUtils=$usherDir/build/matUtils
 
 today=$(date +%F)
 cd $ottoDir/$buildDate
 
 # Remove sequences that have two or more reversions relative to their assigned clade/lineage.
-$matUtils extract -i gisaidAndPublic.$buildDate.masked.pb --node-stats node-stats
-tawk '$5 > 1 {print $1;}' node-stats > pruneRevs
-$matUtils extract -i gisaidAndPublic.$buildDate.masked.pb \
-    -p -s pruneRevs -O -o gisaidAndPublic.$buildDate.masked.pruneRevs.pb
+$matUtils summary -i $startingTree --node-stats node-stats
+#*** Until BA.2.3.22 gets more samples that don't have bogus reversion on 25000 and 26577,
+#*** exempt some samples; also, most of JP.1 has rev on 27383,27384:
+cat > pruneRevsExemptions <<EOF
+Germany/HH-RKI-I-1073418/2022|EPI_ISL_16330648|2022-12-18
+Germany/SH-RKI-I-1071936/2022|EPI_ISL_16329344|2022-12-05
+Germany/SH-RKI-I-1074649/2022|EPI_ISL_16396486|2022-12-12
+Germany/SH-RKI-I-1071904/2022|EPI_ISL_16329314|2022-12-05
+Germany/SH-RKI-I-1071924/2022|EPI_ISL_16329332|2022-12-05
+Germany/SH-RKI-I-1071920/2022|EPI_ISL_16329328|2022-12-05
+Germany/SH-RKI-I-1071899/2022|EPI_ISL_16329310|2022-12-05
+Germany/HH-RKI-I-1029775/2022|EPI_ISL_15901009|2022-11-13
+Germany/HH-RKI-I-1105132/2023|EPI_ISL_16940514|2023-01-27
+OY160699.1
+OY148358.1
+OY125285.1
+OY165180.1
+OY176754.1
+OY146516.1
+OY142598.1
+OY124642.1
+OY143123.1
+OY151190.1
+SouthAfrica/NICD-N55605/2023|EPI_ISL_17859330|2023-05-22
+SouthAfrica/NICD-N56015/2023|EPI_ISL_18125227|2023-06-15
+SouthAfrica/NICD-N56013/2023|EPI_ISL_18125226|2023-06-15
+SouthAfrica/NICD-R10184/2023|EPI_ISL_18341856|2023-09-05
+SouthAfrica/NICD-R10169/2023|EPI_ISL_18341854|2023-09-05
+SouthAfrica/NICD-R10455/2023|EPI_ISL_18341867|2023-09-12
+EOF
+tawk '$5 > 1 {print $1;}' node-stats \
+| grep -vFwf pruneRevsExemptions \
+    > pruneRevs
+$matUtils extract -i $startingTree \
+    -p -s pruneRevs -O -o gisaidAndPublic.$buildDate.masked.pruneRevs.pb.gz
 
 # Get node ID for root of lineage A, used as reference/root by Pangolin:
-$matUtils extract -i gisaidAndPublic.$buildDate.masked.pruneRevs.pb -C clade-paths.prunedRevs
+$matUtils extract -i gisaidAndPublic.$buildDate.masked.pruneRevs.pb.gz -C clade-paths.prunedRevs
 lineageARoot=$(grep ^A$'\t' clade-paths.prunedRevs | cut -f 2)
 
 # Reroot protobuf to lineage A and restrict to low mutation density (highly supported nodes):
-$matUtils extract -i gisaidAndPublic.$buildDate.masked.pruneRevs.pb \
+$matUtils extract -i gisaidAndPublic.$buildDate.masked.pruneRevs.pb.gz \
     --reroot $lineageARoot \
     --max-mutation-density 2 \
-    -O -o gisaidAndPublic.$buildDate.masked.reroot.pb
+    -O -o gisaidAndPublic.$buildDate.masked.reroot.pb.gz
 
 # Reroot pango.clade-mutations.tsv
 grep -w ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/T28144C( > )?//;  s/C8782T( > )?//;' \
     > pango.clade-mutations.reroot.tsv
 grep -vw ^A $scriptDir/pango.clade-mutations.tsv \
 | sed -re 's/\t/\tT8782C > C28144T > /;' \
+| sed -re 's/T8782C >([ >ACGTN0-9,]+)C8782T/\1/; s/> +>/>/;' \
     >> pango.clade-mutations.reroot.tsv
 
 # Mask additional bases at the beginning and end of the genome that pangolin masks after
 # aligning input sequences.
 set +x
 for ((i=56;  $i <= 265;  i++)); do
     echo -e "N${i}N"
 done > maskPangoEnds
 for ((i=29674;  $i < 29804;  i++)); do
     echo -e "N${i}N"
 done >> maskPangoEnds
 set -x
-$matUtils mask -i gisaidAndPublic.$buildDate.masked.reroot.pb \
-    -m maskPangoEnds -o gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb
+$matUtils mask -i gisaidAndPublic.$buildDate.masked.reroot.pb.gz -c \
+    -m maskPangoEnds -o gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb.gz
 
 # Preserve lineage annotations that survived rerooting and pango-masking
-$matUtils extract -i gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb \
+$matUtils extract -i gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb.gz \
     -C clade-paths.reroot.pangoMasked
 tail -n+2 clade-paths.reroot.pangoMasked \
 | grep '^[A-Za-z]' \
 | cut -f 1,3 > lineageToPath.reroot.pangoMasked
 
 # Assign updated lineages on the rerooted & pango-masked tree, pango-only for pangolin:
-time $matUtils annotate -T 80 \
-    -i gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb \
+time $matUtils annotate -T 50 \
+    -i gisaidAndPublic.$buildDate.masked.reroot.pangoMasked.pb.gz \
     -P lineageToPath.reroot.pangoMasked \
     -M pango.clade-mutations.reroot.tsv \
     -l \
     -c lineageToName \
     -f 0.95 \
     -u mutations.pangoOnly \
     -D details.pangoOnly \
-    -o gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \
+    -o gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb.gz \
     >& annotate.pangoOnly.out
 
 set +o pipefail
 grep 'Could not' annotate.pangoOnly.out | cat
 grep skip annotate.pangoOnly.out | cat
 set -o pipefail
 
 # Make a bunch of smaller trees and see how they do.
 mkdir -p /hive/users/angie/lineageTreeUpdate.$today
 cd /hive/users/angie/lineageTreeUpdate.$today
 for i in 0 1 2 3 4 5 6 7 8 9; do
     echo test.50.$i
-    $matUtils extract -i $ottoDir/$buildDate/gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb \
+    $matUtils extract -i $ottoDir/$buildDate/gisaidAndPublic.$buildDate.masked.reroot.pangoOnly.pb.gz \
         -r 50 -o test.50.$i.pb
     $matUtils mask -i test.50.$i.pb -S -o test.50.$i.simp.pb
 done
 
 # Run pangolin on each subtree
 # 7.5-14.5m each job, ~3.5hrs total:
 conda activate pangolin
 for i in 0 1 2 3 4 5 6 7 8 9; do
     echo test.50.$i.simp
-    time pangolin -t 80 --usher-tree test.50.$i.simp.pb \
+    time pangolin -t 80 --usher-tree test.50.$i.simp.pb --skip-scorpio \
         --skip-designation-cache --no-temp --outdir subset_10000_0.pusher.test.50.$i.simp.out \
         ../pangolin_eval/subset_10000_0.fa
 done
 
 # Summarize results
 for i in 0 1 2 3 4 5 6 7 8 9; do
     echo test.50.$i.simp
     tail -n+2 subset_10000_0.pusher.test.50.$i.simp.out/lineage_report.csv \
     | awk -F, '{print $1 "\t" $2;}' \
     | sort \
         > subset_10000_0.pusher.test.50.$i.simp
     join -t$'\t' ../pangolin_eval/subset_10000_0.cogNameToLin subset_10000_0.pusher.test.50.$i.simp \
     | tawk '$2 != $3' \
         > subset_10000_0.pusher.test.50.$i.simp.diff
     cut -f 2,3 subset_10000_0.pusher.test.50.$i.simp.diff \