8e712a67beb687070337c4739f6a3664c5cc8225
angie
  Wed Aug 14 13:10:23 2024 -0700
Fix lineage check to compare designated lineages against the lineages actually annotated on the tree, not just the lineages present in pango.clade-mutations.tsv.

diff --git src/hg/utils/otto/sarscov2phylo/updatePublic.sh src/hg/utils/otto/sarscov2phylo/updatePublic.sh
index fbdd66d..446dbf5 100755
--- src/hg/utils/otto/sarscov2phylo/updatePublic.sh
+++ src/hg/utils/otto/sarscov2phylo/updatePublic.sh
@@ -45,35 +45,40 @@
 buildDir=$ottoDir/$today
 mkdir -p $buildDir
 cd $buildDir
 
 time $scriptDir/updateCombinedTree.sh $prevDate $today $problematicSitesVcf \
     >& updateCombinedTree.log
 
 echo ""
 cat hgPhyloPlace.description.txt
 cat hgPhyloPlace.plusGisaid.description.txt
 
 set +o pipefail
 grep skip annotate.pango annotate.nextclade | cat
 grep 'Could not' annotate.pango annotate.nextclade | cat
 
-# Check for newly added lineages that are missing from pango.clade-mutations.tsv
+# Check for designated lineages that are not annotated on the tree, and for annotated lineages that are not designated.
 set +x
 lineages=~angie/github/pango-designation/lineages.csv
-tail -n+2 $lineages | cut -d, -f 2 | uniq | grep -E '^(AY|[B-Z][A-Z])' | sort -u \
-    > $TMPDIR/designatedDoubleLetters
-cut -f 1 $scriptDir/pango.clade-mutations.tsv  \
-| grep -E '^(AY|[B-Z][A-Z])' | grep -v _ | sort -u \
-    > $TMPDIR/cladeMutDoubleLetters
-missingLineages=$(comm -23 $TMPDIR/designatedDoubleLetters $TMPDIR/cladeMutDoubleLetters)
-if [[ "$missingLineages" != "" ]]; then
-    echo "LINEAGES MISSING FROM lineages.csv:"
-    echo $missingLineages
+tail -n+2 $lineages | cut -d, -f 2 | uniq | sort -u \
+    > $TMPDIR/designatedLineages
+cut -f 1 $buildDir/clade-paths | egrep '^[A-Z]' | grep -v _ | sort \
+    > $TMPDIR/annotatedLineages
+designatedNotAnnotated=$(comm -23 $TMPDIR/designatedLineages $TMPDIR/annotatedLineages \
+                         | grep -vFwf $scriptDir/designatedNotAnnotated | cat)
+if [[ "$designatedNotAnnotated" != "" ]]; then
+    echo "MISSING LINEAGES:"
+    echo "$designatedNotAnnotated"
+else
+    echo "No unexpectedly missing lineages, good."
 fi
-extraLineages=$(comm -13 $TMPDIR/designatedDoubleLetters $TMPDIR/cladeMutDoubleLetters)
-if [[ "$extraLineages" != "" ]]; then
-    echo "EXTRA LINEAGES (withdrawn?) in pango.clade-mutations.tsv:"
-    echo $extraLineages
+annotatedNotDesignated=$(comm -13 $TMPDIR/designatedLineages $TMPDIR/annotatedLineages \
+                         | grep -vFwf $scriptDir/annotatedNotDesignated | cat)
+if [[ "$annotatedNotDesignated" != "" ]]; then
+    echo "EXTRA LINEAGES (withdrawn?) in tree:"
+    echo "$annotatedNotDesignated"
+else
+    echo "No extra lineages, good."
 fi
 set -o pipefail
 set -x