543b4946d460da38fd7dae2c3bfb3ece9af03218
angie
  Tue Nov 2 12:53:34 2021 -0700
Add the new -M (--clade-mutations) option to matUtils annotate for precise (yet hopefully more stable than --clade-paths) identification of lineage root nodes.  Also clip input columns when tacking on clade & lineage from tree at end, so we can run multiple times without gaining extra columns.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 2770026..a374224 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -63,79 +63,82 @@
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 if [ -s $ottoDir/$prevDate/cladeToName ]; then
     # Use yesterday's clade assignments to annotate clades on today's tree
     time $matUtils annotate -T 50 \
         -l \
         -i gisaidAndPublic.$today.masked.pb \
+        -M $scriptDir/nextstrain.clade-mutations.tsv \
         -c $ottoDir/$prevDate/cladeToName \
         -f 0.95 \
         -D details.nextclade \
         -o gisaidAndPublic.$today.masked.nextclade.pb \
         >& annotate.nextclade
 else
     time $matUtils annotate -T 50 \
         -l \
         -i gisaidAndPublic.$today.masked.pb \
         -P ../nextstrain.clade-paths.tsv \
         -o gisaidAndPublic.$today.masked.nextclade.pb
 fi
 
 # Add pangolin lineage annotations to protobuf.
 if [ -s $ottoDir/$prevDate/lineageToName ]; then
     time $matUtils annotate -T 50 \
         -i gisaidAndPublic.$today.masked.nextclade.pb \
+        -M $scriptDir/pango.clade-mutations.tsv \
         -c $ottoDir/$prevDate/lineageToName \
         -f 0.95 \
         -D details.pango \
         -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
         >& annotate.pango
 else
     time $matUtils annotate -T 50 \
         -i gisaidAndPublic.$today.masked.nextclade.pb \
         -P ../pango.clade-paths.tsv \
         -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb
 fi
 
 # Replace protobuf with annotated protobuf.
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
 ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # Save clade & lineage annotations for use tomorrow.
 $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades
 tail -n+2 sample-clades \
 | tawk '{print $2, $1;}' \
 | sort > cladeToName
 tail -n+2 sample-clades \
 | tawk '{print $3, $1;}' \
 | sort > lineageToName
 
 # Add clade & lineage from tree to metadata.
 zcat gisaidAndPublic.$today.metadata.tsv.gz \
 | tail -n+2 \
+| cut -f 1-9 \
 | sort > tmp1
 tail -n+2 sample-clades \
 | sort > tmp2
-paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | head -1) \
+paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | cut -f 1-9 | head -1) \
       <(echo -e "Nextstrain_clade_usher\tpango_lineage_usher") \
     > gisaidAndPublic.$today.metadata.tsv
 join -t$'\t' tmp1 tmp2 \
     >> gisaidAndPublic.$today.metadata.tsv
 gzip -f gisaidAndPublic.$today.metadata.tsv
 rm tmp1 tmp2
 
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \