7f46acdf8f879de28c3ba40534e58a881941e168
angie
  Sun Feb 4 09:24:53 2024 -0800
CNCB is updated daily so make version.txt more concise.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 286b5a4..8932313 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -72,78 +72,77 @@
     time $matOptimize \
         -T 80 -m 0.00000001 -M 2 -S move_log.filtered \
         -i merged.deltaMasked.filtered.pb \
         -o gisaidAndPublic.$today.masked.preTrim.pb \
         >& matOptimize.filtered.log
 
     # Fix grandparent-reversion nodes that cause some lineages to be incorrectly placed as
     # sublineages of siblings.
     $matUtils fix -i gisaidAndPublic.$today.masked.preTrim.pb -c 10 \
         -o gisaidAndPublic.$today.masked.preTrim.fix.pb
 
     # Again prune samples with too many private mutations and internal branches that are too long.
     $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.fix.pb \
         --max-parsimony 20 \
         --max-branch-length 60 \
-        --max-path-length 175 \
+        --max-path-length 225 \
         -O -o gisaidAndPublic.$today.masked.pb
 fi
 
 # Exclude sequences with a very high number of EPPs from future runs
 grep ^Current usher.addNew.log \
 | awk '$16 >= 10 {print $8;}' \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     > tooManyEpps.ids
 cat tooManyEpps.ids >> ../tooManyEpps.ids
 
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 
 $scriptDir/combineMetadata.sh $prevDate $today
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
-echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
+echo "sarscov2phylo release 13-11-20; GISAID, NCBI, COG-UK and CNCB sequences downloaded $today" \
     > version.plusGisaid.txt
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf (completely specified by nextstrain.clade-mutations.tsv)
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -P $ottoDir/$prevDate/cladeToPath \
     -M $scriptDir/nextstrain.clade-mutations.tsv \
     -D details.nextclade \
     -o gisaidAndPublic.$today.masked.nextclade.pb \
     >& annotate.nextclade
 
 # Add pangolin lineage annotations to protobuf.
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
     -P $ottoDir/$prevDate/lineageToPath \
     -M $scriptDir/pango.clade-mutations.tsv \
     -c $ottoDir/$prevDate/lineageToName \
     -f 0.95 \
     -D details.pango \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb \
     >& annotate.pango
 
 # Replace protobuf with annotated protobuf.
-mv gisaidAndPublic.$today.masked{,.unannotated}.pb
-ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
+mv gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # Save paths and annotations for use tomorrow.
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -C clade-paths
 tail -n+2 clade-paths \
 | grep -E '^[12]' \
 | cut -f 1,3 > cladeToPath
 tail -n+2 clade-paths \
 | grep -E '^[A-Za-z]' \
 | cut -f 1,3 > lineageToPath
 $matUtils summary -i gisaidAndPublic.$today.masked.pb -C sample-clades
 tail -n+2 sample-clades \
 | tawk '{print $2, $1;}' \
 | sort > cladeToName
 tail -n+2 sample-clades \
 | tawk '{print $3, $1;}' \