9ae312b796eca29cd110e08ecc8506ca593fcbfd angie Tue Jun 3 11:22:47 2025 -0700 Relax branch length filter again (for BA.3.2). diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index cc74e885143..3cfa1cbde34 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -48,57 +48,57 @@ gunzip -f new.masked.mpl.gz time $usher \ -T 50 -A -e 5 \ -i prevRenamed.pb.gz \ --diff new.masked.mpl \ --ref $ottoDir/NC_045512.2.fa \ -o merged.pb.gz \ --optimization_radius 0 --batch_size_per_process 10 \ > usher.addNew.log 2>usher-sampled.stderr pigz -f -p 8 new.masked.mpl # Branch-specific masking time $scriptDir/maskDelta.sh merged.pb.gz merged.deltaMasked.pb.gz # Prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i merged.deltaMasked.pb.gz \ --max-parsimony 20 \ - --max-branch-length 60 \ + --max-branch-length 70 \ --max-path-length 225 \ -O -o merged.deltaMasked.filtered.pb.gz # matOptimize: used -r 8 -M2 until 2023-05-12, then switched to Cheng's recommended # -m 0.00000001 -M 4 (avoid identical-child-node problem in # https://github.com/sars-cov-2-variants/lineage-proposals/issues/40) # The -M 4 allowed up to radius 32, and crazy things started happening all while I was # trying to get the tree cleaned up for pango-designation release 1.20 --> lineageTree. # After 2023-05-20, when I found that matOptimize had moved a big chunk of B.1 onto a # B.1.1.7 garbage branch, causing big trouble for lineageTree (23_05_18_updateLineageTreePb.txt). # After that I changed it back to -M 2 for my sanity. If the identical-child thing happens again, # then I'll probably just run matOptimize twice, with a small radius the second time. cd $ottoDir/$today && $matOptimize \ -T 64 -m 0.00000001 -M 2 -S move_log.filtered \ -i merged.deltaMasked.filtered.pb.gz \ -o gisaidAndPublic.$today.masked.preTrim.pb.gz \ >& matOptimize.filtered.log # Fix grandparent-reversion nodes that cause some lineages to be incorrectly placed as # sublineages of siblings. $matUtils fix -i gisaidAndPublic.$today.masked.preTrim.pb.gz -c 10 \ -o gisaidAndPublic.$today.masked.preTrim.fix.pb.gz # Again prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.fix.pb.gz \ --max-parsimony 20 \ - --max-branch-length 60 \ + --max-branch-length 70 \ --max-path-length 225 \ -O -o gisaidAndPublic.$today.masked.pb.gz fi # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > tooManyEpps.ids cat tooManyEpps.ids >> ../tooManyEpps.ids $matUtils extract -i gisaidAndPublic.$today.masked.pb.gz -u samples.$today $scriptDir/combineMetadata.sh $prevDate $today