129db42d275d159115a7aef3d6377abb5d8e6c60 angie Tue Nov 14 17:08:20 2023 -0800 Reduce number of threads for usher-sampled -- Cheng's suggestion after mysterious hang on 2023-10-05. Use new matUtils fix subcommand to get rid of annoying grandparent-reversions. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index b2045ef..8fdf8df 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -34,61 +34,66 @@ cd $ottoDir/$today usherDir=~angie/github/usher usher=$usherDir/build/usher-sampled matUtils=$usherDir/build/matUtils matOptimize=$usherDir/build/matOptimize if [ ! -s new.masked.vcf.gz ]; then $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf $baseProtobuf fi if [ ! -s gisaidAndPublic.$today.masked.pb ]; then # $scriptDir/usherClusterRun.sh $today # Instead of the cluster, use Cheng's blazingly fast new usher-sampled: time $usher \ - -T 64 -A -e 5 \ + -T 50 -A -e 5 \ -i prevRenamed.pb \ -v new.masked.vcf.gz \ -o merged.pb \ --optimization_radius 0 --batch_size_per_process 10 \ > usher.addNew.log 2>usher-sampled.stderr # Branch-specific masking time $scriptDir/maskDelta.sh merged.pb merged.deltaMasked.pb # Prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i merged.deltaMasked.pb \ --max-parsimony 20 \ --max-branch-length 60 \ --max-path-length 175 \ -O -o merged.deltaMasked.filtered.pb # matOptimize: used -r 8 -M2 until 2023-05-12, then switched to Cheng's recommended # -m 0.00000001 -M 4 (avoid identical-child-node problem in # https://github.com/sars-cov-2-variants/lineage-proposals/issues/40) # The -M 4 allowed up to radius 32, and crazy things started happening all while I was # trying to get the tree cleaned up for pango-designation release 1.20 --> lineageTree. # After 2023-05-20, when I found that matOptimize had moved a big chunk of B.1 onto a # B.1.1.7 garbage branch, causing big trouble for lineageTree (23_05_18_updateLineageTreePb.txt). # After that I changed it back to -M 2 for my sanity. If the identical-child thing happens again, # then I'll probably just run matOptimize twice, with a small radius the second time. time $matOptimize \ -T 80 -m 0.00000001 -M 2 -S move_log.filtered \ -i merged.deltaMasked.filtered.pb \ -o gisaidAndPublic.$today.masked.preTrim.pb \ >& matOptimize.filtered.log + # Fix grandparent-reversion nodes that cause some lineages to be incorrectly placed as + # sublineages of siblings. + $matUtils fix -i gisaidAndPublic.$today.masked.preTrim.pb -c 10 \ + -o gisaidAndPublic.$today.masked.preTrim.fix.pb + # Again prune samples with too many private mutations and internal branches that are too long. - $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ + $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.fix.pb \ --max-parsimony 20 \ --max-branch-length 60 \ --max-path-length 175 \ -O -o gisaidAndPublic.$today.masked.pb fi # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > tooManyEpps.ids cat tooManyEpps.ids >> ../tooManyEpps.ids $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today @@ -171,21 +176,21 @@ $dir/public.plusGisaid.latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt ln -sf `pwd`/epiToPublic.latest $dir/ ln -sf `pwd`/samples.$today.gz $dir/public.plusGisaid.names.gz done # Make Taxonium v2 protobuf for display usher_to_taxonium --input gisaidAndPublic.$today.masked.pb \ --metadata gisaidAndPublic.$today.metadata.tsv.gz \ --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \ --columns genbank_accession,country,date,pangolin_lineage,pango_lineage_usher \ --clade_types=nextstrain,pango \ --name_internal_nodes \ --title "$today tree with sequences from GISAID, INSDC, COG-UK and CNCB" \ --output gisaidAndPublic.$today.masked.taxonium.jsonl.gz \ - >& utt.log + >& utt.log & $scriptDir/extractPublicTree.sh $today $prevDate set +o pipefail grep skipping annotate* | cat