5af316b0f05735ba8dfb8a8264fe634e16e5bb22 angie Tue Jun 3 11:07:21 2025 -0700 Reduce threads to avoid lockup in usher-sampled and matOptimize. Remove Goya clades because they were removed from nextclade. Update hgdownload2 --> hgdownload3 and tolerate hgdownload rsync failure. diff --git src/hg/utils/otto/rsv/buildTree.sh src/hg/utils/otto/rsv/buildTree.sh index fed9898e6c7..ec5ff126f65 100755 --- src/hg/utils/otto/rsv/buildTree.sh +++ src/hg/utils/otto/rsv/buildTree.sh @@ -114,150 +114,107 @@ >& nextclade.$aOrB.log # Run nextalign with RefSeq. nextalign run --input-ref $refFa \ --include-reference \ --jobs 32 \ --output-fasta aligned.$aOrB.fa.xz \ $rsvDir/ncbi/ncbi.$today/genbank.fa.xz \ >& nextalign.log if [[ -s $rsvScriptDir/mask.$aOrB.vcf ]]; then maskCmd="vcfFilter -excludeVcf=$rsvScriptDir/mask.$aOrB.vcf stdin " else maskCmd="cat" fi - time faToVcf -verbose=2 -includeRef -includeNoAltN -excludeFile=$rsvScriptDir/exclude.ids \ + time faToVcf -verbose=2 -includeNoAltN -excludeFile=$rsvScriptDir/exclude.ids \ <(xzcat aligned.$aOrB.fa.xz) stdout \ | vcfRenameAndPrune stdin renaming.tsv stdout \ | $maskCmd \ | pigz -p 8 \ > all.$aOrB.vcf.gz - time $usherSampled -T 64 -A -e 5 \ + time $usherSampled -T 16 -A -e 5 \ -t emptyTree.nwk \ -v all.$aOrB.vcf.gz \ -o rsv$aOrB.$today.preFilter.pb\ --optimization_radius 0 --batch_size_per_process 10 \ > usher.addNew.$aOrB.log 2>usher-sampled.$aOrB.stderr # Filter out branches that are so long they must lead to the other subspecies (B or A) $matUtils extract -i rsv$aOrB.$today.preFilter.pb \ --max-branch-length $maxBranchLen \ -O -o rsv$aOrB.$today.preOpt.pb >& tmp.log # Optimize: - time $matOptimize -T 64 -r 20 -M 2 -S move_log.$aOrB \ + time $matOptimize -T 16 -r 20 -M 2 -S move_log.$aOrB \ -i rsv$aOrB.$today.preOpt.pb \ -o rsv$aOrB.$today.opt.pb \ >& matOptimize.$aOrB.log - # Annotate consortium clades and Goya et al. clades using nextclade assignments and - # other sources (nextstrain.org, supplemental table...). + # Annotate consortium clades using nextclade assignments and other sources + # (nextstrain.org, supplemental table...). function tl { perl -wne 'chomp; @w = split(/\t/, $_, -1); $i = 1; foreach $w (@w) { print "$i\t$w[$i-1]\n"; $i++; }' } cCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w clade | cut -f 1) - gCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w G_clade | cut -f 1) subsCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w totalSubstitutions | cut -f 1) tail -n+2 nextclade.rsv$aOrB.tsv \ - | tawk '$'$subsCol' < '$maxSubs' && $'$gCol' != "" {print $2, $'$gCol';}' \ - | sed -re 's/unassigned/Unassigned/;' \ - | sort > accToGClade - join -t$'\t' accToGClade renaming.tsv | grep -v Unassigned | cut -f 2,3 | sort \ - > gCladeToName.$aOrB - if [[ $aOrB == A ]]; then - # RSV-A only: if nextclade is not calling some GAs, use assignments from nextstrain.org. - for ga in GA1 GA2.1 GA2.2 GA2.3.6a; do - set +o pipefail - gaCount=$(grep -w $ga gCladeToName.A | wc -l) - set -o pipefail - if (( $gaCount < 10 )); then - join -t$'\t' \ - <(grep -w $ga $rsvDir/nextstrain_acc_to_clade.tsv | sort) \ - <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \ - | cut -f 2,3 \ - >> gCladeToName.$aOrB - fi - done - else - # RSV-B only: if nextclade is not calling GB1, use assignments from nextstrain.org. - set +o pipefail - gb1Count=$(grep -w GB1 gCladeToName.B | wc -l) - set -o pipefail - if (( $gb1Count < 10 )); then - join -t$'\t' \ - <(grep -w GB1 $rsvDir/nextstrain_acc_to_clade.tsv | sort) \ - <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \ - | cut -f 2,3 \ - >> gCladeToName.$aOrB - fi - fi - if [[ -s $rsvScriptDir/goya.$aOrB.clade-mutations.tsv ]]; then - cladeMuts="-M $rsvScriptDir/goya.$aOrB.clade-mutations.tsv" - else - cladeMuts="" - fi - $matUtils annotate -T 64 -f 0.9 -m 0.1 -i rsv$aOrB.$today.opt.pb \ - -l -c gCladeToName.$aOrB $cladeMuts -o rsv$aOrB.$today.gClade.pb \ - >& annotate.$aOrB.gClade.log - tail -n+2 nextclade.rsv$aOrB.tsv \ | tawk '$'$subsCol' < '$maxSubs' && $'$cCol' != "" {print $2, $'$cCol';}' \ | sed -re 's/unassigned/Unassigned/;' \ | sort > accToCClade join -t$'\t' accToCClade renaming.tsv \ | grep -v Unassigned | cut -f 2,3 | sort \ > cCladeToName.$aOrB if [[ -s $rsvScriptDir/gcc.$aOrB.clade-mutations.tsv ]]; then cladeMuts="-M $rsvScriptDir/gcc.$aOrB.clade-mutations.tsv" else cladeMuts="" fi - $matUtils annotate -T 64 -f 0.9 -m 0.1 -i rsv$aOrB.$today.gClade.pb \ + $matUtils annotate -T 16 -f 0.9 -m 0.1 -i rsv$aOrB.$today.opt.pb \ -c cCladeToName.$aOrB $cladeMuts -o rsv$aOrB.$today.pb \ >& annotate.$aOrB.cClade.log $matUtils summary -i rsv$aOrB.$today.pb -C sample-clades.$aOrB >& tmp.log # Make metadata that uses same names as tree - echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tgoya_nextclade\tGCC_nextclade\tgoya_usher\tGCC_usher\tGCC_assigned_2023-11" \ + echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tGCC_nextclade\tGCC_usher\tGCC_assigned_2023-11" \ > rsv$aOrB.$today.metadata.tsv join -t$'\t' -o 1.2,2.1,2.6,2.4,2.5,2.8,2.9,2.10,2.11,2.13,2.14,2.15 \ <(sort renaming.tsv) \ <(sort $rsvNcbiDir/metadata.tsv \ | perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/; print join("\t", @F);') \ | sort \ - | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' renaming.tsv accToGClade | sort) \ | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' renaming.tsv accToCClade | sort) \ | join -t$'\t' - <(sort sample-clades.$aOrB | sed -re 's/None/Unassigned/g') \ | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' \ <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \ $rsvDir/RSV${aOrB}_GCC_2023-11-06.tsv \ | sort) \ >> rsv$aOrB.$today.metadata.tsv pigz -f -p 8 rsv$aOrB.$today.metadata.tsv # Make a tree version description for hgPhyloPlace $matUtils extract -i rsv$aOrB.$today.pb -u samples.$aOrB.$today >& tmp.log sampleCountComma=$(wc -l < samples.$aOrB.$today \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from INSDC (GenBank/ENA/DDBJ) ($today)" \ > hgPhyloPlace.description.$aOrB.txt # Make a taxonium view usher_to_taxonium --input rsv$aOrB.$today.pb \ --metadata rsv$aOrB.$today.metadata.tsv.gz \ - --columns genbank_accession,country,location,date,authors,GCC_nextclade,goya_nextclade,GCC_usher,goya_usher,GCC_assigned_2023-11 \ + --columns genbank_accession,country,location,date,authors,GCC_nextclade,GCC_usher,GCC_assigned_2023-11 \ --clade_types=placeholder,pango \ --genbank $gbff \ --name_internal_nodes \ --title "RSV-"$aOrB" $today tree with $sampleCountComma genomes from INSDC" \ --output rsv$aOrB.$today.taxonium.jsonl.gz \ >& usher_to_taxonium.$aOrB.log # Update links to latest protobuf and metadata in /gbdb directories nc=$(basename $gbff .gbff) dir=/gbdb/wuhCor1/hgPhyloPlaceData/rsv/$nc mkdir -p $dir ln -sf $(pwd)/rsv$aOrB.$today.pb $dir/rsv$aOrB.latest.pb ln -sf $(pwd)/rsv$aOrB.$today.metadata.tsv.gz $dir/rsv$aOrB.latest.metadata.tsv.gz ln -sf $(pwd)/hgPhyloPlace.description.$aOrB.txt $dir/rsv$aOrB.latest.version.txt @@ -275,39 +232,45 @@ gzip -c rsv$aOrB.$today.pb > $archive/rsv$aOrB.$today.pb.gz ln -f $(pwd)/hgPhyloPlace.description.$aOrB.txt $archive/rsv$aOrB.$today.version.txt # Update 'latest' in $archiveRoot for f in $archive/rsv$aOrB.$today.*; do latestF=$(echo $(basename $f) | sed -re 's/'$today'/latest/') ln -f $f $archiveRoot/$latestF done # Update hgdownload-test link for archive asmDir=$(echo $asmAcc \ | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@') mkdir -p /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/$y/$m ln -sf $archive /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/$y/$m # rsync to hgdownload{1,2} hubs dir - for h in hgdownload1 hgdownload2; do - rsync -a -L --delete /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/* \ - qateam@$h:/mirrordata/hubs/$asmDir/UShER_RSV-$aOrB/ + for h in hgdownload1 hgdownload3; do + if rsync -a -L --delete /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/* \ + qateam@$h:/mirrordata/hubs/$asmDir/UShER_RSV-$aOrB/; then + true + else + echo "" + echo "*** rsync to $h failed; disk full? ***" + echo "" + fi done done set +o pipefail grep 'Could not' annotate.*.log | cat grep skipping annotate.*.log | cat set -o pipefail cat hgPhyloPlace.description.A.txt zcat rsvA.$today.metadata.tsv.gz | tail -n+2 | cut -f 13,15 | sort | uniq -c echo "" cat hgPhyloPlace.description.B.txt zcat rsvB.$today.metadata.tsv.gz | tail -n+2 | cut -f 13,15 | sort | uniq -c echo "" echo "RSV GCC clades:" zcat rsvA.$today.metadata.tsv.gz | tail -n+2 | cut -f 14,16,17 | sort | uniq -c echo "" zcat rsvB.$today.metadata.tsv.gz | tail -n+2 | cut -f 14,16,17 | sort | uniq -c -rm -f mutation-paths.txt *.pre*.pb final-tree.nh *.opt.pb *.gClade.pb *.pbintermediate*.pb +rm -f mutation-paths.txt *.pre*.pb final-tree.nh *.opt.pb *.pbintermediate*.pb nice gzip -f *.log *.tsv move_log* *.stderr samples.* sample-clades.*