src/hg/utils/otto/mpxv/buildTree.sh 5581e1884c54c583fe68844f8271a3baac1617ae

5581e1884c54c583fe68844f8271a3baac1617ae
angie
  Fri Dec 20 15:21:56 2024 -0800
Accumulated minor updates.

diff --git src/hg/utils/otto/mpxv/buildTree.sh src/hg/utils/otto/mpxv/buildTree.sh
index 1129847..10e88ea 100755
--- src/hg/utils/otto/mpxv/buildTree.sh
+++ src/hg/utils/otto/mpxv/buildTree.sh
@@ -1,166 +1,171 @@
 #!/bin/bash
 source ~/.bashrc
 set -beEu -o pipefail
 
 # Make masked VCF from nextalign'd INSDC sequences
 
 mpxvScriptDir=$(dirname "${BASH_SOURCE[0]}")
 
 today=$(date +%F)
 
 mpxvDir=/hive/data/outside/otto/mpxv
 
 # RefSeq assembly for our track hub
 asmAcc=GCF_014621545.1
 asmDir=$(echo $asmAcc \
     | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@')
 
 # NC_063383.1 reference
 ref2bit=/hive/data/genomes/asmHubs/$asmDir/$asmAcc.2bit
 
 # GenBank flat file for taxonium
 gbff=$mpxvDir/GCF_014621545.1_ASM1462154v1_genomic.gbff
 
 mpxvNcbiDir=$mpxvDir/ncbi/ncbi.latest
 archiveRoot=/hive/users/angie/publicTreesHMPXV
 
 usherDir=~angie/github/usher
 usherSampled=$usherDir/build/usher-sampled
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
+if [[ ! -d $mpxvDir/ncbi/ncbi.$today || ! -s $mpxvDir/ncbi/ncbi.$today/genbank.fa.xz ]]; then
     mkdir -p $mpxvDir/ncbi/ncbi.$today
     $mpxvScriptDir/getNcbiMpxv.sh >& $mpxvDir/ncbi/ncbi.$today/getNcbiMpxv.log
+fi
 
 buildDir=$mpxvDir/build/$today
 mkdir -p $buildDir
 cd $buildDir
 
 # Use metadata to make a renaming file
 perl -wne 'chomp; @w=split(/\t/);
     my ($acc, $iso, $loc, $date, $name) = ($w[0], $w[1], $w[3], $w[4], $w[11]);
     if ($iso eq "") { $iso = $name; }
     my $country = $loc;  $country =~ s/:.*//;
     my $COU = $country;  $COU =~ s/^(\w{3}).*/$1/;  $COU = uc($COU);
     if ($country eq "United Kingdom") { $COU = "UK"; }
     if ($iso !~ /$country/ && $iso !~ /\b$COU\b/) { $iso = "$country/$iso"; }
     my $year = $date;  $year =~ s/-.*//;
     if ($iso !~ /$year/) { $iso = "$iso/$year"; }
-    my $fullName = "$iso|$acc|$date";  $fullName =~ s/ /_/g;
+    my $fullName = $name ? "$name|$acc|$date" : "$acc|$date";
+    $fullName =~ s/[ ,:()]/_/g;
     print "$acc\t$fullName\n";' \
     $mpxvNcbiDir/metadata.2017outbreak.tsv > renaming.tsv
 
 # Use Nextstrain's exclusion list too.
-awk '{print $1;}' ~angie/github/monkeypox/config/exclude_accessions_mpxv.txt \
+awk '{print $1;}' ~angie/github/monkeypox/phylogenetic/defaults/exclude_accessions.txt \
 | grep -Fwf - <(cut -f 1 $mpxvNcbiDir/metadata.2017outbreak.tsv) \
     > exclude.ids
 
 # This builds the whole tree from scratch!  Eventually we'll want to add only the new sequences
 # to yesterday's tree.
 time cat <(twoBitToFa $ref2bit stdout) <(xzcat $mpxvNcbiDir/nextalign.fa.xz) \
 | faToVcf -maxDiff=100 -verbose=2 -excludeFile=exclude.ids -includeNoAltN stdin stdout \
 | vcfRenameAndPrune stdin renaming.tsv stdout \
 | vcfFilter -excludeVcf=$mpxvScriptDir/mask.vcf.gz stdin \
 | pigz -p 8 \
     > all.masked.vcf.gz
 
 echo '()' > emptyTree.nwk
 time $usherSampled -T 64 -A -e 5 \
         -t emptyTree.nwk \
         -v all.masked.vcf.gz \
         -o mpxv.$today.masked.preOpt.pb\
         --optimization_radius 0 --batch_size_per_process 10 \
         > usher.addNew.log 2>usher-sampled.stderr
 
 # Optimize:
 time ~angie/github/usher_branch/build/matOptimize \
     -T 64 -r 20 -M 2 -S move_log.usher_branch \
     -i mpxv.$today.masked.preOpt.pb \
     -o mpxv.$today.masked.opt.pb \
     >& matOptimize.usher_branch.log
 # It crashes when I add
 #    -v all.masked.vcf.gz \
 # -- bug Cheng later.
 
 # Annotate root nodes for Nextstrain lineages.
-join -t$'\t' <(sort renaming.tsv) <(cut -f 1,4 $mpxvNcbiDir/nextclade.tsv | sort) \
+join -t$'\t' <(sort renaming.tsv) <(cut -f 2,5 $mpxvNcbiDir/nextclade.tsv | sort) \
 | tawk '{print $3, $2;}' | sort > lineageToName
 $matUtils annotate -T 30 -i mpxv.$today.masked.opt.pb -c lineageToName \
     -o mpxv.$today.masked.pb \
     >& annotate.log
 
 # Make metadata that uses same names as tree and includes nextclade lineage assignments.
 echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tNextstrain_lineage" \
     > mpxv.$today.metadata.tsv
-join -t$'\t' <(sort renaming.tsv) <(cut -f 1,4 $mpxvNcbiDir/nextclade.tsv | sort) \
+join -t$'\t' <(sort renaming.tsv) <(cut -f 2,5 $mpxvNcbiDir/nextclade.tsv | sort) \
 | join -t$'\t' -o 1.2,2.1,2.6,2.4,2.5,2.8,2.9,2.10,2.11,2.13,2.14,2.15,1.3 \
     - <(sort $mpxvNcbiDir/metadata.2017outbreak.tsv \
         | perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/;  print join("\t", @F);') \
     >> mpxv.$today.metadata.tsv
 pigz -f -p 8 mpxv.$today.metadata.tsv
 
 # Make a tree version description for hgPhyloPlace
 $matUtils extract -i mpxv.$today.masked.pb -u samples.$today
 sampleCount=$(wc -l < samples.$today)
 # Sometimes NCBI download is incomplete; don't replace yesterday's tree in that case.
 minSamples=3300
 if (( $sampleCount < $minSamples )); then
     echo "*** Too few samples ($sampleCount)!  Expected at least $minSamples.  Halting. ***"
     exit 1
 fi
 sampleCountComma=$(echo $sampleCount \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from INSDC (GenBank/ENA/DDBJ) ($today)" \
     > hgPhyloPlace.description.txt
 
 # Make a taxonium view
 usher_to_taxonium --input mpxv.$today.masked.pb \
     --metadata mpxv.$today.metadata.tsv.gz \
     --genbank $gbff \
     --columns genbank_accession,location,date,authors,Nextstrain_lineage \
     --clade_types=pango \
     --output mpxv.$today.masked.taxonium.jsonl.gz \
     >& usher_to_taxonium.log
 
 # Update links to latest protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,,-beta}/hgPhyloPlaceData/$asmAcc; do
     ln -sf $(pwd)/mpxv.$today.masked.pb $dir/mpxv.latest.pb
     ln -sf $(pwd)/mpxv.$today.metadata.tsv.gz $dir/mpxv.latest.metadata.tsv.gz
     ln -sf $(pwd)/hgPhyloPlace.description.txt $dir/mpxv.latest.version.txt
 done
 
 # Extract Newick and VCF for anyone who wants to download those instead of protobuf
 $matUtils extract -i mpxv.$today.masked.pb \
     -t mpxv.$today.nwk \
     -v mpxv.$today.masked.vcf
 pigz -p 8 -f mpxv.$today.nwk mpxv.$today.masked.vcf
 
 # Link to public trees download directory hierarchy
 read y m d < <(echo $today | sed -re 's/-/ /g')
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 ln -f $(pwd)/mpxv.$today.{nwk,masked.vcf,metadata.tsv,masked.taxonium.jsonl}.gz $archive/
 gzip -c mpxv.$today.masked.pb > $archive/mpxv.$today.masked.pb.gz
 ln -f $(pwd)/hgPhyloPlace.description.txt $archive/mpxv.$today.version.txt
 
 # Update 'latest' in $archiveRoot
 for f in $archive/mpxv.$today.*; do
     latestF=$(echo $(basename $f) | sed -re 's/'$today'/latest/')
     ln -f $f $archiveRoot/$latestF
 done
 
 # Update hgdownload-test link for archive
 mkdir -p /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/$y/$m
 ln -sf $archive /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/$y/$m
 # rsync to hgdownload hubs dir
-rsync -v -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/* \
-    qateam@hgdownload.soe.ucsc.edu:/mirrordata/hubs/$asmDir/UShER_hMPXV/
+for h in hgdownload1 hgdownload2; do
+    rsync -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/* \
+        qateam@$h:/mirrordata/hubs/$asmDir/UShER_hMPXV/
+done
 
 set +o pipefail
 grep 'Could not' annotate.log | cat
 grep skipping annotate.log | cat
 set -o pipefail
 
 cat hgPhyloPlace.description.txt
 
 zcat mpxv.$today.metadata.tsv.gz | tail -n+2 | cut -f 13 | sort | uniq -c