f9ee0c7ea6540636411b7ff4a6ed54ab574701e2 angie Tue Jun 3 11:04:31 2025 -0700 Add Clade I. diff --git src/hg/utils/otto/mpxv/buildTree.sh src/hg/utils/otto/mpxv/buildTree.sh index 10e88ea8fdc..aafd231ec2b 100755 --- src/hg/utils/otto/mpxv/buildTree.sh +++ src/hg/utils/otto/mpxv/buildTree.sh @@ -1,171 +1,255 @@ #!/bin/bash source ~/.bashrc set -beEu -o pipefail -# Make masked VCF from nextalign'd INSDC sequences - mpxvScriptDir=$(dirname "${BASH_SOURCE[0]}") today=$(date +%F) mpxvDir=/hive/data/outside/otto/mpxv -# RefSeq assembly for our track hub -asmAcc=GCF_014621545.1 -asmDir=$(echo $asmAcc \ - | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@') - -# NC_063383.1 reference -ref2bit=/hive/data/genomes/asmHubs/$asmDir/$asmAcc.2bit +# RefSeq assembly for our track hubs for clade I and clade II +asmAcc_I=GCF_000857045.1 +asmAcc_II=GCF_014621545.1 # GenBank flat file for taxonium -gbff=$mpxvDir/GCF_014621545.1_ASM1462154v1_genomic.gbff +gbff_I=$mpxvDir/NC_003310.1.gbff +gbff_II=$mpxvDir/NC_063383.1.gbff + +# Metadata file +metadata_I=$mpxvNcbiDir/metadata.cladeI.tsv +metadata_II=$mpxvNcbiDir/metadata.2017outbreak.tsv + +# Mask file +#*** TODO get mask regions for clade I! +mask_I=/dev/null +mask_II=$mpxvScriptDir/mask.vcf.gz + +# faToVcf -maxDiff +maxDiff_I=10000 +maxDiff_II=100 + +# Minimum number of samples in tree to be sure the NCBI download wasn't incomplete +minSamples_I=600 +minSamples_II=7000 + +# Download archive storage directory +archiveRoot_I=/hive/users/angie/publicTreesMpxvCladeI +archiveRoot_II=/hive/users/angie/publicTreesHMPXV + +# Download archive URL directory +downloadDir_I=UShER_MPXV_cladeI +downloadDir_II=UShER_hMPXV mpxvNcbiDir=$mpxvDir/ncbi/ncbi.latest -archiveRoot=/hive/users/angie/publicTreesHMPXV usherDir=~angie/github/usher usherSampled=$usherDir/build/usher-sampled usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils if [[ ! -d $mpxvDir/ncbi/ncbi.$today || ! -s $mpxvDir/ncbi/ncbi.$today/genbank.fa.xz ]]; then mkdir -p $mpxvDir/ncbi/ncbi.$today $mpxvScriptDir/getNcbiMpxv.sh >& $mpxvDir/ncbi/ncbi.$today/getNcbiMpxv.log fi buildDir=$mpxvDir/build/$today mkdir -p $buildDir cd $buildDir +# This builds the whole tree from scratch! Eventually we'll want to add only the new sequences +# to yesterday's tree. +echo '()' > emptyTree.nwk + +for clade in I II; do + if [[ $clade == "I" ]]; then + asmAcc=$asmAcc_I + gbff=$gbff_I + metadata=$metadata_I + mask=$mask_I + maxDiff=$maxDiff_I + minSamples=$minSamples_I + archiveRoot=$archiveRoot_I + downloadDir=$downloadDir_I + else + asmAcc=$asmAcc_II + gbff=$gbff_II + metadata=$metadata_II + mask=$mask_II + maxDiff=$maxDiff_II + minSamples=$minSamples_II + archiveRoot=$archiveRoot_II + downloadDir=$downloadDir_II + fi + # Use metadata to make a renaming file perl -wne 'chomp; @w=split(/\t/); my ($acc, $iso, $loc, $date, $name) = ($w[0], $w[1], $w[3], $w[4], $w[11]); if ($iso eq "") { $iso = $name; } my $country = $loc; $country =~ s/:.*//; my $COU = $country; $COU =~ s/^(\w{3}).*/$1/; $COU = uc($COU); if ($country eq "United Kingdom") { $COU = "UK"; } if ($iso !~ /$country/ && $iso !~ /\b$COU\b/) { $iso = "$country/$iso"; } my $year = $date; $year =~ s/-.*//; if ($iso !~ /$year/) { $iso = "$iso/$year"; } my $fullName = $name ? "$name|$acc|$date" : "$acc|$date"; $fullName =~ s/[ ,:()]/_/g; print "$acc\t$fullName\n";' \ - $mpxvNcbiDir/metadata.2017outbreak.tsv > renaming.tsv + $metadata > renaming.$clade.tsv # Use Nextstrain's exclusion list too. awk '{print $1;}' ~angie/github/monkeypox/phylogenetic/defaults/exclude_accessions.txt \ -| grep -Fwf - <(cut -f 1 $mpxvNcbiDir/metadata.2017outbreak.tsv) \ + | grep -Fwf - <(cut -f 1 $metadata) \ > exclude.ids -# This builds the whole tree from scratch! Eventually we'll want to add only the new sequences -# to yesterday's tree. -time cat <(twoBitToFa $ref2bit stdout) <(xzcat $mpxvNcbiDir/nextalign.fa.xz) \ -| faToVcf -maxDiff=100 -verbose=2 -excludeFile=exclude.ids -includeNoAltN stdin stdout \ -| vcfRenameAndPrune stdin renaming.tsv stdout \ -| vcfFilter -excludeVcf=$mpxvScriptDir/mask.vcf.gz stdin \ + asmDir=$(echo $asmAcc \ + | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@') + ref2bit=/hive/data/genomes/asmHubs/$asmDir/$asmAcc.2bit + time cat <(twoBitToFa $ref2bit stdout) <(xzcat $mpxvNcbiDir/nextalign.$clade.fa.xz) \ + | faToVcf -maxDiff=$maxDiff -verbose=2 -excludeFile=exclude.ids -includeNoAltN stdin stdout \ + | vcfRenameAndPrune stdin renaming.$clade.tsv stdout \ + | vcfFilter -excludeVcf=$mask stdin \ | pigz -p 8 \ > all.masked.vcf.gz -echo '()' > emptyTree.nwk -time $usherSampled -T 64 -A -e 5 \ + time $usherSampled -T 16 -A -e 10 \ -t emptyTree.nwk \ -v all.masked.vcf.gz \ - -o mpxv.$today.masked.preOpt.pb\ + -o mpxv.clade$clade.$today.masked.preOpt.pb.gz \ --optimization_radius 0 --batch_size_per_process 10 \ > usher.addNew.log 2>usher-sampled.stderr # Optimize: time ~angie/github/usher_branch/build/matOptimize \ - -T 64 -r 20 -M 2 -S move_log.usher_branch \ - -i mpxv.$today.masked.preOpt.pb \ - -o mpxv.$today.masked.opt.pb \ + -T 16 -r 20 -M 2 -S move_log.usher_branch \ + -i mpxv.clade$clade.$today.masked.preOpt.pb.gz \ + -o mpxv.clade$clade.$today.masked.opt.pb.gz \ >& matOptimize.usher_branch.log # It crashes when I add # -v all.masked.vcf.gz \ # -- bug Cheng later. + if [[ $clade == "II" ]]; then # Annotate root nodes for Nextstrain lineages. -join -t$'\t' <(sort renaming.tsv) <(cut -f 2,5 $mpxvNcbiDir/nextclade.tsv | sort) \ + join -t$'\t' <(sort renaming.$clade.tsv) <(zcat $mpxvNcbiDir/nextclade.$clade.tsv.gz | cut -f 2,5 | sort) \ | tawk '{print $3, $2;}' | sort > lineageToName -$matUtils annotate -T 30 -i mpxv.$today.masked.opt.pb -c lineageToName \ - -o mpxv.$today.masked.pb \ - >& annotate.log + $matUtils annotate -T 16 -i mpxv.clade$clade.$today.masked.opt.pb.gz -c lineageToName \ + -o mpxv.clade$clade.$today.masked.pb.gz \ + >& annotate.$clade.log # Make metadata that uses same names as tree and includes nextclade lineage assignments. echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tNextstrain_lineage" \ - > mpxv.$today.metadata.tsv -join -t$'\t' <(sort renaming.tsv) <(cut -f 2,5 $mpxvNcbiDir/nextclade.tsv | sort) \ + > mpxv.clade$clade.$today.metadata.tsv + join -t$'\t' <(sort renaming.$clade.tsv) <(zcat $mpxvNcbiDir/nextclade.$clade.tsv.gz | cut -f 2,5 | sort) \ + | join -t$'\t' -o 1.2,2.1,2.6,2.4,2.5,2.8,2.9,2.10,2.11,2.13,2.14,2.15,1.3 \ + - <(sort $metadata \ + | perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/; print join("\t", @F);') \ + >> mpxv.clade$clade.$today.metadata.tsv + else + # No lineages to annotate; just clean up .opt with -O. + $matUtils extract -i mpxv.clade$clade.$today.masked.opt.pb.gz \ + -O -o mpxv.clade$clade.$today.masked.pb.gz + + # Make metadata that uses same names as tree and includes nextclade clade Ia/Ib assignments. + echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tNextstrain_clade" \ + > mpxv.clade$clade.$today.metadata.tsv + join -t$'\t' <(sort renaming.$clade.tsv) <(zcat $mpxvNcbiDir/nextclade.$clade.tsv.gz | cut -f 2,3 | sort) \ | join -t$'\t' -o 1.2,2.1,2.6,2.4,2.5,2.8,2.9,2.10,2.11,2.13,2.14,2.15,1.3 \ - - <(sort $mpxvNcbiDir/metadata.2017outbreak.tsv \ + - <(sort $metadata \ | perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/; print join("\t", @F);') \ - >> mpxv.$today.metadata.tsv -pigz -f -p 8 mpxv.$today.metadata.tsv + >> mpxv.clade$clade.$today.metadata.tsv + fi + pigz -f -p 8 mpxv.clade$clade.$today.metadata.tsv # Make a tree version description for hgPhyloPlace -$matUtils extract -i mpxv.$today.masked.pb -u samples.$today -sampleCount=$(wc -l < samples.$today) + $matUtils extract -i mpxv.clade$clade.$today.masked.pb.gz -u samples.$clade.$today + sampleCount=$(wc -l < samples.$clade.$today) # Sometimes NCBI download is incomplete; don't replace yesterday's tree in that case. -minSamples=3300 if (( $sampleCount < $minSamples )); then - echo "*** Too few samples ($sampleCount)! Expected at least $minSamples. Halting. ***" + echo "*** Too few samples ($sampleCount) for clade $clade! Expected at least $minSamples. Halting. ***" exit 1 fi sampleCountComma=$(echo $sampleCount \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from INSDC (GenBank/ENA/DDBJ) ($today)" \ - > hgPhyloPlace.description.txt + > hgPhyloPlace.description.$clade.txt # Make a taxonium view -usher_to_taxonium --input mpxv.$today.masked.pb \ - --metadata mpxv.$today.metadata.tsv.gz \ + if [[ $clade == "I" ]]; then + columns=genbank_accession,location,date,authors,Nextstrain_clade + else + columns=genbank_accession,location,date,authors,Nextstrain_lineage + fi + usher_to_taxonium --input mpxv.clade$clade.$today.masked.pb.gz \ + --metadata mpxv.clade$clade.$today.metadata.tsv.gz \ --genbank $gbff \ - --columns genbank_accession,location,date,authors,Nextstrain_lineage \ + --columns $columns \ --clade_types=pango \ - --output mpxv.$today.masked.taxonium.jsonl.gz \ + --output mpxv.clade$clade.$today.masked.taxonium.jsonl.gz \ >& usher_to_taxonium.log -# Update links to latest protobuf and metadata in hgwdev cgi-bin directories -for dir in /usr/local/apache/cgi-bin{-angie,,-beta}/hgPhyloPlaceData/$asmAcc; do - ln -sf $(pwd)/mpxv.$today.masked.pb $dir/mpxv.latest.pb - ln -sf $(pwd)/mpxv.$today.metadata.tsv.gz $dir/mpxv.latest.metadata.tsv.gz - ln -sf $(pwd)/hgPhyloPlace.description.txt $dir/mpxv.latest.version.txt -done + # Update links to latest protobuf and metadata in /gbdb directories + nc=$(basename $gbff .gbff) + dir=/gbdb/wuhCor1/hgPhyloPlaceData/mpxv/$nc + mkdir -p $dir + ln -sf $(pwd)/mpxv.clade$clade.$today.masked.pb.gz $dir/mpxv.clade$clade.latest.pb.gz + ln -sf $(pwd)/mpxv.clade$clade.$today.metadata.tsv.gz $dir/mpxv.clade$clade.latest.metadata.tsv.gz + ln -sf $(pwd)/hgPhyloPlace.description.$clade.txt $dir/mpxv.clade$clade.latest.version.txt # Extract Newick and VCF for anyone who wants to download those instead of protobuf -$matUtils extract -i mpxv.$today.masked.pb \ - -t mpxv.$today.nwk \ - -v mpxv.$today.masked.vcf -pigz -p 8 -f mpxv.$today.nwk mpxv.$today.masked.vcf + $matUtils extract -i mpxv.clade$clade.$today.masked.pb.gz \ + -t mpxv.clade$clade.$today.nwk \ + -v mpxv.clade$clade.$today.masked.vcf + pigz -p 8 -f mpxv.clade$clade.$today.nwk mpxv.clade$clade.$today.masked.vcf # Link to public trees download directory hierarchy read y m d < <(echo $today | sed -re 's/-/ /g') archive=$archiveRoot/$y/$m/$d mkdir -p $archive -ln -f $(pwd)/mpxv.$today.{nwk,masked.vcf,metadata.tsv,masked.taxonium.jsonl}.gz $archive/ -gzip -c mpxv.$today.masked.pb > $archive/mpxv.$today.masked.pb.gz -ln -f $(pwd)/hgPhyloPlace.description.txt $archive/mpxv.$today.version.txt + if [[ $clade == "I" ]]; then + ln -f $(pwd)/mpxv.clade$clade.$today.{nwk,masked.vcf,metadata.tsv,masked.taxonium.jsonl,masked.pb}.gz $archive/ + ln -f $(pwd)/hgPhyloPlace.description.$clade.txt $archive/mpxv.clade$clade.$today.version.txt + else + # At first, clade II hMPXV was the only mpox, so cladeII was not part of the download file names. + # Also, for clade II we're not making a tree with all clade II, only the 2017 outbreak, so cladeII + # in the name wouldn't be entirely accurate either. + for f in mpxv.clade$clade.$today.{nwk,masked.vcf,metadata.tsv,masked.taxonium.jsonl,masked.pb}.gz; do + downloadF=$(echo $f | sed -re 's/.cladeII//;') + ln -f $(pwd)/$f $archive/$downloadF + done + ln -f $(pwd)/hgPhyloPlace.description.$clade.txt $archive/mpxv.$today.version.txt + fi # Update 'latest' in $archiveRoot -for f in $archive/mpxv.$today.*; do + for f in $archive/mpxv*.$today.*; do latestF=$(echo $(basename $f) | sed -re 's/'$today'/latest/') ln -f $f $archiveRoot/$latestF done # Update hgdownload-test link for archive -mkdir -p /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/$y/$m -ln -sf $archive /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/$y/$m + mkdir -p /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/$downloadDir/$y/$m + ln -sf $archive /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/$downloadDir/$y/$m # rsync to hgdownload hubs dir -for h in hgdownload1 hgdownload2; do - rsync -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_hMPXV/* \ - qateam@$h:/mirrordata/hubs/$asmDir/UShER_hMPXV/ + for h in hgdownload1 hgdownload3; do + if rsync -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/$downloadDir/* \ + qateam@$h:/mirrordata/hubs/$asmDir/$downloadDir/; then + true + else + echo "" + echo "*** rsync to $h failed; disk full? ***" + echo "" + fi done + if [[ $clade == "II" ]]; then set +o pipefail -grep 'Could not' annotate.log | cat -grep skipping annotate.log | cat + grep 'Could not' annotate.$clade.log | cat + grep skipping annotate.$clade.log | cat set -o pipefail + fi -cat hgPhyloPlace.description.txt + cat hgPhyloPlace.description.$clade.txt -zcat mpxv.$today.metadata.tsv.gz | tail -n+2 | cut -f 13 | sort | uniq -c + zcat mpxv.clade$clade.$today.metadata.tsv.gz | tail -n+2 | cut -f 13 | sort | uniq -c + +done