src/hg/utils/otto/dengue/buildTree.sh 30d1e434dcd5b313f393eac770c8da2d4323d54f

30d1e434dcd5b313f393eac770c8da2d4323d54f
angie
  Fri Oct 13 11:45:13 2023 -0700
Adding download & usher tree build scripts for several non-SARS-CoV-2 viruses.
mpxv has been going for over a year, rsv & dengue for some months now.  fluA is an incomplete work in progress,
complicated by having 8 segments, greater distances, and different subtyping schemes for different segments and even for different types within the segments.

diff --git src/hg/utils/otto/dengue/buildTree.sh src/hg/utils/otto/dengue/buildTree.sh
new file mode 100755
index 0000000..a3f8362
--- /dev/null
+++ src/hg/utils/otto/dengue/buildTree.sh
@@ -0,0 +1,227 @@
+#!/bin/bash
+source ~/.bashrc
+set -beEu -o pipefail
+
+# Align INSDC sequences to references and build 4 trees, one for each subtype 1-4.
+
+dengueScriptDir=$(dirname "${BASH_SOURCE[0]}")
+
+today=$(date +%F)
+
+dengueDir=/hive/data/outside/otto/dengue
+dengueNcbiDir=$dengueDir/ncbi/ncbi.latest
+
+usherDir=~angie/github/usher
+usherSampled=$usherDir/build/usher-sampled
+usher=$usherDir/build/usher
+matUtils=$usherDir/build/matUtils
+matOptimize=$usherDir/build/matOptimize
+
+# Subtype 1
+asmAcc1=GCF_000862125.1
+gbff1=$dengueDir/NC_001477.1.gbff
+refFa1=$dengueDir/NC_001477.1.fa
+archiveRoot1=/hive/users/angie/publicTreesDenv1
+
+# Subtype 2
+asmAcc2=GCF_000871845.1
+gbff2=$dengueDir/NC_001474.2.gbff
+refFa2=$dengueDir/NC_001474.2.fa
+archiveRoot2=/hive/users/angie/publicTreesDenv2
+
+# Subtype 3
+asmAcc3=GCF_000866625.1
+gbff3=$dengueDir/NC_001475.2.gbff
+refFa3=$dengueDir/NC_001475.2.fa
+archiveRoot3=/hive/users/angie/publicTreesDenv3
+
+# Subtype 4
+asmAcc4=GCF_000865065.1
+gbff4=$dengueDir/NC_002640.1.gbff
+refFa4=$dengueDir/NC_002640.1.fa
+archiveRoot4=/hive/users/angie/publicTreesDenv4
+
+if [[ ! -d $dengueDir/ncbi/ncbi.$today || ! -s $dengueDir/ncbi/ncbi.$today/genbank.fa.xz ]]; then
+    mkdir -p $dengueDir/ncbi/ncbi.$today
+    $dengueScriptDir/getNcbiDengue.sh >& $dengueDir/ncbi/ncbi.$today/getNcbiDengue.log
+fi
+
+buildDir=$dengueDir/build/$today
+mkdir -p $buildDir
+cd $buildDir
+
+# Use metadata to make a renaming file
+cat $dengueNcbiDir/metadata.tsv \
+| sed -re 's@([\t /])h?rsv(-?[ab])?/@\1@ig;' \
+| sed -re 's@([\t /])human/@\1@ig;' \
+| sed -re 's@([\t /])homo sapiens/@\1@ig;' \
+| sed -re 's@Capsid, Premembrane, Envelope, Non-structural 1, Non-structural 2a, Non-structural 2b, Non-structural 3, Non-structural 4a, Non-structural 4b, Non-structural 5 gene for polyprotein, Capsid, Premembrane, Envelope, Non-structural 1, Non-structural 2a, Non-structural 2b, Non-structural 3, Non-structural 4a, Non-structural 4b, Non-structural 5 region, @@;' \
+| sed -re 's@Capsid, Premembrane, Envelope, Non-structural 1, Non-structural 2a, Non-structural 2b, Non-structural 3, Non-structural 4a, Non-structural 4b, Non-structural 5 gene for polyprotein, 1 region, @@;' \
+| sed -re 's@(NS1, NS2A, NS2B, NS3, NS4A, NS4B, NS5)@@;' \
+    > tweakedMetadata.tsv
+cut -f 1,12 tweakedMetadata.tsv \
+| trimWordy.pl \
+    > accToIdFromTitle.tsv
+
+#*** If this is really the same as RSV then it should become a real script.
+#*** -- I changed $w[15] to $w[16] because I added serotype as a metadata column in getNcbiDengue.sh.
+#*** Still should probably become a proper script -- I could cut -f... from metadata.
+#*** Also had to add the paren-stripping seds.
+join -t$'\t' <(sort tweakedMetadata.tsv) accToIdFromTitle.tsv \
+| perl -wne 'chomp; @w=split(/\t/);
+    my ($acc, $iso, $loc, $date, $str, $tid) = ($w[0], $w[1], $w[3], $w[4], $w[14], $w[16]);
+    if (! defined $str) { $str = ""; }
+    my $name = $str ? $str : $iso ? $iso : $tid ? $tid : "";
+    my $country = $loc;  $country =~ s/:.*//;
+    my $COU = $country;  $COU =~ s/^(\w{3}).*/$1/;  $COU = uc($COU);
+    if ($country eq "United Kingdom") { $COU = "UK"; }
+    if ($name !~ /$country/ && $name !~ /\b$COU\b/ && $name ne "") { $name = "$country/$name"; }
+    $name =~ s/[,;]//g;
+    my $year = $date;  $year =~ s/-.*//;
+    my $year2 = $year;   $year2 =~ s/^\d{2}(\d{2})$/$1/;
+    if ($name ne "" && $name !~ /$year/ && $name !~ /\/$year2$/) { $name = "$name/$year"; }
+    if ($date eq "") { $date = "?"; }
+    my $fullName = $name ? "$name|$acc|$date" : "$acc|$date";
+    $fullName =~ s/ /_/g;
+    print "$acc\t$fullName\n";' \
+| sed -re 's/[():'"'"']/_/g; s/\[/_/g; s/\]/_/g;' \
+| sed -re 's/_+\|/|/;' \
+    > renaming.tsv
+
+# This builds the whole tree from scratch!  Eventually we'll want to add only the new sequences
+# to yesterday's tree.
+
+echo '()' > emptyTree.nwk
+
+for subtype in 1 2 3 4; do
+    case $subtype in
+    1)
+        refFa=$refFa1
+        gbff=$gbff1
+        asmAcc=$asmAcc1
+        archiveRoot=$archiveRoot1
+        ;;
+    2)
+        refFa=$refFa2
+        gbff=$gbff2
+        asmAcc=$asmAcc2
+        archiveRoot=$archiveRoot2
+        ;;
+    3)
+        refFa=$refFa3
+        gbff=$gbff3
+        asmAcc=$asmAcc3
+        archiveRoot=$archiveRoot3
+        ;;
+    4)
+        refFa=$refFa4
+        gbff=$gbff4
+        asmAcc=$asmAcc4
+        archiveRoot=$archiveRoot4
+        ;;
+    esac
+
+    # Run nextalign with RefSeq.
+    nextalign run --input-ref $refFa \
+        --include-reference \
+        --jobs 32 \
+        --output-fasta aligned.$subtype.fa.xz \
+        $dengueDir/ncbi/ncbi.$today/genbank.fa.xz \
+        >& nextalign.log
+
+# If it becomes necessary, add -excludeFile=$dengueScriptDir/exclude.ids
+    time faToVcf -verbose=2 -includeRef -includeNoAltN \
+        <(xzcat aligned.$subtype.fa.xz) stdout \
+        | vcfRenameAndPrune stdin renaming.tsv stdout \
+        | pigz -p 8 \
+        > all.$subtype.vcf.gz
+
+    time $usherSampled -T 64 -A -e 5 \
+        -t emptyTree.nwk \
+        -v all.$subtype.vcf.gz \
+        -o denv$subtype.$today.preFilter.pb\
+        --optimization_radius 0 --batch_size_per_process 10 \
+        > usher.addNew.$subtype.log 2>usher-sampled.$subtype.stderr
+
+    # Filter out branches that are so long they must lead to some other subtype
+    $matUtils extract -i denv$subtype.$today.preFilter.pb \
+        --max-branch-length 1000 \
+        -O -o denv$subtype.$today.preOpt.pb >& tmp.log
+
+    # Optimize:
+    time $matOptimize -T 64 -r 20 -M 2 -S move_log.$subtype \
+        -i denv$subtype.$today.preOpt.pb \
+        -o denv$subtype.$today.pb \
+        >& matOptimize.$subtype.log
+    rm -f *.pbintermediate*.pb
+    chmod 664 denv$subtype.$today.pb
+
+    # Make metadata that uses same names as tree
+    echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications" \
+        > denv$subtype.$today.metadata.tsv
+    sort $dengueNcbiDir/metadata.tsv \
+    | perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/;  print join("\t", @F);' \
+    | join -t$'\t' -o 1.2,2.1,2.6,2.4,2.5,2.8,2.9,2.10,2.11,2.12,2.14,2.15 \
+        <(sort renaming.tsv) \
+        - \
+    | sort \
+        >> denv$subtype.$today.metadata.tsv
+    pigz -f -p 8 denv$subtype.$today.metadata.tsv
+
+    # Make a tree version description for hgPhyloPlace
+    $matUtils extract -i denv$subtype.$today.pb -u samples.$subtype.$today >& tmp.log
+    sampleCountComma=$(wc -l < samples.$subtype.$today \
+              | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
+    echo "$sampleCountComma genomes from INSDC (GenBank/ENA/DDBJ) ($today)" \
+        > hgPhyloPlace.description.$subtype.txt
+
+    # Make a taxonium view
+    usher_to_taxonium --input denv$subtype.$today.pb \
+        --metadata denv$subtype.$today.metadata.tsv.gz \
+        --columns genbank_accession,country,location,date,authors \
+        --genbank $gbff \
+        --name_internal_nodes \
+        --title "Dengue subtype "$subtype" $today tree with $sampleCountComma genomes from INSDC" \
+        --output denv$subtype.$today.taxonium.jsonl.gz \
+        >& usher_to_taxonium.$subtype.log
+
+    # Update links to latest protobuf and metadata in hgwdev cgi-bin directories
+#***    for dir in /usr/local/apache/cgi-bin{-angie,,-beta}/hgPhyloPlaceData/$asmAcc; do
+    for dir in /usr/local/apache/cgi-bin-angie/hgPhyloPlaceData/$asmAcc; do
+        ln -sf $(pwd)/denv$subtype.$today.pb $dir/denv$subtype.latest.pb
+        ln -sf $(pwd)/denv$subtype.$today.metadata.tsv.gz $dir/denv$subtype.latest.metadata.tsv.gz
+        ln -sf $(pwd)/hgPhyloPlace.description.$subtype.txt $dir/denv$subtype.latest.version.txt
+    done
+
+    # Extract Newick and VCF for anyone who wants to download those instead of protobuf
+    $matUtils extract -i denv$subtype.$today.pb \
+        -t denv$subtype.$today.nwk \
+        -v denv$subtype.$today.vcf >& tmp.log
+    pigz -p 8 -f denv$subtype.$today.nwk denv$subtype.$today.vcf
+
+    # Link to public trees download directory hierarchy
+    read y m d < <(echo $today | sed -re 's/-/ /g')
+    archive=$archiveRoot/$y/$m/$d
+    mkdir -p $archive
+    ln -f $(pwd)/denv$subtype.$today.{nwk,vcf,metadata.tsv,taxonium.jsonl}.gz $archive/
+    gzip -c denv$subtype.$today.pb > $archive/denv$subtype.$today.pb.gz
+    ln -f $(pwd)/hgPhyloPlace.description.$subtype.txt $archive/denv$subtype.$today.version.txt
+
+    # Update 'latest' in $archiveRoot
+    for f in $archive/denv$subtype.$today.*; do
+        latestF=$(echo $(basename $f) | sed -re 's/'$today'/latest/')
+        ln -f $f $archiveRoot/$latestF
+    done
+
+    # Update hgdownload-test link for archive
+    asmDir=$(echo $asmAcc \
+        | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@')
+    mkdir -p /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_DENV-$subtype/$y/$m
+    ln -sf $archive /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_DENV-$subtype/$y/$m
+    # rsync to hgdownload hubs dir
+    rsync -v -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_DENV-$subtype/* \
+        qateam@hgdownload.soe.ucsc.edu:/mirrordata/hubs/$asmDir/UShER_DENV-$subtype/
+done
+
+rm -f mutation-paths.txt *.pre*.pb final-tree.nh
+nice gzip -f *.log *.tsv move_log* *.stderr samples.*