src/hg/utils/otto/mtb/buildTree.sh 61654162481479361c996500c8dcdae120caae95

61654162481479361c996500c8dcdae120caae95
angie
  Fri Dec 20 16:06:08 2024 -0800
March 2024 build: start with Lily Karim's tree and Ash O'Farrell's metadata, add GenBank sequences, combine metadata.

diff --git src/hg/utils/otto/mtb/buildTree.sh src/hg/utils/otto/mtb/buildTree.sh
new file mode 100755
index 0000000..0b5b2cf
--- /dev/null
+++ src/hg/utils/otto/mtb/buildTree.sh
@@ -0,0 +1,166 @@
+#!/bin/bash
+source ~/.bashrc
+set -beEu -o pipefail
+
+# Align INSDC sequences to reference and build tree.
+
+mtbScriptDir=$(dirname "${BASH_SOURCE[0]}")
+
+today=$(date +%F)
+
+mtbDir=/hive/data/outside/otto/mtb
+mtbNcbiDir=$mtbDir/ncbi/ncbi.latest
+
+usherDir=~angie/github/usher
+usherSampled=$usherDir/build/usher-sampled
+usher=$usherDir/build/usher
+matUtils=$usherDir/build/matUtils
+matOptimize=$usherDir/build/matOptimize
+
+asmAcc=GCF_000195955.2
+gbff=$mtbDir/NC_000962.3.gbff
+refFa=$mtbDir/NC_000962.3.fa
+archiveRoot=/hive/users/angie/publicTreesMtb
+
+if [[ ! -d $mtbDir/ncbi/ncbi.$today || ! -s $mtbDir/ncbi/ncbi.$today/genbank.fa.xz ]]; then
+    mkdir -p $mtbDir/ncbi/ncbi.$today
+    $mtbScriptDir/getNcbiMtb.sh >& $mtbDir/ncbi/ncbi.$today/getNcbiMtb.log
+fi
+
+buildDir=$mtbDir/build/$today
+mkdir -p $buildDir
+cd $buildDir
+
+# Make renaming file from metadata.tsv.
+perl -wne 'chomp;
+    s/\tmissing\t/\t\t/g;
+    s/\bnot applicable\b//g;
+    @w=split(/\t/);
+    my ($acc, $biosample, $date, $loc, $host, $asmName, $len, $bStrain, $org, $oStrain, $iso) = @w;
+    my $name = $bStrain ? $bStrain : $iso ? $iso : $oStrain ? $oStrain : $asmName ? $asmName : "";
+    my $country = $loc;  $country =~ s/:.*//;
+    my $COU = $country;  $COU =~ s/^(\w{3}).*/$1/;  $COU = uc($COU);
+    if ($country eq "United Kingdom") { $COU = "UK"; }
+    if ($name !~ /$country/ && $name !~ /\b$COU\b/ && $name ne "") { $name = "$country/$name"; }
+    $name =~ s/[,;]//g;
+    $name =~ s/ /_/g;
+    my $year = $date;  $year =~ s/-.*//;
+    my $year2 = $year;   $year2 =~ s/^\d{2}(\d{2})$/$1/;
+    if ($name ne "" && $name !~ /$year/ && $name !~ /\/$year2$/) { $name = "$name/$year"; }
+    if ($date eq "") { $date = "?"; }
+    my $fullName = $name ? "$name|$acc|$date" : "$acc|$date";
+    $fullName =~ s/[ ,:()]/_/g;
+    print "$acc\t$fullName\n";' \
+        $mtbNcbiDir/metadata.tsv \
+| sed -re 's/[():'"'"']/_/g; s/\[/_/g; s/\]/_/g;' \
+| sed -re 's/_+\|/|/;' \
+    > renaming.tsv
+
+# This adds all sequences to Lily's tree even if they were added yesterday.
+# Eventually we'll want to add only the new sequences to yesterday's tree.
+
+# These sequences are too diverged for nextclade -- use minimap2 and make maple diff file.
+xzcat $mtbDir/ncbi/ncbi.$today/genbank.fa.xz \
+| minimap2 -x asm20 --score-N 0 --secondary no --cs -t 32 $refFa - \
+| pafToMaple stdin aligned.maple -minNonNBases=4000000 -rename=renaming.tsv \
+    -excludeBed=$mtbDir/R00000039_repregions.bed
+
+time $usherSampled -T 64 -A -e 5 \
+    -i $mtbDir/lily_pruned_20240221.pb \
+    --diff aligned.maple \
+    --ref $refFa \
+    -o mtb.$today.preFilter.pb \
+    --optimization_radius 0 --batch_size_per_process 10 \
+    > usher.addNew.log 2>usher-sampled.stderr
+
+# Filter out a few sequences on very long branches.
+$matUtils extract -i mtb.$today.preFilter.pb \
+    --max-parsimony 500 \
+    --max-branch-length 2000 \
+    -O -o mtb.$today.preOpt.pb >& tmp.log
+
+# Optimize:
+time $matOptimize -T 64 -r 20 -M 2 -S move_log \
+    -i mtb.$today.preOpt.pb \
+    -o mtb.$today.pb \
+    >& matOptimize.log
+rm -f *.pbintermediate*.pb
+chmod 664 mtb.$today.pb
+
+# Make metadata that uses same names as tree
+echo -e "strain\tgenbank_accession\tbiosample_accession\tdate\tcountry\tlocation\thost\tlength\tbioproject_accession\tsra_sample_accession" \
+        > gb.metadata.tsv
+sort $mtbNcbiDir/metadata.tsv \
+| sed -re 's/not applicable//g;' \
+| perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/;  print join("\t", @F);' \
+| join -t$'\t' -o 1.2,2.1,2.2,2.3,2.4,2.5,2.6,2.8,2.9,2.10 \
+    <(sort renaming.tsv) \
+    - \
+| sort \
+    >> gb.metadata.tsv
+pigz -f -p 8 gb.metadata.tsv
+
+$mtbScriptDir/combineMetadata.py gb.metadata.tsv.gz $mtbDir/tree_metadata_etc_fran_v3.tsv.gz \
+| pigz -p 8 \
+    > mtb.$today.metadata.tsv.gz
+
+# Make a tree version description for hgPhyloPlace
+$matUtils extract -i mtb.$today.pb -u samples.$today >& tmp.log
+sampleCountComma=$(wc -l < samples.$today \
+                   | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
+echo "$sampleCountComma genomes from SRA and INSDC (GenBank/ENA/DDBJ) ($today)" \
+    > hgPhyloPlace.description.txt
+
+# Make a taxonium view
+#***    --columns genbank_accession,country,location,date,biosample_accession \
+usher_to_taxonium --input mtb.$today.pb \
+    --metadata mtb.$today.metadata.tsv.gz \
+    --genbank $gbff \
+    --only_variable_sites \
+    --name_internal_nodes \
+    --title "M. tb $today tree with $sampleCountComma genomes from INSDC and 66498 genomes from SRA" \
+    --output mtb.$today.taxonium.jsonl.gz \
+    >& usher_to_taxonium.log
+
+    # Update links to latest protobuf and metadata in hgwdev cgi-bin directories
+
+#*** TODO: use the /gbdb/ dir not cgi-bin
+
+#***for dir in /usr/local/apache/cgi-bin{-angie,,-beta}/hgPhyloPlaceData/$asmAcc; do
+for dir in /usr/local/apache/cgi-bin{-angie,}/hgPhyloPlaceData/$asmAcc; do
+    ln -sf $(pwd)/mtb.$today.pb $dir/mtb.latest.pb
+    ln -sf $(pwd)/mtb.$today.metadata.tsv.gz $dir/mtb.latest.metadata.tsv.gz
+    ln -sf $(pwd)/hgPhyloPlace.description.txt $dir/mtb.latest.version.txt
+done
+
+    # Extract Newick and VCF for anyone who wants to download those instead of protobuf
+    $matUtils extract -i mtb.$today.pb \
+        -t mtb.$today.nwk \
+        -v mtb.$today.vcf >& tmp.log
+    pigz -p 8 -f mtb.$today.nwk mtb.$today.vcf
+
+    # Link to public trees download directory hierarchy
+    read y m d < <(echo $today | sed -re 's/-/ /g')
+    archive=$archiveRoot/$y/$m/$d
+    mkdir -p $archive
+    ln -f $(pwd)/mtb.$today.{nwk,vcf,metadata.tsv,taxonium.jsonl}.gz $archive/
+    gzip -c mtb.$today.pb > $archive/mtb.$today.pb.gz
+    ln -f $(pwd)/hgPhyloPlace.description.txt $archive/mtb.$today.version.txt
+
+    # Update 'latest' in $archiveRoot
+    for f in $archive/mtb.$today.*; do
+        latestF=$(echo $(basename $f) | sed -re 's/'$today'/latest/')
+        ln -f $f $archiveRoot/$latestF
+    done
+
+    # Update hgdownload-test link for archive
+    asmDir=$(echo $asmAcc \
+        | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@')
+    mkdir -p /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER/$y/$m
+    ln -sf $archive /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER/$y/$m
+    # rsync to hgdownload hubs dir
+    rsync -v -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER/* \
+        qateam@hgdownload.soe.ucsc.edu:/mirrordata/hubs/$asmDir/UShER/
+
+rm -f mutation-paths.txt *.pre*.pb final-tree.nh
+nice gzip -f *.log *.tsv move_log* *.stderr samples.*