src/hg/utils/otto/rsv/buildTree.sh 2d6373e8e8e6c30f0ffe140bed6e7d1c21082f46

2d6373e8e8e6c30f0ffe140bed6e7d1c21082f46
angie
  Fri Dec 20 15:19:55 2024 -0800
Tweak filtering params, add masking for RSV-A, annotate RSV Genotype Consensus Consortium lineages instead of Ramaekers set.

diff --git src/hg/utils/otto/rsv/buildTree.sh src/hg/utils/otto/rsv/buildTree.sh
index 2bf5e3a..fed9898 100755
--- src/hg/utils/otto/rsv/buildTree.sh
+++ src/hg/utils/otto/rsv/buildTree.sh
@@ -1,314 +1,313 @@
 #!/bin/bash
 source ~/.bashrc
 set -beEu -o pipefail
 
 # Align INSDC sequences to reference and build two trees, one for RSV-A and one for RSV-B.
 
 rsvScriptDir=$(dirname "${BASH_SOURCE[0]}")
 
 today=$(date +%F)
 
 rsvDir=/hive/data/outside/otto/rsv
 rsvNcbiDir=$rsvDir/ncbi/ncbi.latest
 
 usherDir=~angie/github/usher
 usherSampled=$usherDir/build/usher-sampled
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 matOptimize=$usherDir/build/matOptimize
 
 # RSV-A reference: Nextstrain uses KJ627695.1 but RefSeq is NC_038235.1 (M74568)
 asmAccA=GCF_002815475.1
 gbffA=$rsvDir/NC_038235.1.gbff
 nextcladeNameA=rsv_a
 refFaA=$rsvDir/NC_038235.1.fa
 archiveRootA=/hive/users/angie/publicTreesRsvA
 
 # RSV-B reference: Nextstrain uses ? but RefSeq is NC_001781.1 (AF013254)
 asmAccB=GCF_000855545.1
 gbffB=$rsvDir/NC_001781.1.gbff
 nextcladeNameB=rsv_b
 refFaB=$rsvDir/NC_001781.1.fa
 archiveRootB=/hive/users/angie/publicTreesRsvB
 
+# Filter out branches longer than this so we don't get RSV-A and RSV-B in same final tree
+maxSubs=1500
+maxBranchLen=500
+
 if [[ ! -d $rsvDir/ncbi/ncbi.$today || ! -s $rsvDir/ncbi/ncbi.$today/genbank.fa.xz ]]; then
     mkdir -p $rsvDir/ncbi/ncbi.$today
     $rsvScriptDir/getNcbiRsv.sh >& $rsvDir/ncbi/ncbi.$today/getNcbiRsv.log
 fi
 
 buildDir=$rsvDir/build/$today
 mkdir -p $buildDir
 cd $buildDir
 
 # Make sure the download wasn't truncated without reporting an error:
 count=$(wc -l < $rsvNcbiDir/metadata.tsv)
-minSamples=4500
+minSamples=5600
 if (( $count < $minSamples )); then
     echo "*** Too few samples ($count)!  Expected at least $minSamples.  Halting. ***"
     exit 1
 fi
 
 # Use metadata to make a renaming file
 cat $rsvNcbiDir/metadata.tsv \
 | sed -re 's@([\t /])h?rsv(-?[ab])?/@\1@ig;' \
 | sed -re 's@([\t /])[ab]/@\1@ig;' \
 | sed -re 's@([\t /])human/@\1@ig;' \
 | sed -re 's@([\t /])homo sapiens/@\1@ig;' \
 | sed -re 's@Human respiratory syncytial virus nonstructural protein 1, nonstructural protein 2, nucleocapsid protein, phosphoprotein, matrix protein, small hydrophobic protein, glycoprotein, fusion glycoprotein, 22K/M2 protein and L protein mRNA, complete cds@@;' \
     > tweakedMetadata.tsv
 cut -f 1,12 tweakedMetadata.tsv \
 | trimWordy.pl \
     > accToIdFromTitle.tsv
 join -t$'\t' <(sort tweakedMetadata.tsv) accToIdFromTitle.tsv \
 | perl -wne 'chomp; @w=split(/\t/);
     my ($acc, $iso, $loc, $date, $str, $tid) = ($w[0], $w[1], $w[3], $w[4], $w[14], $w[15]);
     if (! defined $str) { $str = ""; }
     my $name = $str ? $str : $iso ? $iso : $tid ? $tid : "";
     my $country = $loc;  $country =~ s/:.*//;
     my $COU = $country;  $COU =~ s/^(\w{3}).*/$1/;  $COU = uc($COU);
     if ($country eq "United Kingdom") { $COU = "UK"; }
     if ($name !~ /$country/ && $name !~ /\b$COU\b/ && $name ne "") { $name = "$country/$name"; }
     $name =~ s/[,;]//g;
     my $year = $date;  $year =~ s/-.*//;
     my $year2 = $year;   $year2 =~ s/^\d{2}(\d{2})$/$1/;
     if ($name ne "" && $name !~ /$year/ && $name !~ /\/$year2$/) { $name = "$name/$year"; }
     if ($date eq "") { $date = "?"; }
     my $fullName = $name ? "$name|$acc|$date" : "$acc|$date";
-    $fullName =~ s/ /_/g;
+    $fullName =~ s/[ ,:()]/_/g;
     print "$acc\t$fullName\n";' \
     > renaming.tsv
 
 # This builds the whole tree from scratch!  Eventually we'll want to add only the new sequences
 # to yesterday's tree.
 
 echo '()' > emptyTree.nwk
 
 for aOrB in A B; do
     if [[ $aOrB == "A" ]]; then
         refFa=$refFaA
         gbff=$gbffA
         asmAcc=$asmAccA
         archiveRoot=$archiveRootA
         nextcladeName=$nextcladeNameA
     else
         refFa=$refFaB
         gbff=$gbffB
         asmAcc=$asmAccB
         archiveRoot=$archiveRootB
         nextcladeName=$nextcladeNameB
     fi
 
     # Run nextclade to get clade assignments -- but not alignments because we need alignments
     # to RefSeqs not nextclade's chosen references.
     nextclade dataset get --name $nextcladeName --output-zip $nextcladeName.zip
     time nextclade run \
         -D $nextcladeName.zip \
         -j 30 \
         --retry-reverse-complement true \
         --output-tsv nextclade.rsv$aOrB.tsv \
         $rsvDir/ncbi/ncbi.$today/genbank.fa.xz \
         >& nextclade.$aOrB.log
 
     # Run nextalign with RefSeq.
     nextalign run --input-ref $refFa \
         --include-reference \
         --jobs 32 \
         --output-fasta aligned.$aOrB.fa.xz \
         $rsvDir/ncbi/ncbi.$today/genbank.fa.xz \
         >& nextalign.log
 
+    if [[ -s $rsvScriptDir/mask.$aOrB.vcf ]]; then
+        maskCmd="vcfFilter -excludeVcf=$rsvScriptDir/mask.$aOrB.vcf stdin "
+    else
+        maskCmd="cat"
+    fi
     time faToVcf -verbose=2 -includeRef -includeNoAltN -excludeFile=$rsvScriptDir/exclude.ids \
         <(xzcat aligned.$aOrB.fa.xz) stdout \
         | vcfRenameAndPrune stdin renaming.tsv stdout \
+        | $maskCmd \
         | pigz -p 8 \
         > all.$aOrB.vcf.gz
 
     time $usherSampled -T 64 -A -e 5 \
         -t emptyTree.nwk \
         -v all.$aOrB.vcf.gz \
         -o rsv$aOrB.$today.preFilter.pb\
         --optimization_radius 0 --batch_size_per_process 10 \
         > usher.addNew.$aOrB.log 2>usher-sampled.$aOrB.stderr
 
     # Filter out branches that are so long they must lead to the other subspecies (B or A)
     $matUtils extract -i rsv$aOrB.$today.preFilter.pb \
-        --max-branch-length 1000 \
+        --max-branch-length $maxBranchLen \
         -O -o rsv$aOrB.$today.preOpt.pb >& tmp.log
 
     # Optimize:
     time $matOptimize -T 64 -r 20 -M 2 -S move_log.$aOrB \
         -i rsv$aOrB.$today.preOpt.pb \
         -o rsv$aOrB.$today.opt.pb \
         >& matOptimize.$aOrB.log
 
-    # Annotate Ramaekers et al., Goya et al. and consortium clades using nextclade assignments and
+    # Annotate consortium clades and Goya et al. clades using nextclade assignments and
     # other sources (nextstrain.org, supplemental table...).
     function tl {
         perl -wne 'chomp; @w = split(/\t/, $_, -1); $i = 1; foreach $w (@w) { print "$i\t$w[$i-1]\n"; $i++; }'
     }
-    rCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w clade | cut -f 1)
+    cCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w clade | cut -f 1)
     gCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w G_clade | cut -f 1)
     subsCol=$(head -1 nextclade.rsv$aOrB.tsv | tl | grep -w totalSubstitutions | cut -f 1)
     tail -n+2 nextclade.rsv$aOrB.tsv \
-    | tawk '$'$subsCol' < 1000 && $'$gCol' != "" {print $1, $'$gCol';}' \
+    | tawk '$'$subsCol' < '$maxSubs' && $'$gCol' != "" {print $2, $'$gCol';}' \
     | sed -re 's/unassigned/Unassigned/;' \
     | sort > accToGClade
     join -t$'\t' accToGClade renaming.tsv | grep -v Unassigned | cut -f 2,3 | sort \
         > gCladeToName.$aOrB
     if [[ $aOrB == A ]]; then
         # RSV-A only: if nextclade is not calling some GAs, use assignments from nextstrain.org.
         for ga in GA1 GA2.1 GA2.2 GA2.3.6a; do
             set +o pipefail
             gaCount=$(grep -w $ga gCladeToName.A | wc -l)
             set -o pipefail
             if (( $gaCount < 10 )); then
                 join -t$'\t' \
                     <(grep -w $ga $rsvDir/nextstrain_acc_to_clade.tsv | sort) \
                     <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \
                 | cut -f 2,3 \
                     >> gCladeToName.$aOrB
             fi
         done
     else
         # RSV-B only: if nextclade is not calling GB1, use assignments from nextstrain.org.
         set +o pipefail
         gb1Count=$(grep -w GB1 gCladeToName.B | wc -l)
         set -o pipefail
         if (( $gb1Count < 10 )); then
             join -t$'\t' \
                 <(grep -w GB1 $rsvDir/nextstrain_acc_to_clade.tsv | sort) \
                 <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \
             | cut -f 2,3 \
                 >> gCladeToName.$aOrB
         fi
     fi
     if [[ -s $rsvScriptDir/goya.$aOrB.clade-mutations.tsv ]]; then
         cladeMuts="-M $rsvScriptDir/goya.$aOrB.clade-mutations.tsv"
     else
         cladeMuts=""
     fi
     $matUtils annotate -T 64 -f 0.9 -m 0.1 -i rsv$aOrB.$today.opt.pb \
         -l -c gCladeToName.$aOrB $cladeMuts -o rsv$aOrB.$today.gClade.pb \
         >& annotate.$aOrB.gClade.log
     tail -n+2 nextclade.rsv$aOrB.tsv \
-    | tawk '$'$subsCol' < 1000 && $'$rCol' != "" {print $1, $'$rCol';}' \
+    | tawk '$'$subsCol' < '$maxSubs' && $'$cCol' != "" {print $2, $'$cCol';}' \
     | sed -re 's/unassigned/Unassigned/;' \
-    | sort > accToRClade
-    # As of 2023-01-24, nextclade doesn't call all of the Ramaekers clades, but Table S1 from the
-    # publication (https://academic.oup.com/ve/article/6/2/veaa052/5876035) lists INSDC accessions
-    # and assignments.  Use those when available, and fall back on nextclade for other sequences.
-    # Exclude nextclade A17 because it messes up A18.
-    join -a 1 -o 1.1,1.2,2.2 -t$'\t' \
-        <(sed -re 's/\.[0-9]+\t/\t/;' accToRClade ) \
-        $rsvDir/Ramaekers_TableS1_accTo$aOrB.tsv \
-    | tawk '{ if ($3 != "" || $2 == "A17") { print $1, $3; } else { print $1, $2; } }' \
-    | sort \
-        > accToRCladeCombined
-    join -t$'\t' accToRCladeCombined <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \
+    | sort > accToCClade
+    join -t$'\t' accToCClade renaming.tsv \
     | grep -v Unassigned | cut -f 2,3 | sort \
-        > rCladeToName.$aOrB
-    if [[ -s $rsvScriptDir/ramaekers.$aOrB.clade-mutations.tsv ]]; then
-        cladeMuts="-M $rsvScriptDir/ramaekers.$aOrB.clade-mutations.tsv"
+        > cCladeToName.$aOrB
+    if [[ -s $rsvScriptDir/gcc.$aOrB.clade-mutations.tsv ]]; then
+        cladeMuts="-M $rsvScriptDir/gcc.$aOrB.clade-mutations.tsv"
     else
         cladeMuts=""
     fi
     $matUtils annotate -T 64 -f 0.9 -m 0.1 -i rsv$aOrB.$today.gClade.pb \
-        -c rCladeToName.$aOrB $cladeMuts -o rsv$aOrB.$today.pb \
-        >& annotate.$aOrB.rClade.log
+        -c cCladeToName.$aOrB $cladeMuts -o rsv$aOrB.$today.pb \
+        >& annotate.$aOrB.cClade.log
     $matUtils summary -i rsv$aOrB.$today.pb -C sample-clades.$aOrB >& tmp.log
 
     # Make metadata that uses same names as tree
-    echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tgoya_nextclade\tramaekers_nextclade\tgoya_usher\tramaekers_usher\tramaekers_tableS1\tgcc_lineage" \
+    echo -e "strain\tgenbank_accession\tdate\tcountry\tlocation\tlength\thost\tbioproject_accession\tbiosample_accession\tsra_accession\tauthors\tpublications\tgoya_nextclade\tGCC_nextclade\tgoya_usher\tGCC_usher\tGCC_assigned_2023-11" \
         > rsv$aOrB.$today.metadata.tsv
     join -t$'\t' -o 1.2,2.1,2.6,2.4,2.5,2.8,2.9,2.10,2.11,2.13,2.14,2.15 \
         <(sort renaming.tsv) \
         <(sort $rsvNcbiDir/metadata.tsv \
           | perl -F'/\t/' -walne '$F[3] =~ s/(: ?|$)/\t/;  print join("\t", @F);') \
     | sort \
     | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' renaming.tsv accToGClade | sort) \
-    | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' renaming.tsv accToRClade | sort) \
+    | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' renaming.tsv accToCClade | sort) \
     | join -t$'\t' - <(sort sample-clades.$aOrB | sed -re 's/None/Unassigned/g') \
     | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' \
                            <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \
-                           $rsvDir/Ramaekers_TableS1_accTo$aOrB.tsv \
-                       | sort) \
-    | join -t$'\t' - <(join -a 1 -o 1.2,2.2 -t$'\t' \
-                           <(sed -re 's/\.[0-9]+\t/\t/;' renaming.tsv) \
-                           <(tawk '{print $2, $1;}' $rsvDir/RSV${aOrB}_GCC_2023-05-16.tsv | sort) \
+                           $rsvDir/RSV${aOrB}_GCC_2023-11-06.tsv \
                        | sort) \
         >> rsv$aOrB.$today.metadata.tsv
     pigz -f -p 8 rsv$aOrB.$today.metadata.tsv
 
     # Make a tree version description for hgPhyloPlace
     $matUtils extract -i rsv$aOrB.$today.pb -u samples.$aOrB.$today >& tmp.log
     sampleCountComma=$(wc -l < samples.$aOrB.$today \
               | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
     echo "$sampleCountComma genomes from INSDC (GenBank/ENA/DDBJ) ($today)" \
         > hgPhyloPlace.description.$aOrB.txt
 
     # Make a taxonium view
     usher_to_taxonium --input rsv$aOrB.$today.pb \
         --metadata rsv$aOrB.$today.metadata.tsv.gz \
-        --columns genbank_accession,country,location,date,authors,ramaekers_nextclade,goya_nextclade,ramaekers_usher,goya_usher,ramaekers_tableS1,gcc_lineage \
-        --clade_types=pango,placeholder \
+        --columns genbank_accession,country,location,date,authors,GCC_nextclade,goya_nextclade,GCC_usher,goya_usher,GCC_assigned_2023-11 \
+        --clade_types=placeholder,pango \
         --genbank $gbff \
         --name_internal_nodes \
         --title "RSV-"$aOrB" $today tree with $sampleCountComma genomes from INSDC" \
         --output rsv$aOrB.$today.taxonium.jsonl.gz \
         >& usher_to_taxonium.$aOrB.log
 
-    # Update links to latest protobuf and metadata in hgwdev cgi-bin directories
-    for dir in /usr/local/apache/cgi-bin{-angie,,-beta}/hgPhyloPlaceData/$asmAcc; do
+    # Update links to latest protobuf and metadata in /gbdb directories
+    nc=$(basename $gbff .gbff)
+    dir=/gbdb/wuhCor1/hgPhyloPlaceData/rsv/$nc
+    mkdir -p $dir
     ln -sf $(pwd)/rsv$aOrB.$today.pb $dir/rsv$aOrB.latest.pb
     ln -sf $(pwd)/rsv$aOrB.$today.metadata.tsv.gz $dir/rsv$aOrB.latest.metadata.tsv.gz
     ln -sf $(pwd)/hgPhyloPlace.description.$aOrB.txt $dir/rsv$aOrB.latest.version.txt
-    done
 
     # Extract Newick and VCF for anyone who wants to download those instead of protobuf
     $matUtils extract -i rsv$aOrB.$today.pb \
         -t rsv$aOrB.$today.nwk \
         -v rsv$aOrB.$today.vcf >& tmp.log
     pigz -p 8 -f rsv$aOrB.$today.nwk rsv$aOrB.$today.vcf
 
     # Link to public trees download directory hierarchy
     read y m d < <(echo $today | sed -re 's/-/ /g')
     archive=$archiveRoot/$y/$m/$d
     mkdir -p $archive
     ln -f $(pwd)/rsv$aOrB.$today.{nwk,vcf,metadata.tsv,taxonium.jsonl}.gz $archive/
     gzip -c rsv$aOrB.$today.pb > $archive/rsv$aOrB.$today.pb.gz
     ln -f $(pwd)/hgPhyloPlace.description.$aOrB.txt $archive/rsv$aOrB.$today.version.txt
 
     # Update 'latest' in $archiveRoot
     for f in $archive/rsv$aOrB.$today.*; do
         latestF=$(echo $(basename $f) | sed -re 's/'$today'/latest/')
         ln -f $f $archiveRoot/$latestF
     done
 
     # Update hgdownload-test link for archive
     asmDir=$(echo $asmAcc \
         | sed -re 's@^(GC[AF])_([0-9]{3})([0-9]{3})([0-9]{3})\.([0-9]+)@\1/\2/\3/\4/\1_\2\3\4.\5@')
-    mkdir -p /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/$y/$m
-    ln -sf $archive /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/$y/$m
-    # rsync to hgdownload hubs dir
-    rsync -v -a -L --delete /usr/local/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/* \
-        qateam@hgdownload.soe.ucsc.edu:/mirrordata/hubs/$asmDir/UShER_RSV-$aOrB/
+    mkdir -p /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/$y/$m
+    ln -sf $archive /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/$y/$m
+    # rsync to hgdownload{1,2} hubs dir
+    for h in hgdownload1 hgdownload2; do
+        rsync -a -L --delete /data/apache/htdocs-hgdownload/hubs/$asmDir/UShER_RSV-$aOrB/* \
+            qateam@$h:/mirrordata/hubs/$asmDir/UShER_RSV-$aOrB/
+    done
 done
 
 set +o pipefail
 grep 'Could not' annotate.*.log | cat
 grep skipping annotate.*.log | cat
 set -o pipefail
 
 cat hgPhyloPlace.description.A.txt
 zcat rsvA.$today.metadata.tsv.gz | tail -n+2 | cut -f 13,15 | sort | uniq -c
 echo ""
 cat hgPhyloPlace.description.B.txt
 zcat rsvB.$today.metadata.tsv.gz | tail -n+2 | cut -f 13,15 | sort | uniq -c
 echo ""
-echo "Ramaekers clades:"
+echo "RSV GCC clades:"
 zcat rsvA.$today.metadata.tsv.gz | tail -n+2 | cut -f 14,16,17 | sort | uniq -c
 echo ""
 zcat rsvB.$today.metadata.tsv.gz | tail -n+2 | cut -f 14,16,17 | sort | uniq -c
 
 rm -f mutation-paths.txt *.pre*.pb final-tree.nh *.opt.pb *.gClade.pb *.pbintermediate*.pb
 nice gzip -f *.log *.tsv move_log* *.stderr samples.* sample-clades.*