3feec259f20a7cc03affb9fae613492486bdd91a
angie
  Wed Jul 28 15:41:37 2021 -0700
Move out the extraction of the public-sequences-only tree from the combined tree into a separate script for ease of manually continuing runs when there's a failure (most often matUtils annotate -P failing due to a changed path in the tree).

diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
new file mode 100755
index 0000000..59ffeeb
--- /dev/null
+++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+set -beEu -x -o pipefail
+
+#	Do not modify this script, modify the source tree copy:
+#	kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+
+usage() {
+    echo "usage: $0 date"
+    echo "This assumes that \$date/gisaidAndPublic.\$date.masked.pb has been generated."
+}
+
+if [ $# != 1 ]; then
+  usage
+  exit 1
+fi
+
+today=$1
+ottoDir=/hive/data/outside/otto/sarscov2phylo
+cncbDir=$ottoDir/cncb.latest
+
+scriptDir=$(dirname "${BASH_SOURCE[0]}")
+source $scriptDir/util.sh
+
+usherDir=~angie/github/usher
+matUtils=$usherDir/build/matUtils
+
+cd $ottoDir/$today
+
+# Extract public samples from tree
+grep -v EPI_ISL_ samples.$today > newPublicNames
+$matUtils extract -i gisaidAndPublic.$today.masked.pb \
+    -s newPublicNames \
+    -O -o public-$today.all.masked.pb
+# Extract Newick and VCF from public-only tree
+time $matUtils extract -i public-$today.all.masked.pb \
+    -t public-$today.all.nwk \
+    -v public-$today.all.masked.vcf
+time gzip -f public-$today.all.masked.vcf
+zcat gisaidAndPublic.$today.metadata.tsv.gz \
+| grep -v EPI_ISL_ \
+| gzip -c \
+    > public-$today.metadata.tsv.gz
+
+# Add nextclade annotations to public protobuf
+time $matUtils annotate -T 50 \
+    -l \
+    -i public-$today.all.masked.pb \
+    -P ../nextstrain.clade-paths.public.tsv \
+    -o public-$today.all.masked.nextclade.pb
+
+# Add pangolin lineage annotations to public protobuf
+time $matUtils annotate -T 50 \
+    -i public-$today.all.masked.nextclade.pb \
+    -P ../pango.clade-paths.public.tsv \
+    -o public-$today.all.masked.nextclade.pangolin.pb
+
+rm public-$today.all.masked.pb
+ln -f public-$today.all.masked.nextclade.pangolin.pb public-$today.all.masked.pb
+
+cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
+echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
+    > version.txt
+
+$matUtils extract -i public-$today.all.masked.pb -u samples.public.$today
+sampleCountComma=$(echo $(wc -l < samples.public.$today) \
+                   | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
+echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
+    > hgPhyloPlace.description.txt
+
+
+# Link to public trees download directory hierarchy
+archiveRoot=/hive/users/angie/publicTrees
+read y m d < <(echo $today | sed -re 's/-/ /g')
+archive=$archiveRoot/$y/$m/$d
+mkdir -p $archive
+gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
+ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
+gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
+ln `pwd`/public-$today.metadata.tsv.gz $archive/
+gzip -c public-$today.all.masked.nextclade.pangolin.pb \
+    > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
+gzip -c ../nextstrain.clade-paths.public.tsv > $archive/nextstrain.clade-paths.public.tsv.gz
+gzip -c ../pango.clade-paths.public.tsv > $archive/pango.clade-paths.public.tsv.gz
+ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
+
+# Update 'latest' in $archiveRoot
+ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
+ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
+ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
+ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
+ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
+ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
+
+# Update hgdownload-test link for archive
+mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
+ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
+
+# Update links to latest public protobuf and metadata in hgwdev cgi-bin directories
+for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
+    ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb
+    ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz
+    ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt
+done