3feec259f20a7cc03affb9fae613492486bdd91a
angie
  Wed Jul 28 15:41:37 2021 -0700
Move out the extraction of the public-sequences-only tree from the combined tree into a separate script for ease of manually continuing runs when there's a failure (most often matUtils annotate -P failing due to a changed path in the tree).

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 91c8f79..fb3411d 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -17,33 +17,30 @@
 prevDate=$1
 problematicSitesVcf=$2
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
-y=$(date +%Y)
-m=$(date +%m)
-d=$(date +%d)
 
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 renaming=oldAndNewNames
 
 if [ ! -s new.masked.vcf.gz ]; then
 
 # Make lists of sequences already in the tree.
 $matUtils extract -i $prevProtobufMasked -u prevNames
@@ -375,130 +372,57 @@
 tail -n+2 $cncbDir/cncb.metadata.tsv \
 | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") {
             print $2, "", $10, $11, $9, $5, $6} }' \
 | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@;  s/Sapiens/sapiens/;' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
 zcat $gisaidDir/metadata_batch_$today.tsv.gz \
 | grep -Fwf <(cut -f 2 $renaming | grep EPI_ISL | cut -d\| -f 2) \
 | tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \
     >> gisaidAndPublic.$today.metadata.tsv
 wc -l gisaidAndPublic.$today.metadata.tsv
-gzip gisaidAndPublic.$today.metadata.tsv
+gzip -f gisaidAndPublic.$today.metadata.tsv
 
 # version/description files
 cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
 echo "sarscov2phylo release 13-11-20; GISAID, NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
     > version.plusGisaid.txt
 $matUtils extract -i gisaidAndPublic.$today.masked.pb -u samples.$today
 sampleCountComma=$(echo $(wc -l < samples.$today) \
                    | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
 echo "$sampleCountComma genomes from GISAID, GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
     > hgPhyloPlace.plusGisaid.description.txt
 
 # Add nextclade annotations to protobuf
 time $matUtils annotate -T 50 \
     -l \
     -i gisaidAndPublic.$today.masked.pb \
     -P ../nextstrain.clade-paths.tsv \
     -o gisaidAndPublic.$today.masked.nextclade.pb
 
 # Add pangolin lineage annotations to protobuf.
 time $matUtils annotate -T 50 \
     -i gisaidAndPublic.$today.masked.nextclade.pb \
     -P ../pango.clade-paths.tsv \
     -o gisaidAndPublic.$today.masked.nextclade.pangolin.pb
 
 mv gisaidAndPublic.$today.masked{,.unannotated}.pb
-ln gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
+ln -f gisaidAndPublic.$today.masked.nextclade.pangolin.pb gisaidAndPublic.$today.masked.pb
 
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
 done
 
-# Extract public samples from tree
-grep -v EPI_ISL_ samples.$today > newPublicNames
-$matUtils extract -i gisaidAndPublic.$today.masked.pb \
-    -s newPublicNames \
-    -O -o public-$today.all.masked.pb
-# Extract Newick and VCF from public-only tree
-$matUtils extract -i public-$today.all.masked.pb \
-    -t public-$today.all.nwk \
-    -v public-$today.all.masked.vcf
-gzip -f public-$today.all.masked.vcf
-zcat gisaidAndPublic.$today.metadata.tsv.gz \
-| grep -v EPI_ISL_ \
-| gzip -c \
-    > public-$today.metadata.tsv.gz
-
-# Add nextclade annotations to public protobuf
-time $matUtils annotate -T 50 \
-    -l \
-    -i public-$today.all.masked.pb \
-    -P ../nextstrain.clade-paths.public.tsv \
-    -o public-$today.all.masked.nextclade.pb
-
-# Add pangolin lineage annotations to public protobuf
-time $matUtils annotate -T 50 \
-    -i public-$today.all.masked.nextclade.pb \
-    -P ../pango.clade-paths.public.tsv \
-    -o public-$today.all.masked.nextclade.pangolin.pb
-
-rm public-$today.all.masked.pb
-ln -f public-$today.all.masked.nextclade.pangolin.pb public-$today.all.masked.pb
-
-cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/')
-echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \
-    > version.txt
-
-$matUtils extract -i public-$today.all.masked.pb -u samples.public.$today
-sampleCountComma=$(echo $(wc -l < samples.public.$today) \
-                   | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;')
-echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \
-    > hgPhyloPlace.description.txt
-
-
-# Link to public trees download directory hierarchy
-archiveRoot=/hive/users/angie/publicTrees
-archive=$archiveRoot/$y/$m/$d
-mkdir -p $archive
-gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz
-ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
-gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz
-ln `pwd`/public-$today.metadata.tsv.gz $archive/
-gzip -c public-$today.all.masked.nextclade.pangolin.pb \
-    > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz
-gzip -c ../nextstrain.clade-paths.public.tsv > $archive/nextstrain.clade-paths.public.tsv.gz
-gzip -c ../pango.clade-paths.public.tsv > $archive/pango.clade-paths.public.tsv.gz
-ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt
-
-# Update 'latest' in $archiveRoot
-ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz
-ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb
-ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz
-ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz
-ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz
-ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt
-
-# Update hgdownload-test link for archive
-mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
-ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m
-
-# Update links to latest public protobuf and metadata in hgwdev cgi-bin directories
-for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
-    ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb
-    ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz
-    ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt
-done
+$scriptDir/extractPublicTree.sh $today