src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh e931fd3e5fdab83eeb110b2450a0f365eff170e2

e931fd3e5fdab83eeb110b2450a0f365eff170e2
angie
  Mon Oct 4 17:04:17 2021 -0700
Omit ORF1a from taxodium amino acid annotations (redundant with ORF1ab).  Add optional baseProtobuf arg to updateCombinedTree.sh / makeNewMaskedVcf.sh to start with a custom protobuf (e.g. optimized) instead of eyesterday's protobuf.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index bb426e6..3791daa 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -1,60 +1,64 @@
 #!/bin/bash
 set -beEu -x -o pipefail
 
 #	Do not modify this script, modify the source tree copy:
 #	kent/src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
 
 usage() {
-    echo "usage: $0 prevDate today problematicSitesVcf"
+    echo "usage: $0 prevDate today problematicSitesVcf [baseProtobuf]"
     echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated."
 }
 
-if [ $# != 3 ]; then
+if [ $# != 3 && $# != 4]; then
   usage
   exit 1
 fi
 
 prevDate=$1
 today=$2
 problematicSitesVcf=$3
+baseProtobuf=$4
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
 prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
-prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 renaming=oldAndNewNames
 
+if [ "$baseProtobuf" == "" ]; then
+    baseProtobuf=$prevProtobufMasked
+fi
+
 # Make lists of sequences already in the tree.
-$matUtils extract -i $prevProtobufMasked -u prevNames
+$matUtils extract -i $baseProtobuf -u prevNames
 
 # Before updating the tree with new sequences, update the names used in the tree:
 # * Sequences that are already in the tree with EPI_ IDs, but that have been mapped to public IDs
 # * COG-UK sequences that are in GenBank.  Per Sam Nicholls the isolate alone is enough to identify
 #   them -- no need to match country & year which are sometimes incorrect at first.
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '{print $6, $4 "/" $6 "/" $3 "|" $1 "|" $3;}' \
 | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;' \
 | sort \
     > cogUkInGenBankIsolateToNewName
 fastaNames $cogUkDir/cog_all.fasta.xz > cogUk.names
 grep -Fwf <(cut -f 1 cogUkInGenBankIsolateToNewName) cogUk.names \
     > cogUkInGenBank
 set +o pipefail
 # From 2021-05-04 on there should be no more unrenamed COG-UK/ in prevNames, but doesn't hurt
@@ -117,34 +121,34 @@
 # And finally, sometimes there are duplicates due to country or date being corrected in COG-UK
 # metadata.
 grep -vFwf dupsToRemove prevTree.renaming \
 | cut -f 2 | sort | uniq -c | awk '$1 > 1 {print $2;}' \
 | cut -d/ -f 2 \
     > cogUkIsolateStillDup
 grep -Fwf cogUkIsolateStillDup $cogUkDir/cog_metadata.csv \
 | awk -F, '{print $1 "|" $5;}' \
     > cogDupsLatestMetadata
 grep -Fwf cogUkIsolateStillDup prevNames \
 | grep -vFwf dupsToRemove \
 | grep -vFwf cogDupsLatestMetadata \
 | cat \
     >> dupsToRemove
 set -o pipefail
-startingProtobuf=$prevProtobufMasked
+startingProtobuf=$baseProtobuf
 if [ -s dupsToRemove ]; then
     startingProtobuf=prevDupTrimmed.pb
-    $matUtils extract -i $prevProtobufMasked -p -s dupsToRemove -o $startingProtobuf
+    $matUtils extract -i $baseProtobuf -p -s dupsToRemove -o $startingProtobuf
 fi
 
 if [ -s prevTree.renaming ]; then
     $matUtils mask -r prevTree.renaming -i $startingProtobuf -o prevRenamed.pb >& rename.out
     $matUtils extract -i prevRenamed.pb -u prevNames
 else
     ln -sf $startingProtobuf prevRenamed.pb
 fi
 
 # OK, now that the tree names are updated, figure out which seqs are already in there and
 # which need to be added.
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
 grep -E '^(England|Northern|Scotland|Wales)' prevNames \
 | cut -d\| -f1 > prevCogUk