02e715d7eb789c5a3fc5399dc6138e94f4fdaa85
angie
  Fri Aug 26 12:02:40 2022 -0700
Add a hook for a prevDate.masked.useMe.pb file so I can do some major pruning, and let sequences be added back the next day without worrying that I've removed too much from the version used by hgPhyloPlace.  Update column indices for latest version of nextclade.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index e021c8e..a5af428 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -25,31 +25,37 @@
 
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 ncbiDir=$ottoDir/ncbi.latest
 cogUkDir=$ottoDir/cogUk.latest
 cncbDir=$ottoDir/cncb.latest
 gisaidDir=/hive/users/angie/gisaid
 minReal=20000
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 epiToPublic=$gisaidDir/epiToPublicAndDate.latest
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
+# If there's a version that I didn't want to push out to the main site, but wanted to be used
+# as the basis for the next day's build (for example with some extra pruning), use that:
+if [ -e $ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.useMe.pb ]; then
+    prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.useMe.pb
+else
     prevProtobufMasked=$ottoDir/$prevDate/gisaidAndPublic.$prevDate.masked.pb
+fi
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 
 renaming=oldAndNewNames
 
 if [ "$baseProtobuf" == "" ]; then
     baseProtobuf=$prevProtobufMasked
 fi
 
 # Make lists of sequences already in the tree.
 $matUtils extract -i $baseProtobuf -u prevNames
 
 # Before updating the tree with new sequences, update the names used in the tree:
@@ -162,37 +168,37 @@
 grep -Fwf prevGbAcc $epiToPublic | cut -f 1 >> prevGisaid
 grep -Fwf prevCogUk $epiToPublic | cut -f 1 >> prevGisaid
 wc -l prev*
 
 # Exclude some sequences based on nextclade counts of reversions and other-clade mutations.
 zcat $gisaidDir/chunks/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > gisaid.dropoutContam
 zcat $ncbiDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > gb.dropoutContam
 zcat $cogUkDir/nextclade.full.tsv.gz \
 | $scriptDir/findDropoutContam.pl > cog.dropoutContam
 cut -f 1 *.dropoutContam \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     > dropoutContam.ids
 # Also exclude sequences with unbelievably low numbers of mutations given sampling dates.
-zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,5 \
+zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,6 \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \
 | $scriptDir/findRefBackfill.pl > gisaid.refBackfill
-zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,5 | sort \
+zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,6 | sort \
 | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \
 | $scriptDir/findRefBackfill.pl > gb.refBackfill
-zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,5 | sort \
+zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,6 | sort \
 | join -t $'\t' <(cut -d, -f 1,5 $cogUkDir/cog_metadata.csv | tr , $'\t' | sort) - \
 | $scriptDir/findRefBackfill.pl > cog.refBackfill
 cut -f 1 *.refBackfill > refBackfill.ids
 sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \
 | grep -vFwf <(tail -n+4 $scriptDir/includeRecombinants.tsv | cut -f 1) \
     > exclude.ids
 
 # Get new GenBank sequences with at least $minReal non-N bases.
 # Exclude seqs in the tree with EPI IDs that have been mapped in the very latest $epiToPublic.
 set +o pipefail
 egrep $'\t''[A-Z][A-Z][0-9]{6}\.[0-9]+' $epiToPublic \
 | grep -Fwf prevGisaid - \
 | grep -vFwf prevGbAcc \
 | cat \
     >> prevGbAcc