1f64522294183ac3613b48b2d937fa8a666c9afa
angie
  Thu Sep 2 17:55:45 2021 -0700
Move the --max-parsimony post-filtering into usher's new --max-parsimony-per-sample option.  Add mechanism to exclude sequences that sneak past quality filters by faking reference alleles where they should have Ns.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index a2d68e2..fee47a9 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -266,57 +266,59 @@
     zcat $gisaidDir/metadata_batch_$today.tsv.gz \
     | grep -Fwf <(fastaNames newGisaid.filtered.fa) \
     | tawk '{print $3 "\t" $1 "|" $3 "|" $5;}' \
         >> $renaming
 fi
 wc -l $renaming
 
 # Make masked VCF
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 # Add masked VCF to previous protobuf
 #*** With horrible hack for time being, to mask 21987 (because it messes up the Delta branch)...
 #*** TODO: make hgPhyloPlace handle protobufs that don't have all of the latest problematic sites
 #*** masked more gracefully, and update the Problematic Sites track.
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
-| faToVcf -maxDiff=1000 -excludeFile=../tooManyEpps.ids -verbose=2 stdin stdout \
+| faToVcf -maxDiff=1000 \
+    -excludeFile=<(cat ../tooManyEpps.ids ../badBranchSeed.ids) \
+    -verbose=2 stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | vcfFilter -excludeVcf=mask.vcf stdin \
 | tawk '$2 != 21987' \
 | gzip -c \
     > new.masked.vcf.gz
 
 fi # if [ ! -s new.masked.vcf.gz ]
 
 time $usher -u -T 80 \
     -A \
     -e 5 \
+    --max-parsimony-per-sample 20 \
     -v new.masked.vcf.gz \
     -i prevRenamed.pb \
     -o gisaidAndPublic.$today.masked.preTrim.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh gisaidAndPublic.$today.preTrim.nwk
 
 # Exclude sequences with a very high number of EPPs from future runs
 grep ^Current usher.addNew.log \
 | awk '$16 >= 10 {print $8;}' \
 | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \
     >> ../tooManyEpps.ids
 
 # Prune samples with too many private mutations and internal branches that are too long.
 $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \
-    -a 20 \
     -b 30 \
     -O -o gisaidAndPublic.$today.masked.pb
 
 # Metadata for hgPhyloPlace:
 # Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them:
 echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \
     > gisaidAndPublic.$today.metadata.tsv
 # It's not always possible to recreate both old and new names correctly from metadata,
 # so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs.
 cut -f 2 $renaming \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' \
 | sort \
     > idToName
 # NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name
 grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \