f78b8c4c7e8c23acd998ce19c7ef5af7c404adc4
angie
  Wed Dec 2 16:54:28 2020 -0800
Use Yatish's find_parsimonious_assignments program to make parsimony score bigWigs.
Also save full output of find_parsimonious_assignments for collaborators (Russ request).

diff --git src/hg/utils/otto/sarscov2phylo/processRelease.sh src/hg/utils/otto/sarscov2phylo/processRelease.sh
index 22b7fe6..979fcc4 100755
--- src/hg/utils/otto/sarscov2phylo/processRelease.sh
+++ src/hg/utils/otto/sarscov2phylo/processRelease.sh
@@ -13,30 +13,33 @@
   exit 1
 fi
 
 releaseLabel=$1
 problematicSitesVcf=$2
 
 echo "releaseLabel=$releaseLabel problematicSitesVcf=$problematicSitesVcf"
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 
 # UShER (and matToVcf)
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matToVcf=$usherDir/build/matToVcf
 
+# strain_phylogenetics / find_parsimonious_assignments for parsimony scores
+find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments
+
 # Files generated by getRelease.sh
 metadata=gisaid_metadata.tsv
 fasta=gisaid_sequences.fasta
 alignedFasta=gisaid_aligned.fasta
 
 # We'll need to prune the tree to keep only those sequences that are in metadata and fasta:
 grep ^\> $fasta | sed -re 's/^>//' | sort > namesInFasta
 
 # Make renaming files for tree (EPI_ IDs) and VCF (isolate names): add strain name and
 # date with shortened year.
 awk -F"\t" '{print $3 "\t" $1 "|" $3 "|" $5;}' $metadata \
 | grep -Fwf namesInFasta \
 | sed -re 's/\|20/\|/;' \
 | sort > tree.renaming
 awk -F"\t" '{print $1 "\t" $1 "|" $3 "|" $5;}' $metadata \
@@ -74,38 +77,60 @@
   --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.notMasked.pb \
   --write-uncondensed-final-tree
 mv uncondensed-final-tree.nh ft_SH.reroot.collapsed.notMasked.nh 
 $matToVcf -i sarscov2phylo-$releaseLabel.notMasked.pb \
     -v gisaid-$releaseLabel.vcf
 bgzip -f gisaid-$releaseLabel.vcf
 tabix -p vcf gisaid-$releaseLabel.vcf.gz
 
 # Remove problematic sites recommended for masking for usher/phyloPlace
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 time vcfFilter -excludeVcf=mask.vcf gisaid-$releaseLabel.unfiltered.vcf.gz \
 | gzip -c \
     > gisaid-$releaseLabel.unfiltered.masked.vcf.gz
 
+# Full find_parsimonious_assignments output on uncollapsed tree for collaborators
+time $find_parsimonious_assignments --tree ft_SH.reroot.nh \
+    --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \
+| gzip -c \
+    > find_parsimonious_assignments.$releaseLabel.out.gz
+
 # Run usher to collapse tree and make masked protobuf for hgPhyloPlace
 time $usher -c \
   --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \
   --tree ft_SH.reroot.nh \
   --save-mutation-annotated-tree sarscov2phylo-$releaseLabel.masked.pb \
   --write-uncondensed-final-tree
 mv uncondensed-final-tree.nh ft_SH.reroot.collapsed.nh 
 
+# Parsimony scores on collapsed tree
+time $find_parsimonious_assignments --tree ft_SH.reroot.collapsed.nh \
+    --vcf <(gunzip -c gisaid-$releaseLabel.vcf.gz) \
+| tail -n+2 \
+| sed -re 's/^[A-Z]([0-9]+)[A-Z,]+.*parsimony_score=([0-9]+).*/\1\t\2/;' \
+| tawk '{print "NC_045512v2", $1-1, $1, $2;}' \
+| sort -k2n,2n \
+    > gisaid-$releaseLabel.parsimony.bg
+bedGraphToBigWig gisaid-$releaseLabel.parsimony.bg /hive/data/genomes/wuhCor1/chrom.sizes \
+    gisaid-$releaseLabel.parsimony.bw
+
+# Full find_parsimonious_assignments output on collapsed tree for collaborators
+time $find_parsimonious_assignments --tree ft_SH.reroot.collapsed.nh \
+    --vcf gisaid-$releaseLabel.unfiltered.masked.vcf.gz \
+    > find_parsimonious_assignments.$releaseLabel.collapsed.out
+
 # Now filter by frequency for faster track display: alt al freq >= 0.001
 sampleCount=$(wc -l < tree.renamedPruned.ids.sorted)
 minAc001=$(( (($sampleCount + 999) / 1000) ))
 time vcfFilter -minAc=$minAc001 -rename gisaid-$releaseLabel.vcf.gz \
     > gisaid-$releaseLabel.minAf.001.vcf
 ls -l gisaid-$releaseLabel.minAf.001.vcf
 wc -l gisaid-$releaseLabel.minAf.001.vcf
 bgzip -f gisaid-$releaseLabel.minAf.001.vcf
 tabix -p vcf gisaid-$releaseLabel.minAf.001.vcf.gz
 
 # Alt al freq >= 0.01
 minAc01=$(( (($sampleCount + 99) / 100) ))
 time vcfFilter -minAc=$minAc01 -rename gisaid-$releaseLabel.minAf.001.vcf.gz \
     > gisaid-$releaseLabel.minAf.01.vcf
 ls -l gisaid-$releaseLabel.minAf.01.vcf