src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh 89a4fdc0fec59184412fd55ae77183b7b2acb35f

89a4fdc0fec59184412fd55ae77183b7b2acb35f
angie
  Fri Mar 12 15:21:33 2021 -0800
Use the latest matUtils subcommand to get VCF;  add subtracks by quarterly date ranges.

diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
index 32c13ae..bc1f847 100755
--- src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh
@@ -37,30 +37,31 @@
 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit
 
 usherDir=~angie/github/usher
 usher=$usherDir/build/usher
 matUtils=$usherDir/build/matUtils
 find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 
 source $scriptDir/util.sh
 
 # Before we get started, make sure cog_metadata has the columns we're expecting:
 expectedHeaderStart='sequence_name,country,adm1,pillar_2,sample_date,epi_week,lineage,'
 cogUkMetHeader=$(head -1 $cogUkDir/cog_metadata.csv | grep ^$expectedHeaderStart)
 # The grep will fail if they change one of the first 7 fields.
+# If it fails, update cog_metadata.csv column indices below and in getCogUk.sh too.
 
 mkdir -p $ottoDir/$today
 cd $ottoDir/$today
 
 # Disregard pipefail just for this pipe because head prevents the cat from completing:
 set +o pipefail
 xcat $prevVcf | head | grep ^#CHROM | sed -re 's/\t/\n/g;' | tail -n+10 > prevNames
 set -o pipefail
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc
 awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \
 | grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk
 
 # Get new GenBank sequences with at least $minReal non-N bases.
 xzcat $ncbiDir/genbank.fa.xz \
@@ -136,51 +137,51 @@
 wc -l $renaming
 
 # Make VCF -- first without masking, for hgTracks
 time cat <(twoBitToFa $ref2bit stdout) $alignedFa \
 | faToVcf stdin stdout \
 | vcfRenameAndPrune stdin $renaming stdout \
 | gzip -c \
     > new.vcf.gz
 # Use usher to add new public sequences to tree of mapped subset and infer missing/ambig bases.
 time $usher -u -T 50 \
     -v new.vcf.gz \
     -i $prevProtobufUnmasked \
     -o public-$today.all.notMasked.pb \
     >& usher.addNewUnmasked.log
 #~10 hours for 56k seqs, ~72m for 6k
-$matUtils convert -i public-$today.all.notMasked.pb -v public-$today.all.vcf
+$matUtils extract -i public-$today.all.notMasked.pb -v public-$today.all.vcf
 ls -l public-$today.all.vcf
 wc -l public-$today.all.vcf
 bgzip -f public-$today.all.vcf
 tabix -p vcf public-$today.all.vcf.gz
 
 # Then with masking to collapse the tree & make protobuf for usher/hgPhyloPlace:
 tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \
     $problematicSitesVcf > mask.vcf
 time vcfFilter -excludeVcf=mask.vcf new.vcf.gz \
 | gzip -c \
     > new.masked.vcf.gz
 time $usher -u -T 50 \
     -v new.masked.vcf.gz \
     -i $prevProtobufMasked \
     -o public-$today.all.masked.pb \
     >& usher.addNew.log
 mv uncondensed-final-tree.nh public-$today.all.nwk
 # Masked VCF that goes with the masked protobuf, for public distribution for folks who want
 # to run UShER and also have the VCF.
-$matUtils convert -i public-$today.all.masked.pb -v public-$today.all.masked.vcf
+$matUtils extract -i public-$today.all.masked.pb -v public-$today.all.masked.vcf
 gzip public-$today.all.masked.vcf
 
 # Make allele-frequency-filtered versions
 # Disregard pipefail just for this pipe because head prevents the cat from completing:
 set +o pipefail
 sampleCount=$(zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' \
               | tail -n+10 | wc -l)
 set -o pipefail
 minAc001=$(( (($sampleCount + 999) / 1000) ))
 vcfFilter -minAc=$minAc001 -rename public-$today.all.vcf.gz \
     > public-$today.all.minAf.001.vcf
 wc -l public-$today.all.minAf.001.vcf
 bgzip -f public-$today.all.minAf.001.vcf
 tabix -p vcf public-$today.all.minAf.001.vcf.gz
 
@@ -318,46 +319,83 @@
     >& annotate.nextclade.out
 
 # Add pangolin lineage annotations to protobuf
 zcat public-$today.metadata.tsv.gz \
 | tail -n+2 | tawk '$9 != "" {print $9, $1;}' \
     > lineageToPublicName
 time $matUtils annotate -T 50 \
     -i public-$today.all.masked.nextclade.pb \
     -c lineageToPublicName \
     -o public-$today.all.masked.nextclade.pangolin.pb \
     >& annotate.pangolin.out
 
 # Not all the Pangolin lineages can be assigned nodes so for now just use nextclade
 cp -p public-$today.all.masked.nextclade.pb public-$today.all.masked.pb
 
+# Divide up allele-frequency-filtered versions into quarterly versions
+set +o pipefail
+zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' | tail -n+10 \
+| awk -F\| '{if ($3 == "") { print $2 "\t" $0; } else { print $3 "\t" $0;} }' \
+| egrep '^[0-9][0-9]-[0-9][0-9]' \
+| sed -re 's/^([0-9][0-9])-([0-9][0-9])(-[0-9][0-9])?(.*)/\1\t\2\4/;' \
+| tawk '($1 == 19 && $2 == 12) || ($1 > 19 && $2 > 0)' \
+| sort > samplesByDate
+set -o pipefail
+
+tawk '$1 < 20 || ($1 == 20 && $2 <= 3) { print $3, $3; }' samplesByDate \
+    > 20Q1.renaming
+tawk '$1 == 20 && $2 >= 4 && $2 <= 6 { print $3, $3; }' samplesByDate \
+    > 20Q2.renaming
+tawk '$1 == 20 && $2 >= 7 && $2 <= 9 { print $3, $3; }' samplesByDate \
+    > 20Q3.renaming
+tawk '$1 == 20 && $2 >= 10 { print $3, $3; }' samplesByDate \
+    > 20Q4.renaming
+tawk '$1 == 21 && $2 <= 3{ print $3, $3; }' samplesByDate \
+    > 21Q1.renaming
+
+for q in 20Q1 20Q2 20Q3 20Q4 21Q1; do
+    for af in 01 001; do
+        vcfRenameAndPrune public-$today.all.minAf.$af.vcf.gz $q.renaming \
+            public-$today.$q.minAf.$af.vcf
+        wc -l public-$today.$q.minAf.$af.vcf
+        bgzip public-$today.$q.minAf.$af.vcf
+        tabix -p vcf public-$today.$q.minAf.$af.vcf.gz
+    done
+done
+
 # Update gbdb links -- not every day, too much churn for getting releases out and the
 # tracks are getting unmanageably large for VCF.
 if false; then
 for f in public-$today.all{,.minAf*}.vcf.gz ; do
     t=$(echo $f | sed -re "s/-$today//;")
     ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t
     ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi
 done
 ln -sf `pwd`/public-$today.all.nwk /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nwk
 ln -sf `pwd`/public-$today.all.parsimony.bw \
     /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.parsimony.bw
 ln -sf `pwd`/public-$today.lineageColors.gz \
     /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.lineageColors.gz
 ln -sf `pwd`/public-$today.nextstrainColors.gz \
     /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nextstrainColors.gz
 ln -sf `pwd`/version.txt /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.version.txt
+for q in 20Q1 20Q2 20Q3 20Q4 21Q1; do
+    for af in 01 001; do
+        ln -sf `pwd`/public-$today.$q.minAf.$af.vcf.gz \
+            /gbdb/wuhCor1/sarsCov2PhyloPub/public.$q.minAf.$af.vcf.gz
+    done
+done
 fi
 
 # Link to public trees download directory hierarchy
 y=$(date +%Y)
 m=$(date +%m)
 d=$(date +%d)
 archiveRoot=/hive/users/angie/publicTrees
 archive=$archiveRoot/$y/$m/$d
 mkdir -p $archive
 ln `pwd`/public-$today.all.nwk $archive/
 ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/
 ln `pwd`/public-$today.metadata.tsv.gz $archive/
 ln `pwd`/public-$today.all.masked.nextclade.pangolin.pb $archive/
 ln `pwd`/cladeToPublicName $archive/
 ln `pwd`/lineageToPublicName $archive/