89a4fdc0fec59184412fd55ae77183b7b2acb35f angie Fri Mar 12 15:21:33 2021 -0800 Use the latest matUtils subcommand to get VCF; add subtracks by quarterly date ranges. diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh index 32c13ae..bc1f847 100755 --- src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh @@ -37,30 +37,31 @@ ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh # Before we get started, make sure cog_metadata has the columns we're expecting: expectedHeaderStart='sequence_name,country,adm1,pillar_2,sample_date,epi_week,lineage,' cogUkMetHeader=$(head -1 $cogUkDir/cog_metadata.csv | grep ^$expectedHeaderStart) # The grep will fail if they change one of the first 7 fields. +# If it fails, update cog_metadata.csv column indices below and in getCogUk.sh too. mkdir -p $ottoDir/$today cd $ottoDir/$today # Disregard pipefail just for this pipe because head prevents the cat from completing: set +o pipefail xcat $prevVcf | head | grep ^#CHROM | sed -re 's/\t/\n/g;' | tail -n+10 > prevNames set -o pipefail awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^[A-Z]{2}[0-9]{6}\.[0-9]' > prevGbAcc awk -F\| '{if ($3 == "") { print $1; } else { print $2; } }' prevNames \ | grep -E '^(England|Northern|Scotland|Wales)' > prevCogUk # Get new GenBank sequences with at least $minReal non-N bases. xzcat $ncbiDir/genbank.fa.xz \ @@ -136,51 +137,51 @@ wc -l $renaming # Make VCF -- first without masking, for hgTracks time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ | faToVcf stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | gzip -c \ > new.vcf.gz # Use usher to add new public sequences to tree of mapped subset and infer missing/ambig bases. time $usher -u -T 50 \ -v new.vcf.gz \ -i $prevProtobufUnmasked \ -o public-$today.all.notMasked.pb \ >& usher.addNewUnmasked.log #~10 hours for 56k seqs, ~72m for 6k -$matUtils convert -i public-$today.all.notMasked.pb -v public-$today.all.vcf +$matUtils extract -i public-$today.all.notMasked.pb -v public-$today.all.vcf ls -l public-$today.all.vcf wc -l public-$today.all.vcf bgzip -f public-$today.all.vcf tabix -p vcf public-$today.all.vcf.gz # Then with masking to collapse the tree & make protobuf for usher/hgPhyloPlace: tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf time vcfFilter -excludeVcf=mask.vcf new.vcf.gz \ | gzip -c \ > new.masked.vcf.gz time $usher -u -T 50 \ -v new.masked.vcf.gz \ -i $prevProtobufMasked \ -o public-$today.all.masked.pb \ >& usher.addNew.log mv uncondensed-final-tree.nh public-$today.all.nwk # Masked VCF that goes with the masked protobuf, for public distribution for folks who want # to run UShER and also have the VCF. -$matUtils convert -i public-$today.all.masked.pb -v public-$today.all.masked.vcf +$matUtils extract -i public-$today.all.masked.pb -v public-$today.all.masked.vcf gzip public-$today.all.masked.vcf # Make allele-frequency-filtered versions # Disregard pipefail just for this pipe because head prevents the cat from completing: set +o pipefail sampleCount=$(zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' \ | tail -n+10 | wc -l) set -o pipefail minAc001=$(( (($sampleCount + 999) / 1000) )) vcfFilter -minAc=$minAc001 -rename public-$today.all.vcf.gz \ > public-$today.all.minAf.001.vcf wc -l public-$today.all.minAf.001.vcf bgzip -f public-$today.all.minAf.001.vcf tabix -p vcf public-$today.all.minAf.001.vcf.gz @@ -318,46 +319,83 @@ >& annotate.nextclade.out # Add pangolin lineage annotations to protobuf zcat public-$today.metadata.tsv.gz \ | tail -n+2 | tawk '$9 != "" {print $9, $1;}' \ > lineageToPublicName time $matUtils annotate -T 50 \ -i public-$today.all.masked.nextclade.pb \ -c lineageToPublicName \ -o public-$today.all.masked.nextclade.pangolin.pb \ >& annotate.pangolin.out # Not all the Pangolin lineages can be assigned nodes so for now just use nextclade cp -p public-$today.all.masked.nextclade.pb public-$today.all.masked.pb +# Divide up allele-frequency-filtered versions into quarterly versions +set +o pipefail +zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' | tail -n+10 \ +| awk -F\| '{if ($3 == "") { print $2 "\t" $0; } else { print $3 "\t" $0;} }' \ +| egrep '^[0-9][0-9]-[0-9][0-9]' \ +| sed -re 's/^([0-9][0-9])-([0-9][0-9])(-[0-9][0-9])?(.*)/\1\t\2\4/;' \ +| tawk '($1 == 19 && $2 == 12) || ($1 > 19 && $2 > 0)' \ +| sort > samplesByDate +set -o pipefail + +tawk '$1 < 20 || ($1 == 20 && $2 <= 3) { print $3, $3; }' samplesByDate \ + > 20Q1.renaming +tawk '$1 == 20 && $2 >= 4 && $2 <= 6 { print $3, $3; }' samplesByDate \ + > 20Q2.renaming +tawk '$1 == 20 && $2 >= 7 && $2 <= 9 { print $3, $3; }' samplesByDate \ + > 20Q3.renaming +tawk '$1 == 20 && $2 >= 10 { print $3, $3; }' samplesByDate \ + > 20Q4.renaming +tawk '$1 == 21 && $2 <= 3{ print $3, $3; }' samplesByDate \ + > 21Q1.renaming + +for q in 20Q1 20Q2 20Q3 20Q4 21Q1; do + for af in 01 001; do + vcfRenameAndPrune public-$today.all.minAf.$af.vcf.gz $q.renaming \ + public-$today.$q.minAf.$af.vcf + wc -l public-$today.$q.minAf.$af.vcf + bgzip public-$today.$q.minAf.$af.vcf + tabix -p vcf public-$today.$q.minAf.$af.vcf.gz + done +done + # Update gbdb links -- not every day, too much churn for getting releases out and the # tracks are getting unmanageably large for VCF. if false; then for f in public-$today.all{,.minAf*}.vcf.gz ; do t=$(echo $f | sed -re "s/-$today//;") ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi done ln -sf `pwd`/public-$today.all.nwk /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nwk ln -sf `pwd`/public-$today.all.parsimony.bw \ /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.parsimony.bw ln -sf `pwd`/public-$today.lineageColors.gz \ /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.lineageColors.gz ln -sf `pwd`/public-$today.nextstrainColors.gz \ /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.nextstrainColors.gz ln -sf `pwd`/version.txt /gbdb/wuhCor1/sarsCov2PhyloPub/public.all.version.txt +for q in 20Q1 20Q2 20Q3 20Q4 21Q1; do + for af in 01 001; do + ln -sf `pwd`/public-$today.$q.minAf.$af.vcf.gz \ + /gbdb/wuhCor1/sarsCov2PhyloPub/public.$q.minAf.$af.vcf.gz + done +done fi # Link to public trees download directory hierarchy y=$(date +%Y) m=$(date +%m) d=$(date +%d) archiveRoot=/hive/users/angie/publicTrees archive=$archiveRoot/$y/$m/$d mkdir -p $archive ln `pwd`/public-$today.all.nwk $archive/ ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ ln `pwd`/public-$today.metadata.tsv.gz $archive/ ln `pwd`/public-$today.all.masked.nextclade.pangolin.pb $archive/ ln `pwd`/cladeToPublicName $archive/ ln `pwd`/lineageToPublicName $archive/