ea4eb88f661eb1d288f61981093d402830496603 angie Mon Mar 1 01:01:43 2021 -0800 matToVcf has been replaced by 'matUtils convert'. Also, use 'matUtils annotate's new -l option to clear old clade annotations before reassigning them on the updated protobuf. diff --git src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh index 8adb374..b108dfe 100755 --- src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/updatePublicTree.sh @@ -26,31 +26,31 @@ echo "prevProtobufMasked=$prevProtobufMasked" echo "prevProtobufUnmasked=$prevProtobufUnmasked" echo "prevMeta=$prevMeta" echo "problematicSitesVcf=$problematicSitesVcf" ncbiDir=$ottoDir/ncbi.latest cogUkDir=$ottoDir/cogUk.latest cncbDir=$ottoDir/cncb.latest today=$(date +%F) minReal=20000 ref2bit=/hive/data/genomes/wuhCor1/wuhCor1.2bit usherDir=~angie/github/usher usher=$usherDir/build/usher -matToVcf=$usherDir/build/matToVcf +matUtils=$usherDir/build/matUtils find_parsimonious_assignments=~angie/github/strain_phylogenetics/build/find_parsimonious_assignments scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh # Before we get started, make sure cog_metadata has the columns we're expecting: expectedHeaderStart='sequence_name,country,adm1,pillar_2,sample_date,epi_week,lineage,' cogUkMetHeader=$(head -1 $cogUkDir/cog_metadata.csv | grep ^$expectedHeaderStart) # The grep will fail if they change one of the first 7 fields. mkdir -p $ottoDir/$today cd $ottoDir/$today # Disregard pipefail just for this pipe because head prevents the cat from completing: @@ -136,51 +136,51 @@ wc -l $renaming # Make VCF -- first without masking, for hgTracks time cat <(twoBitToFa $ref2bit stdout) $alignedFa \ | faToVcf stdin stdout \ | vcfRenameAndPrune stdin $renaming stdout \ | gzip -c \ > new.vcf.gz # Use usher to add new public sequences to tree of mapped subset and infer missing/ambig bases. time $usher -u -T 50 \ -v new.vcf.gz \ -i $prevProtobufUnmasked \ -o public-$today.all.notMasked.pb \ >& usher.addNewUnmasked.log #~10 hours for 56k seqs, ~72m for 6k -$matToVcf -i public-$today.all.notMasked.pb -v public-$today.all.vcf +$matUtils convert -i public-$today.all.notMasked.pb -v public-$today.all.vcf ls -l public-$today.all.vcf wc -l public-$today.all.vcf bgzip -f public-$today.all.vcf tabix -p vcf public-$today.all.vcf.gz # Then with masking to collapse the tree & make protobuf for usher/hgPhyloPlace: tawk '{ if ($1 ~ /^#/) { print; } else if ($7 == "mask") { $1 = "NC_045512v2"; print; } }' \ $problematicSitesVcf > mask.vcf time vcfFilter -excludeVcf=mask.vcf new.vcf.gz \ | gzip -c \ > new.masked.vcf.gz time $usher -u -T 50 \ -v new.masked.vcf.gz \ -i $prevProtobufMasked \ -o public-$today.all.masked.pb \ >& usher.addNew.log mv uncondensed-final-tree.nh public-$today.all.nwk # Masked VCF that goes with the masked protobuf, for public distribution for folks who want # to run UShER and also have the VCF. -$matToVcf -i public-$today.all.masked.pb -v public-$today.all.masked.vcf +$matUtils convert -i public-$today.all.masked.pb -v public-$today.all.masked.vcf gzip public-$today.all.masked.vcf # Make allele-frequency-filtered versions # Disregard pipefail just for this pipe because head prevents the cat from completing: set +o pipefail sampleCount=$(zcat public-$today.all.vcf.gz | head | grep ^#CHROM | sed -re 's/\t/\n/g' \ | tail -n+10 | wc -l) set -o pipefail minAc001=$(( (($sampleCount + 999) / 1000) )) vcfFilter -minAc=$minAc001 -rename public-$today.all.vcf.gz \ > public-$today.all.minAf.001.vcf wc -l public-$today.all.minAf.001.vcf bgzip -f public-$today.all.minAf.001.vcf tabix -p vcf public-$today.all.minAf.001.vcf.gz @@ -299,41 +299,42 @@ > version.txt fi sampleCountComma=$(echo $sampleCount \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.description.txt cp -p public-$today.all.masked.pb{,.bak} # Add nextclade annotations to protobuf zcat public-$today.metadata.tsv.gz \ | tail -n+2 | tawk '$8 != "" {print $8, $1;}' \ | sed -re 's/^20E \(EU1\)/20E.EU1/;' \ > cladeToPublicName -time ~/github/usher/build/matUtils annotate -T 50 \ +time $matUtils annotate -T 50 \ + -l \ -i public-$today.all.masked.pb \ -c cladeToPublicName \ -o public-$today.all.masked.nextclade.pb \ >& annotate.nextclade.out # Add pangolin lineage annotations to protobuf zcat public-$today.metadata.tsv.gz \ | tail -n+2 | tawk '$9 != "" {print $9, $1;}' \ > lineageToPublicName -time ~/github/usher/build/matUtils annotate -T 50 \ +time $matUtils annotate -T 50 \ -i public-$today.all.masked.nextclade.pb \ -c lineageToPublicName \ -o public-$today.all.masked.nextclade.pangolin.pb \ >& annotate.pangolin.out # Not all the Pangolin lineages can be assigned nodes so for now just use nextclade cp -p public-$today.all.masked.nextclade.pb public-$today.all.masked.pb # Update gbdb links -- not every day, too much churn for getting releases out and the # tracks are getting unmanageably large for VCF. if false; then for f in public-$today.all{,.minAf*}.vcf.gz ; do t=$(echo $f | sed -re "s/-$today//;") ln -sf `pwd`/$f /gbdb/wuhCor1/sarsCov2PhyloPub/$t ln -sf `pwd`/$f.tbi /gbdb/wuhCor1/sarsCov2PhyloPub/$t.tbi