e931fd3e5fdab83eeb110b2450a0f365eff170e2 angie Mon Oct 4 17:04:17 2021 -0700 Omit ORF1a from taxodium amino acid annotations (redundant with ORF1ab). Add optional baseProtobuf arg to updateCombinedTree.sh / makeNewMaskedVcf.sh to start with a custom protobuf (e.g. optimized) instead of eyesterday's protobuf. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 8602463..93b1303 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -1,51 +1,52 @@ #!/bin/bash set -beEu -x -o pipefail # Do not modify this script, modify the source tree copy: # kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh usage() { - echo "usage: $0 prevDate today problematicSitesVcf" + echo "usage: $0 prevDate today problematicSitesVcf [baseProtobuf]" echo "This assumes that ncbi.latest and cogUk.latest links/directories have been updated." } -if [ $# != 3 ]; then +if [ $# != 3 && $# != 4]; then usage exit 1 fi prevDate=$1 today=$2 problematicSitesVcf=$3 +baseProtobuf=$4 ottoDir=/hive/data/outside/otto/sarscov2phylo cncbDir=$ottoDir/cncb.latest gisaidDir=/hive/users/angie/gisaid epiToPublic=$gisaidDir/epiToPublicAndDate.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh mkdir -p $ottoDir/$today cd $ottoDir/$today usherDir=~angie/github/usher usher=$usherDir/build/usher matUtils=$usherDir/build/matUtils if [ ! -s new.masked.vcf.gz ]; then - $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf + $scriptDir/makeNewMaskedVcf.sh $prevDate $today $problematicSitesVcf $baseProtobuf fi if [ ! -s gisaidAndPublic.$today.masked.pb ]; then $scriptDir/usherClusterRun.sh $today # Prune samples with too many private mutations and internal branches that are too long. $matUtils extract -i gisaidAndPublic.$today.masked.preTrim.pb \ -b 30 \ -O -o gisaidAndPublic.$today.masked.pb fi # Exclude sequences with a very high number of EPPs from future runs grep ^Current usher.addNew.log \ | awk '$16 >= 10 {print $8;}' \ | awk -F\| '{ if ($3 == "") { print $1; } else { print $2; } }' \ > tooManyEpps.ids @@ -128,27 +129,28 @@ # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have # public names & IDs, we can match them. cut -f 1,3 $epiToPublic > epiToPublic.latest # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \ $dir/public.plusGisaid.latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt ln -sf `pwd`/epiToPublic.latest $dir/ done # Make Taxodium-formatted protobuf for display -zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz > ncbiGenes.gtf +zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz \ +| grep -v '"ORF1a"' > ncbiGenes.gtf zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa zcat gisaidAndPublic.$today.metadata.tsv.gz > metadata.tmp.tsv time $matUtils extract -i gisaidAndPublic.$today.masked.pb \ -f wuhCor1.fa \ -g ncbiGenes.gtf \ -M metadata.tmp.tsv \ --write-taxodium gisaidAndPublic.$today.masked.taxodium.pb rm metadata.tmp.tsv wuhCor1.fa gzip -f gisaidAndPublic.$today.masked.taxodium.pb $scriptDir/extractPublicTree.sh $today