95c99e66b78b8b8c4e26bb1ea91a0113d64caf04 angie Thu Aug 19 11:54:39 2021 -0700 matUtils annotate -P fails too often because of little changes in the tree that affect which mutations are collapsed into the same code (or not). More sophisticated path-finding logic in matUtils could help with that, but until then, use the previous day's clade assignments as input for matUtils annotate -c for the big tree, and use the big tree's clade assignments as input for -c for the public-only tree. diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh index 59ffeeb..3ad7483 100755 --- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh @@ -19,81 +19,114 @@ cncbDir=$ottoDir/cncb.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh usherDir=~angie/github/usher matUtils=$usherDir/build/matUtils cd $ottoDir/$today # Extract public samples from tree grep -v EPI_ISL_ samples.$today > newPublicNames $matUtils extract -i gisaidAndPublic.$today.masked.pb \ -s newPublicNames \ -O -o public-$today.all.masked.pb -# Extract Newick and VCF from public-only tree -time $matUtils extract -i public-$today.all.masked.pb \ - -t public-$today.all.nwk \ - -v public-$today.all.masked.vcf -time gzip -f public-$today.all.masked.vcf -zcat gisaidAndPublic.$today.metadata.tsv.gz \ -| grep -v EPI_ISL_ \ -| gzip -c \ - > public-$today.metadata.tsv.gz # Add nextclade annotations to public protobuf +if [ -s cladeToName ]; then + # Use combined tree's clade assignments to annotate clades on public tree + grep -v EPI_ISL cladeToName > cladeToPublicName + time $matUtils annotate -T 50 \ + -l \ + -i public-$today.all.masked.pb \ + -c cladeToPublicName \ + -f 0.95 \ + -D details.nextclade.public \ + -o public-$today.all.masked.nextclade.pb \ + >& annotate.nextclade.public +else time $matUtils annotate -T 50 \ -l \ -i public-$today.all.masked.pb \ -P ../nextstrain.clade-paths.public.tsv \ -o public-$today.all.masked.nextclade.pb +fi # Add pangolin lineage annotations to public protobuf +if [ -s lineageToName ]; then + grep -v EPI_ISL lineageToName > lineageToPublicName + time $matUtils annotate -T 50 \ + -i public-$today.all.masked.nextclade.pb \ + -c lineageToPublicName \ + -f 0.95 \ + -D details.pango.public \ + -o public-$today.all.masked.nextclade.pangolin.pb \ + >& annotate.pango.public +else time $matUtils annotate -T 50 \ -i public-$today.all.masked.nextclade.pb \ -P ../pango.clade-paths.public.tsv \ -o public-$today.all.masked.nextclade.pangolin.pb +fi + +# Extract Newick and VCF from public-only tree +time $matUtils extract -i public-$today.all.masked.pb \ + -t public-$today.all.nwk \ + -v public-$today.all.masked.vcf +time gzip -f public-$today.all.masked.vcf +zcat gisaidAndPublic.$today.metadata.tsv.gz \ +| grep -v EPI_ISL_ \ +| gzip -c \ + > public-$today.metadata.tsv.gz rm public-$today.all.masked.pb ln -f public-$today.all.masked.nextclade.pangolin.pb public-$today.all.masked.pb cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.txt $matUtils extract -i public-$today.all.masked.pb -u samples.public.$today sampleCountComma=$(echo $(wc -l < samples.public.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.description.txt # Link to public trees download directory hierarchy archiveRoot=/hive/users/angie/publicTrees read y m d < <(echo $today | sed -re 's/-/ /g') archive=$archiveRoot/$y/$m/$d mkdir -p $archive gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz -ln `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ +ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz -ln `pwd`/public-$today.metadata.tsv.gz $archive/ +ln -f `pwd`/public-$today.metadata.tsv.gz $archive/ gzip -c public-$today.all.masked.nextclade.pangolin.pb \ > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz +if [ -s lineageToPublicName ]; then + gzip -c lineageToPublicName > $archive/lineageToPublicName.tsv.gz +else gzip -c ../nextstrain.clade-paths.public.tsv > $archive/nextstrain.clade-paths.public.tsv.gz +fi +if [ -s cladeToPublicName ]; then + gzip -c cladeToPublicName > $archive/cladeToPublicName.tsv.gz +else gzip -c ../pango.clade-paths.public.tsv > $archive/pango.clade-paths.public.tsv.gz -ln `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt +fi +ln -f `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt # Update 'latest' in $archiveRoot ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt # Update hgdownload-test link for archive mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do