df04c1f66cdd6895471998c6d9c70b1a413f7d4e angie Sat Mar 12 20:33:25 2022 -0800 Use new --include-nt option when generating taxonium protobufs. Also watch out for EPI_ISL as part of GenBank name. diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh index 11bca43..5396ad4 100755 --- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh @@ -15,31 +15,31 @@ fi today=$1 ottoDir=/hive/data/outside/otto/sarscov2phylo cncbDir=$ottoDir/cncb.latest scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh usherDir=~angie/github/usher matUtils=$usherDir/build/matUtils cd $ottoDir/$today # Extract public samples from tree -grep -v EPI_ISL_ samples.$today > newPublicNames +grep -v \|EPI_ISL_ samples.$today > newPublicNames # Dunno why, but when I tried using -s together with the filtering params, it ran for 3 hours # and I killed it -- stuck in a loop? Run two commands: $matUtils extract -i gisaidAndPublic.$today.masked.pb \ -s newPublicNames \ -o public-$today.all.masked.preTrim.pb $matUtils extract -i public-$today.all.masked.preTrim.pb \ --max-parsimony 20 \ --max-branch-length 45 \ --max-path-length 150 \ -O -o public-$today.all.masked.pb # Add nextclade annotations to protobuf (completely specified by nextstrain.clade-mutations.tsv) grep -v EPI_ISL cladeToName > cladeToPublicName time $matUtils annotate -T 50 \ -l \ @@ -81,30 +81,31 @@ sampleCountComma=$(echo $(wc -l < samples.public.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.description.txt # Make Taxodium-formatted protobuf for display zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz \ | grep -v '"ORF1a"' > ncbiGenes.gtf zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa zcat public-$today.metadata.tsv.gz > metadata.tmp.tsv time $matUtils extract -i public-$today.all.masked.pb \ -f wuhCor1.fa \ -g ncbiGenes.gtf \ -M metadata.tmp.tsv \ --extra-fields pango_lineage_usher \ + --include-nt \ --write-taxodium public-$today.all.masked.taxodium.pb rm metadata.tmp.tsv wuhCor1.fa gzip -f public-$today.all.masked.taxodium.pb # Link to public trees download directory hierarchy archiveRoot=/hive/users/angie/publicTrees read y m d < <(echo $today | sed -re 's/-/ /g') archive=$archiveRoot/$y/$m/$d mkdir -p $archive gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz ln -f `pwd`/public-$today.metadata.tsv.gz $archive/ gzip -c public-$today.all.masked.nextclade.pangolin.pb \ > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz