23036e8b97ae73f56eeee54d68a7668d4881e77b angie Mon Jan 9 18:00:29 2023 -0800 Don't make old taxodium/v1 protobuf anymore, everyone has moved on to taxonium/v2. Make names file for big tree so hgPhyloPlace doesn't have to read in protobuf to get names for matching uploaded names/IDs. diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh index 47bac66..0c09743 100755 --- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh @@ -84,81 +84,63 @@ tail -n+2 clade-paths.public \ | grep -E '^[A-Za-z]' \ | cut -f 1,3 > lineageToPath.public cncbDate=$(ls -l $cncbDir | sed -re 's/.*cncb\.([0-9]{4}-[0-9][0-9]-[0-9][0-9]).*/\1/') echo "sarscov2phylo release 13-11-20; NCBI and COG-UK sequences downloaded $today; CNCB sequences downloaded $cncbDate" \ > version.txt $matUtils extract -i public-$today.all.masked.pb -u samples.public.$today sampleCountComma=$(echo $(wc -l < samples.public.$today) \ | sed -re 's/([0-9]+)([0-9]{3})$/\1,\2/; s/([0-9]+)([0-9]{3},[0-9]{3})$/\1,\2/;') echo "$sampleCountComma genomes from GenBank, COG-UK and CNCB ($today); sarscov2phylo 13-11-20 tree with newer sequences added by UShER" \ > hgPhyloPlace.description.txt -# Make Taxonium V1-formatted protobuf for display -zcat /hive/data/genomes/wuhCor1/goldenPath/bigZips/genes/ncbiGenes.gtf.gz \ -| grep -v '"ORF1a"' > ncbiGenes.gtf -zcat /hive/data/genomes/wuhCor1/wuhCor1.fa.gz > wuhCor1.fa -zcat public-$today.metadata.tsv.gz > metadata.tmp.tsv -time $matUtils extract -i public-$today.all.masked.pb \ - -f wuhCor1.fa \ - -g ncbiGenes.gtf \ - -M metadata.tmp.tsv \ - --extra-fields pango_lineage_usher \ - --include-nt \ - --write-taxodium public-$today.all.masked.taxodium.pb -rm metadata.tmp.tsv wuhCor1.fa -gzip -f public-$today.all.masked.taxodium.pb - # Make Taxonium V2 .jsonl.gz protobuf for display usher_to_taxonium --input public-$today.all.masked.pb \ --metadata public-$today.metadata.tsv.gz \ --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \ --columns genbank_accession,country,date,pangolin_lineage,pango_lineage_usher \ --clade_types=nextstrain,pango \ --name_internal_nodes \ --title "$today tree with sequences from GISAID, INSDC, COG-UK and CNCB" \ --output public-$today.all.masked.taxonium.jsonl.gz # Link to public trees download directory hierarchy archiveRoot=/hive/users/angie/publicTrees read y m d < <(echo $today | sed -re 's/-/ /g') archive=$archiveRoot/$y/$m/$d mkdir -p $archive gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz ln -f `pwd`/public-$today.metadata.tsv.gz $archive/ gzip -c public-$today.all.masked.nextclade.pangolin.pb \ > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz gzip -c lineageToPublicName > $archive/lineageToPublicName.tsv.gz gzip -c cladeToPublicName > $archive/cladeToPublicName.tsv.gz ln -f `pwd`/hgPhyloPlace.description.txt $archive/public-$today.version.txt -ln -f `pwd`/public-$today.all.masked.taxodium.pb.gz $archive/ ln -f `pwd`/public-$today.all.masked.taxonium.jsonl.gz $archive/ # Update 'latest' in $archiveRoot ln -f $archive/public-$today.all.nwk.gz $archiveRoot/public-latest.all.nwk.gz ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt -ln -f $archive/public-$today.all.masked.taxodium.pb.gz \ - $archiveRoot/public-latest.all.masked.taxodium.pb.gz ln -f $archive/public-$today.all.masked.taxonium.jsonl.gz \ $archiveRoot/public-latest.all.masked.taxonium.jsonl.gz # Update hgdownload-test link for archive mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories pigz -p 8 -c samples.public.$today > samples.public.$today.gz for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt ln -sf `pwd`/samples.public.$today.gz $dir/public-latest.names.gz done