0598b71e08f7ce3db112b5a5fa4d5194a8435f5e angie Fri Dec 2 11:05:57 2022 -0800 Add more info to taxonium file, dump out list of samples in public tree. diff --git src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh index 4c0becb..47bac66 100755 --- src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh +++ src/hg/utils/otto/sarscov2phylo/extractPublicTree.sh @@ -104,30 +104,33 @@ time $matUtils extract -i public-$today.all.masked.pb \ -f wuhCor1.fa \ -g ncbiGenes.gtf \ -M metadata.tmp.tsv \ --extra-fields pango_lineage_usher \ --include-nt \ --write-taxodium public-$today.all.masked.taxodium.pb rm metadata.tmp.tsv wuhCor1.fa gzip -f public-$today.all.masked.taxodium.pb # Make Taxonium V2 .jsonl.gz protobuf for display usher_to_taxonium --input public-$today.all.masked.pb \ --metadata public-$today.metadata.tsv.gz \ --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \ --columns genbank_accession,country,date,pangolin_lineage,pango_lineage_usher \ + --clade_types=nextstrain,pango \ + --name_internal_nodes \ + --title "$today tree with sequences from GISAID, INSDC, COG-UK and CNCB" \ --output public-$today.all.masked.taxonium.jsonl.gz # Link to public trees download directory hierarchy archiveRoot=/hive/users/angie/publicTrees read y m d < <(echo $today | sed -re 's/-/ /g') archive=$archiveRoot/$y/$m/$d mkdir -p $archive gzip -c public-$today.all.nwk > $archive/public-$today.all.nwk.gz ln -f `pwd`/public-$today.all.masked.{pb,vcf.gz} $archive/ gzip -c public-$today.all.masked.pb > $archive/public-$today.all.masked.pb.gz ln -f `pwd`/public-$today.metadata.tsv.gz $archive/ gzip -c public-$today.all.masked.nextclade.pangolin.pb \ > $archive/public-$today.all.masked.nextclade.pangolin.pb.gz gzip -c lineageToPublicName > $archive/lineageToPublicName.tsv.gz gzip -c cladeToPublicName > $archive/cladeToPublicName.tsv.gz @@ -140,34 +143,36 @@ ln -f $archive/public-$today.all.masked.pb $archiveRoot/public-latest.all.masked.pb ln -f $archive/public-$today.all.masked.pb.gz $archiveRoot/public-latest.all.masked.pb.gz ln -f $archive/public-$today.all.masked.vcf.gz $archiveRoot/public-latest.all.masked.vcf.gz ln -f $archive/public-$today.metadata.tsv.gz $archiveRoot/public-latest.metadata.tsv.gz ln -f $archive/public-$today.version.txt $archiveRoot/public-latest.version.txt ln -f $archive/public-$today.all.masked.taxodium.pb.gz \ $archiveRoot/public-latest.all.masked.taxodium.pb.gz ln -f $archive/public-$today.all.masked.taxonium.jsonl.gz \ $archiveRoot/public-latest.all.masked.taxonium.jsonl.gz # Update hgdownload-test link for archive mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m ln -sf $archive /usr/local/apache/htdocs-hgdownload/goldenPath/wuhCor1/UShER_SARS-CoV-2/$y/$m # Update links to latest public protobuf and metadata in hgwdev cgi-bin directories +pigz -p 8 -c samples.public.$today > samples.public.$today.gz for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/public-$today.all.masked.pb $dir/public-latest.all.masked.pb ln -sf `pwd`/public-$today.metadata.tsv.gz $dir/public-latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.description.txt $dir/public-latest.version.txt + ln -sf `pwd`/samples.public.$today.gz $dir/public-latest.names.gz done # Update MSA and make tree version with only MSA sequences awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' samples.public.$today \ | sort > idToName.public time cat <(faSomeRecords <(xzcat $ottoDir/$prevDate/public-$prevDate.all.aligned.fa.xz) \ <(cut -f 1 idToName.public) stdout) \ <(faSomeRecords new.aligned.fa <(cut -f 1 idToName.public) stdout) \ | xz -T 30 \ > public-$today.all.aligned.fa.xz time faRenameRecords <(xzcat public-$today.all.aligned.fa.xz) idToName.public stdout \ | faUniqify stdin stdout \ | xz -T 30 \ > public-$today.all.msa.fa.xz fastaNames public-$today.all.msa.fa.xz | sort > msaFaNames