19b943548e4c4987365430f090582cc0061156b9 angie Tue May 21 13:31:43 2024 -0700 Generate memory-mapped hash tables for metadata and name lookup. diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh index 8932313..def7a41 100755 --- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh +++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh @@ -155,30 +155,47 @@ | sort > tmp1 tail -n+2 sample-clades \ | sort > tmp2 paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | cut -f 1-9 | head -1) \ <(echo -e "Nextstrain_clade_usher\tpango_lineage_usher") \ > gisaidAndPublic.$today.metadata.tsv join -t$'\t' tmp1 tmp2 \ >> gisaidAndPublic.$today.metadata.tsv pigz -p 8 -f gisaidAndPublic.$today.metadata.tsv rm tmp1 tmp2 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have # public names & IDs, we can match them. cut -f 1,3 $epiToPublic > epiToPublic.latest +# Memory-mapped hash tables for metadata and name lookup +tabToMmHash gisaidAndPublic.$today.metadata.tsv.gz gisaidAndPublic.$today.metadata.mmh +ln -sf $(pwd)/gisaidAndPublic.$today.metadata.mmh \ + /gbdb/wuhCor1/hgPhyloPlaceData/public.plusGisaid.latest.metadata.mmh +awk -F\| '{ print $0 "\t" $0; print $1 "\t" $0; if ($3 != "") { print $2 "\t" $0; } }' \ + samples.$today \ +| tawk '$1 != "RNA" && $1 !~ /\/RNA\// && $1 !~/^Germany\/Molecular_surveillance_of_SARS/ && \ + $1 !~ /^Iceland\/SARS-CoV-2_Iceland/' \ + > nameLookup.tab +cut -f 1,3 $epiToPublic \ +| subColumn -skipMiss 2 stdin nameLookup.tab tmp.tab +cat tmp.tab >> nameLookup.tab +rm tmp.tab +tabToMmHash nameLookup.tab samples.$today.mmh +rm nameLookup.tab +ln -sf $(pwd)/samples.$today.mmh /gbdb/wuhCor1/hgPhyloPlaceData/public.plusGisaid.names.mmh + # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories pigz -p 8 -c samples.$today > samples.$today.gz for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \ $dir/public.plusGisaid.latest.metadata.tsv.gz ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt ln -sf `pwd`/epiToPublic.latest $dir/ ln -sf `pwd`/samples.$today.gz $dir/public.plusGisaid.names.gz done # Make Taxonium v2 protobuf for display usher_to_taxonium --input gisaidAndPublic.$today.masked.pb \ --metadata gisaidAndPublic.$today.metadata.tsv.gz \ --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \