19b943548e4c4987365430f090582cc0061156b9
angie
  Tue May 21 13:31:43 2024 -0700
Generate memory-mapped hash tables for metadata and name lookup.

diff --git src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
index 8932313..def7a41 100755
--- src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
+++ src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh
@@ -155,30 +155,47 @@
 | sort > tmp1
 tail -n+2 sample-clades \
 | sort > tmp2
 paste <(zcat gisaidAndPublic.$today.metadata.tsv.gz | cut -f 1-9 | head -1) \
       <(echo -e "Nextstrain_clade_usher\tpango_lineage_usher") \
     > gisaidAndPublic.$today.metadata.tsv
 join -t$'\t' tmp1 tmp2 \
     >> gisaidAndPublic.$today.metadata.tsv
 pigz -p 8 -f gisaidAndPublic.$today.metadata.tsv
 rm tmp1 tmp2
 
 # EPI_ISL_ ID to public sequence name mapping, so if users upload EPI_ISL IDs for which we have
 # public names & IDs, we can match them.
 cut -f 1,3 $epiToPublic > epiToPublic.latest
 
+# Memory-mapped hash tables for metadata and name lookup (lookup key -> full sample name).
+tabToMmHash "gisaidAndPublic.$today.metadata.tsv.gz" "gisaidAndPublic.$today.metadata.mmh"
+ln -sf "$(pwd)/gisaidAndPublic.$today.metadata.mmh" \
+    /gbdb/wuhCor1/hgPhyloPlaceData/public.plusGisaid.latest.metadata.mmh
+awk -F\| '{ print $0 "\t" $0;  print $1 "\t" $0; if ($3 != "") { print $2 "\t" $0; } }' \
+    "samples.$today" \
+| tawk '$1 != "RNA" && $1 !~ /\/RNA\// && $1 !~/^Germany\/Molecular_surveillance_of_SARS/ && \
+        $1 !~ /^Iceland\/SARS-CoV-2_Iceland/' \
+    > nameLookup.tab  # keys: whole name, its 1st |-field, and its 2nd |-field if a 3rd exists
+cut -f 1,3 "$epiToPublic" \
+| subColumn -skipMiss 2 stdin nameLookup.tab tmp.tab  # col 2 -> full name (verify subColumn usage)
+cat tmp.tab >> nameLookup.tab  # via tmp.tab because nameLookup.tab is itself a subColumn argument
+rm tmp.tab
+tabToMmHash nameLookup.tab "samples.$today.mmh"
+rm nameLookup.tab  # intermediate only; the .mmh file is what gets linked below
+ln -sf "$(pwd)/samples.$today.mmh" /gbdb/wuhCor1/hgPhyloPlaceData/public.plusGisaid.names.mmh
+
 # Update links to latest public+GISAID protobuf and metadata in hgwdev cgi-bin directories
 pigz -p 8 -c samples.$today > samples.$today.gz
 for dir in /usr/local/apache/cgi-bin{-angie,-beta,}/hgPhyloPlaceData/wuhCor1; do
     ln -sf `pwd`/gisaidAndPublic.$today.masked.pb $dir/public.plusGisaid.latest.masked.pb
     ln -sf `pwd`/gisaidAndPublic.$today.metadata.tsv.gz \
         $dir/public.plusGisaid.latest.metadata.tsv.gz
     ln -sf `pwd`/hgPhyloPlace.plusGisaid.description.txt $dir/public.plusGisaid.latest.version.txt
     ln -sf `pwd`/epiToPublic.latest $dir/
     ln -sf `pwd`/samples.$today.gz $dir/public.plusGisaid.names.gz
 done
 
 # Make Taxonium v2 protobuf for display
 usher_to_taxonium --input gisaidAndPublic.$today.masked.pb \
     --metadata gisaidAndPublic.$today.metadata.tsv.gz \
     --genbank ~angie/github/taxonium/taxoniumtools/test_data/hu1.gb \