2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup.
Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/hg/hgPhyloPlace/phyloPlace.h src/hg/hgPhyloPlace/phyloPlace.h
index fabc6b0..96f8552 100644
--- src/hg/hgPhyloPlace/phyloPlace.h
+++ src/hg/hgPhyloPlace/phyloPlace.h
@@ -117,52 +117,60 @@
     struct phyloTree *subtree;            // Parsed subtree (#*** with annotated muts?  shoudl we?_
     struct hash *subtreeIdToIx;           // Map of subtree nodes to VCF output sample order
     struct slName *subtreeUserSampleIds;  // List of user-uploaded samples in this subtree
     struct slName *subtreeNameList;       // List of leaf names with nicer names for cond. nodes
     };
 
 struct usherResults
 /* Tree+samples download file, sample placements, and subtrees parsed from usher output. */
     {
     struct tempName *bigTreePlusTn;      // Newick file: original tree plus user's samples
     struct hash *samplePlacements;       // Info about each sample's placement in the tree
     struct subtreeInfo *singleSubtreeInfo;  // Comprehensive subtree with all uploaded samples
     struct subtreeInfo *subtreeInfoList; // For each subtree: tree, file, node info etc.
     };
 
-struct sampleMetadata
-/* Information about a virus sample. */
+struct sampleMetadataStore
+/* Storage for sample metadata: hash of arrays of strings, one string per named column. */
     {
+    struct mmHash *mmh;    // Either NULL (if hash is non-NULL) or a memory-mapped hash.
+    struct hash *hash;     // Either NULL (if mmh is non-NULL) or a regular hash.
     size_t columnCount;    // Number of metadata columns
     char **columnNames;    // Metadata column names (e.g. date, genbank_accession, pangolin_lineage)
-                           // -- shared by all metadata rows, not allocated for each struct
-    char **columnValues;   // Metadata column values -- allocated for each struct
     };
 
 struct geneInfo
 /* Information sufficient to determine whether a genome change causes a coding change. */
     {
     struct geneInfo *next;
     struct psl *psl;        // Alignment of transcript to genome
     struct dnaSeq *txSeq;   // Transcript sequence
     struct genbankCds *cds; // CDS (for those few pathogens that have transcript UTRs)
     int cdsStart;           // genePred cdsStart (genome coord, really cds end if - strand)
     int cdsEnd;             // genePred cdsEnd (genome coord, really cds start if - strand)
     };
 
+struct hashOrMmHash
+/* Wrapper for either a regular hash or a memory-mapped hash.  Not using a union because I need
+ * to know which one I have. */
+    {
+    struct mmHash *mmh;     // Either NULL (if hash is non-NULL) or a memory-mapped hash.
+    struct hash *hash;      // Either NULL (if mmh is non-NULL) or a regular hash.
+    };
+
 struct tempName *vcfFromFasta(struct lineFile *lf, char *org, char *db, struct dnaSeq *refGenome,
-                              struct slName **maskSites, struct hash *treeNames,
+                              struct slName **maskSites, struct hashOrMmHash *treeNames,
                               struct slName **retSampleIds, struct seqInfo **retSeqInfo,
                               struct slPair **retFailedSeqs, struct slPair **retFailedPsls,
                               int *pStartTime);
 /* Read in FASTA from lf and make sure each item has a reasonable size and not too high
  * percentage of N's.  Align to reference, extract SNVs from alignment, and save as VCF
  * with sample genotype columns. */
 
 struct usherResults *runUsher(char *org, char *usherPath, char *usherAssignmentsPath, char *vcfFile,
                               int subtreeSize, struct slName **pUserSampleIds,
                               struct treeChoices *treeChoices, int *pStartTime);
 /* Open a pipe from Yatish Turakhia's usher program, save resulting big trees and
  * subtrees to trash files, return list of slRef to struct tempName for the trash files
  * and parse other results out of stderr output.  The usher-sampled version of usher might
  * modify userSampleIds, adding a prefix if a sample with the same name is already in the tree. */
 
@@ -193,45 +201,46 @@
 
 void serverSetTimeout(char *org, int val);
 /* Send timeout command and value (in seconds) to usher server. */
 
 char *microbeTraceHost();
 /* Return the MicrobeTrace hostname from an hg.conf param, or NULL if missing. Do not free result. */
 
 struct slPair *getAaMutations(struct singleNucChange *sncList, struct singleNucChange *ancestorMuts,
                               struct geneInfo *geneInfoList, struct seqWindow *gSeqWin);
 /* Given lists of SNVs and genes, return a list of pairs of { gene name, AA change list }. */
 
 struct geneInfo *getGeneInfoList(char *bigGenePredFile, struct dnaSeq *refGenome);
 /* If config.ra has a source of gene annotations, then return the gene list. */
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *org, char *db, struct geneInfo *geneInfoList,
-                       struct seqWindow *gSeqWin, struct hash *sampleMetadata,
+                       struct seqWindow *gSeqWin, struct sampleMetadataStore *sampleMetadata,
                        struct hash *sampleUrls, struct hash *samplePlacements,
                        char *jsonFile, char *source);
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 
 struct tempName *writeCustomTracks(char *org, char *ref, char *db,
                                    struct tempName *vcfTn, struct usherResults *ur,
                                    struct slName *sampleIds, char *source, int fontHeight,
                                    struct phyloTree *sampleTree, int *pStartTime);
 /* Write one custom track per subtree, and one custom track with just the user's uploaded samples. */
 
 
-struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId);
-/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
+char **metadataForSample(struct sampleMetadataStore *sampleMetadata, char *sampleId);
+/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession.
+ * Return the sample's array of metadata column values, or NULL if not found. */
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds);
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 
 struct slPair *phyloPlaceOrgList(struct cart *cart);
 /* Each subdirectory of PHYLOPLACE_DATA_DIR that contains an organism.ra file is a collection of
  * reference sequences that uploaded sequences will be matched against using nextclade sort.
  * Some of those references might also be dbs or track hub names (without the hub_number_ prefix).
  * Each subdirectory of PHYLOPLACE_DATA_DIR that contains a config.ra file contains a single
  * reference which might also be a db or track hub name (without the hub_number_ prefix).
  * Return a list of {name, label} pairs, SARS-CoV-2 first, combining the two categories. */
 
 char *phyloPlaceOrgSetting(char *org, char *settingName);
 /* Return cloned setting value if found in hgPhyloPlaceData/<org>/organism.ra or
  * old-style hgPhyloPlaceData/<org>/config.ra, or NULL if not found. */