2d05d30ed4df1612d72ba84c812d004de935b122
angie
  Fri May 17 16:08:54 2024 -0700
Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup.
Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries.

diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c
index ebe22a0..0fb9e22 100644
--- src/hg/hgPhyloPlace/treeToAuspiceJson.c
+++ src/hg/hgPhyloPlace/treeToAuspiceJson.c
@@ -350,130 +350,145 @@
  * formatted. */
 {
 jsonWriteObjectValueUrl(jw, name, value, NULL);
 }
 
 static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize)
 /* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy
  * lineage. */
 {
 if (sameString(lineage, "uploaded sample"))
     safecpy(lineageUrl, lineageUrlSize, lineage);
 else
     safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage);
 }
 
-static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name,
-                                        struct sampleMetadata *met, boolean isUserSample,
-                                        char *source, struct hash *sampleUrls,
-                                        struct hash *samplePlacements, boolean isRsv,
+struct auspiceJsonInfo
+/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the
+ * leaf-attribute writer and the recursive tree writer don't each need a dozen args. */
+    {
+    struct jsonWrite *jw;                 // JSON writer that node/tree output is written to
+    struct slName *subtreeUserSampleIds;  // Subtree node names for user samples (not from big tree)
+    struct geneInfo *geneInfoList;        // Transcript seq & alignment for predicting AA change
+    struct seqWindow *gSeqWin;            // Reference genome seq for predicting AA change
+    struct sampleMetadataStore *sampleMetadata; // Sample metadata for decorating tree, by name
+    struct hash *sampleUrls;              // URLs for samples keyed by name; may be NULL
+    struct hash *samplePlacements;        // placementInfo by name, e.g. clade/lineage from usher
+    int nodeNum;                          // Next sequential ID for "NODE%d" (in absence of name)
+    char *source;                         // Source of non-user sequences in tree (GISAID or public)
+    };
+
+static void jsonWriteLeafNodeAttributes(struct auspiceJsonInfo *aji, char *name,
+                                        boolean isUserSample, boolean isRsv,
                                         int branchAttrCount, char **branchAttrCols,
                                         char **branchAttrVals)
 /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash,
  * or may be a new sample from the user.  Set rets for color categories so parent branches can
  * determine their color categories. */
 {
-char *userOrOld = isUserSample ? "uploaded sample" : source;
-jsonWriteObjectValue(jw, "userOrOld", userOrOld);
+char *userOrOld = isUserSample ? "uploaded sample" : aji->source;
+jsonWriteObjectValue(aji->jw, "userOrOld", userOrOld);
 int i;
 for (i = 0;  i < branchAttrCount;  i++)
     branchAttrVals[i] = "";
 if (branchAttrCount > 0 && sameString(branchAttrCols[0], "userOrOld"))
     branchAttrVals[0] = userOrOld;
+char **met = name ? metadataForSample(aji->sampleMetadata, name) : NULL;
 if (met != NULL)
     {
     int i;
-    for (i = 0;  i < met->columnCount;  i++)
+    for (i = 0;  i < aji->sampleMetadata->columnCount;  i++)
         {
-        char *colName = met->columnNames[i];
+        char *colName = aji->sampleMetadata->columnNames[i];
         // Tweak old column name if found
         if (sameString(colName, "pangolin_lineage"))
             colName = "pango_lineage";
         // Link out to outbreak.info for Pango lineages
         if (startsWith("pango_lineage", colName))
             {
-            if (isNotEmpty(met->columnValues[i]))
+            if (isNotEmpty(met[i]))
                 {
                 char lineageUrl[1024];
-                makeLineageUrl(met->columnValues[i], lineageUrl, sizeof lineageUrl);
-                jsonWriteObjectValueUrl(jw, colName, met->columnValues[i], lineageUrl);
+                makeLineageUrl(met[i], lineageUrl, sizeof lineageUrl);
+                jsonWriteObjectValueUrl(aji->jw, colName, met[i], lineageUrl);
                 }
-            else if (isNotEmpty(met->columnValues[i]))
-                jsonWriteObjectValue(jw, colName, met->columnValues[i]);
+            else
+                jsonWriteObjectValue(aji->jw, colName, met[i]);
             }
         else
-            jsonWriteObjectValue(jw, colName, met->columnValues[i]);
+            jsonWriteObjectValue(aji->jw, colName, met[i]);
         // Some columns get passed upwards for aggregation so we can color internal nodes/branches.
         int j;
         for (j = 0;  j < branchAttrCount;  j++)
             {
             if (sameString(colName, branchAttrCols[j]))
                 {
-                branchAttrVals[j] = met->columnValues[i];
+                branchAttrVals[j] = met[i];
                 break;
                 }
             }
         }
     }
 else if (isUserSample)
     {
-    struct placementInfo *pi = name ? hashFindVal(samplePlacements, name) : NULL;
+    struct placementInfo *pi = name ? hashFindVal(aji->samplePlacements, name) : NULL;
     int i;
     for (i = 0;  i < branchAttrCount;  i++)
         {
         branchAttrVals[i] = "uploaded sample";
         // Special cases for using placementInfo of user sample for _usher lineage/clade calls
         // and outbreak.info link for Pango lineage
         //#*** TODO: think of a way to make this config-driven
         boolean wroteLink = FALSE;
         if (pi)
             {
             if (pi->nextClade && (sameString(branchAttrCols[i], "Nextstrain_clade_usher") ||
                                   sameString(branchAttrCols[i], "goya_usher")))
                 branchAttrVals[i] = pi->nextClade;
             else if (pi->pangoLineage)
                 {
                 if (sameString(branchAttrCols[i], "pango_lineage_usher"))
                     {
                     branchAttrVals[i] = pi->pangoLineage;
                     char lineageUrl[1024];
                     makeLineageUrl(pi->pangoLineage, lineageUrl, sizeof lineageUrl);
-                    jsonWriteObjectValueUrl(jw, branchAttrCols[i], branchAttrVals[i], lineageUrl);
+                    jsonWriteObjectValueUrl(aji->jw, branchAttrCols[i], branchAttrVals[i],
+                                            lineageUrl);
                     wroteLink = TRUE;
                     }
                 else if (sameString(branchAttrCols[i], "GCC_usher"))
                     branchAttrVals[i] = pi->pangoLineage;
                 }
             }
         if (!wroteLink)
-            jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]);
+            jsonWriteObjectValue(aji->jw, branchAttrCols[i], branchAttrVals[i]);
         }
     }
-char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL;
+char *sampleUrl = (aji->sampleUrls && name) ? hashFindVal(aji->sampleUrls, name) : NULL;
 if (isNotEmpty(sampleUrl))
     {
     char *p = strstr(sampleUrl, "subtreeAuspice");
     char *subtreeNum = p + strlen("subtreeAuspice");
     if (p && isdigit(*subtreeNum))
         {
         int num = atoi(subtreeNum);
         char subtreeLabel[1024];
         safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num);
-        jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl);
+        jsonWriteObjectValueUrl(aji->jw, "subtree", subtreeLabel, sampleUrl);
         }
     else
-        jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl);
+        jsonWriteObjectValueUrl(aji->jw, "subtree", sampleUrl, sampleUrl);
     }
 }
 
 static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv,
                                           int branchAttrCount, char **branchAttrCols,
                                           char **branchAttrVals)
 /* Write elements of node_attrs for a branch. */
 {
 int i;
 for (i = 0;  i < branchAttrCount;  i++)
     {
     if (isNotEmpty(branchAttrVals[i]))
         jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]);
     }
 }
@@ -677,45 +692,30 @@
         struct slName *aaMut;
         for (aaMut = geneAaMut->val;  aaMut != NULL;  aaMut = aaMut->next)
             jsonWriteString(jw, NULL, aaMut->name);
         jsonWriteListEnd(jw);
         }
     jsonWriteListStart(jw, "nuc");
     struct singleNucChange *snc;
     for (snc = sncList;  snc != NULL;  snc = snc->next)
         jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
     jsonWriteListEnd(jw);
     jsonWriteObjectEnd(jw);  // mutations
     jsonWriteObjectEnd(jw); // branch_attrs
     }
 }
 
-struct auspiceJsonInfo
-/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the
- * recursive function doesn't need a dozen args. */
-    {
-    struct jsonWrite *jw;
-    struct slName *subtreeUserSampleIds;  // Subtree node names for user samples (not from big tree)
-    struct geneInfo *geneInfoList;        // Transcript seq & alignment for predicting AA change
-    struct seqWindow *gSeqWin;            // Reference genome seq for predicting AA change
-    struct hash *sampleMetadata;          // Sample metadata for decorating tree
-    struct hash *sampleUrls;              // URLs for samples, if applicable
-    struct hash *samplePlacements;        // Sample placement info e.g. clade/lineage from usher
-    int nodeNum;                          // For generating sequential node ID (in absence of name)
-    char *source;                         // Source of non-user sequences in tree (GISAID or public)
-    };
-
 static int cmpstringp(const void *p1, const void *p2)
 /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */
 {
 char *s1 = *(char * const *)p1;
 char *s2 = *(char * const *)p2;
 if (s1 && s2)
     return strcmp(s1, s2);
 else if (s1 && !s2)
     return 1;
 else if (s2 && !s1)
     return -1;
 return 0;
 }
 
 static char *majorityMaybe(char *array[], int arraySize)
@@ -746,31 +746,30 @@
 
 static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji,
                                struct singleNucChange *ancestorMuts, boolean isRsv,
                                int branchAttrCount, char **branchAttrCols, char **branchAttrVals)
 /* Write Augur/Auspice V2 JSON for tree.  Enclosing object start and end are written by caller. */
 {
 struct singleNucChange *sncList = node->priv;
 if (sncList)
     {
     depth += slCount(sncList);
     }
 boolean isUserSample = FALSE;
 if (node->ident->name)
     isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name);
 char *name = node->ident->name;
-struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL;
 if (name)
     jsonWriteString(aji->jw, "name", name);
 else
     jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++);
 jsonWriteBranchAttrs(aji->jw, node, ancestorMuts, aji->geneInfoList, aji->gSeqWin);
 if (node->numEdges > 0)
     {
     struct singleNucChange *allMuts = ancestorMuts;
     struct singleNucChange *ancLast = slLastEl(ancestorMuts);
     if (ancLast != NULL)
         ancLast->next = sncList;
     else
         allMuts = sncList;
     jsonWriteListStart(aji->jw, "children");
     char *kidAttrVals[branchAttrCount][node->numEdges];
@@ -787,32 +786,31 @@
         for (j = 0;  j < branchAttrCount;  j++)
             kidAttrVals[j][i] = kidNodeAttrVals[j];
         }
     jsonWriteListEnd(aji->jw);
     if (branchAttrVals)
         {
         for (i = 0;  i < branchAttrCount;  i++)
             branchAttrVals[i] = majorityMaybe(kidAttrVals[i], node->numEdges);
         }
     if (ancLast)
         ancLast->next = NULL;
     }
 jsonWriteObjectStart(aji->jw, "node_attrs");
 jsonWriteDouble(aji->jw, "div", depth);
 if (node->numEdges == 0)
-    jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls,
-                                aji->samplePlacements, isRsv,
+    jsonWriteLeafNodeAttributes(aji, name, isUserSample, isRsv,
                                 branchAttrCount, branchAttrCols, branchAttrVals);
 else if (branchAttrVals)
     jsonWriteBranchNodeAttributes(aji->jw, isRsv, branchAttrCount, branchAttrCols, branchAttrVals);
 jsonWriteObjectEnd(aji->jw);
 }
 
 struct phyloTree *phyloTreeNewNode(char *name)
 /* Alloc & return a new node with no children. */
 {
 struct phyloTree *node;
 AllocVar(node);
 AllocVar(node->ident);
 node->ident->name = cloneString(name);
 return node;
 }
@@ -887,31 +885,31 @@
     {
     attrList = slNameListFromComma(branchAttrSetting);
     branchAttrCount += slCount(attrList);
     }
 char **branchAttrCols = NULL;
 AllocArray(branchAttrCols, branchAttrCount);
 branchAttrCols[0] = cloneString("userOrOld");
 int i;
 for (i = 1, attr = attrList;  i < branchAttrCount && attr != NULL;  i++, attr = attr->next)
     branchAttrCols[i] = cloneString(trimSpaces(attr->name));
 *retBranchAttrCols = branchAttrCols;
 return branchAttrCount;
 }
 
 void treeToAuspiceJson(struct subtreeInfo *sti, char *org, char *db, struct geneInfo *geneInfoList,
-                       struct seqWindow *gSeqWin, struct hash *sampleMetadata,
+                       struct seqWindow *gSeqWin, struct sampleMetadataStore *sampleMetadata,
                        struct hash *sampleUrls, struct hash *samplePlacements,
                        char *jsonFile, char *source)
 /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format
  * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */
 {
 struct phyloTree *tree = sti->subtree;
 FILE *outF = mustOpen(jsonFile, "w");
 struct jsonWrite *jw = jsonWriteNew();
 jsonWriteObjectStart(jw, NULL);
 jsonWriteString(jw, "version", "v2");
 boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) ||
                  startsWith("RGCC", db));
 boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db));
 writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, org, db, geneInfoList,
                  gSeqWin->end, isRsv, isFlu);