2d05d30ed4df1612d72ba84c812d004de935b122 angie Fri May 17 16:08:54 2024 -0700 Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries. diff --git src/hg/hgPhyloPlace/treeToAuspiceJson.c src/hg/hgPhyloPlace/treeToAuspiceJson.c index ebe22a0..0fb9e22 100644 --- src/hg/hgPhyloPlace/treeToAuspiceJson.c +++ src/hg/hgPhyloPlace/treeToAuspiceJson.c @@ -350,130 +350,145 @@ * formatted. */ { jsonWriteObjectValueUrl(jw, name, value, NULL); } static void makeLineageUrl(char *lineage, char *lineageUrl, size_t lineageUrlSize) /* If lineage is not "uploaded sample", make an outbreak.info link to it, otherwise just copy * lineage. */ { if (sameString(lineage, "uploaded sample")) safecpy(lineageUrl, lineageUrlSize, lineage); else safef(lineageUrl, lineageUrlSize, OUTBREAK_INFO_URLBASE "%s", lineage); } -static void jsonWriteLeafNodeAttributes(struct jsonWrite *jw, char *name, - struct sampleMetadata *met, boolean isUserSample, - char *source, struct hash *sampleUrls, - struct hash *samplePlacements, boolean isRsv, +struct auspiceJsonInfo +/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the + * recursive function doesn't need a dozen args. */ + { + struct jsonWrite *jw; + struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) + struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change + struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change + struct sampleMetadataStore *sampleMetadata; // Sample metadata for decorating tree + struct hash *sampleUrls; // URLs for samples, if applicable + struct hash *samplePlacements; // Sample placement info e.g. clade/lineage from usher + int nodeNum; // For generating sequential node ID (in absence of name) + char *source; // Source of non-user sequences in tree (GISAID or public) + }; + +static void jsonWriteLeafNodeAttributes(struct auspiceJsonInfo *aji, char *name, + boolean isUserSample, boolean isRsv, int branchAttrCount, char **branchAttrCols, char **branchAttrVals) /* Write elements of node_attrs for a sample which may be preexisting and in our metadata hash, * or may be a new sample from the user. Set rets for color categories so parent branches can * determine their color categories. */ { -char *userOrOld = isUserSample ? "uploaded sample" : source; -jsonWriteObjectValue(jw, "userOrOld", userOrOld); +char *userOrOld = isUserSample ? "uploaded sample" : aji->source; +jsonWriteObjectValue(aji->jw, "userOrOld", userOrOld); int i; for (i = 0; i < branchAttrCount; i++) branchAttrVals[i] = ""; if (branchAttrCount > 0 && sameString(branchAttrCols[0], "userOrOld")) branchAttrVals[0] = userOrOld; +char **met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (met != NULL) { int i; - for (i = 0; i < met->columnCount; i++) + for (i = 0; i < aji->sampleMetadata->columnCount; i++) { - char *colName = met->columnNames[i]; + char *colName = aji->sampleMetadata->columnNames[i]; // Tweak old column name if found if (sameString(colName, "pangolin_lineage")) colName = "pango_lineage"; // Link out to outbreak.info for Pango lineages if (startsWith("pango_lineage", colName)) { - if (isNotEmpty(met->columnValues[i])) + if (isNotEmpty(met[i])) { char lineageUrl[1024]; - makeLineageUrl(met->columnValues[i], lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, colName, met->columnValues[i], lineageUrl); + makeLineageUrl(met[i], lineageUrl, sizeof lineageUrl); + jsonWriteObjectValueUrl(aji->jw, colName, met[i], lineageUrl); } - else if (isNotEmpty(met->columnValues[i])) - jsonWriteObjectValue(jw, colName, met->columnValues[i]); + else + jsonWriteObjectValue(aji->jw, colName, met[i]); } else - jsonWriteObjectValue(jw, colName, met->columnValues[i]); + jsonWriteObjectValue(aji->jw, colName, met[i]); // Some columns get passed upwards for aggregation so we can color internal nodes/branches. int j; for (j = 0; j < branchAttrCount; j++) { if (sameString(colName, branchAttrCols[j])) { - branchAttrVals[j] = met->columnValues[i]; + branchAttrVals[j] = met[i]; break; } } } } else if (isUserSample) { - struct placementInfo *pi = name ? hashFindVal(samplePlacements, name) : NULL; + struct placementInfo *pi = name ? hashFindVal(aji->samplePlacements, name) : NULL; int i; for (i = 0; i < branchAttrCount; i++) { branchAttrVals[i] = "uploaded sample"; // Special cases for using placementInfo of user sample for _usher lineage/clade calls // and outbreak.info link for Pango lineage //#*** TODO: think of a way to make this config-driven boolean wroteLink = FALSE; if (pi) { if (pi->nextClade && (sameString(branchAttrCols[i], "Nextstrain_clade_usher") || sameString(branchAttrCols[i], "goya_usher"))) branchAttrVals[i] = pi->nextClade; else if (pi->pangoLineage) { if (sameString(branchAttrCols[i], "pango_lineage_usher")) { branchAttrVals[i] = pi->pangoLineage; char lineageUrl[1024]; makeLineageUrl(pi->pangoLineage, lineageUrl, sizeof lineageUrl); - jsonWriteObjectValueUrl(jw, branchAttrCols[i], branchAttrVals[i], lineageUrl); + jsonWriteObjectValueUrl(aji->jw, branchAttrCols[i], branchAttrVals[i], + lineageUrl); wroteLink = TRUE; } else if (sameString(branchAttrCols[i], "GCC_usher")) branchAttrVals[i] = pi->pangoLineage; } } if (!wroteLink) - jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]); + jsonWriteObjectValue(aji->jw, branchAttrCols[i], branchAttrVals[i]); } } -char *sampleUrl = (sampleUrls && name) ? hashFindVal(sampleUrls, name) : NULL; +char *sampleUrl = (aji->sampleUrls && name) ? hashFindVal(aji->sampleUrls, name) : NULL; if (isNotEmpty(sampleUrl)) { char *p = strstr(sampleUrl, "subtreeAuspice"); char *subtreeNum = p + strlen("subtreeAuspice"); if (p && isdigit(*subtreeNum)) { int num = atoi(subtreeNum); char subtreeLabel[1024]; safef(subtreeLabel, sizeof subtreeLabel, "view subtree %d", num); - jsonWriteObjectValueUrl(jw, "subtree", subtreeLabel, sampleUrl); + jsonWriteObjectValueUrl(aji->jw, "subtree", subtreeLabel, sampleUrl); } else - jsonWriteObjectValueUrl(jw, "subtree", sampleUrl, sampleUrl); + jsonWriteObjectValueUrl(aji->jw, "subtree", sampleUrl, sampleUrl); } } static void jsonWriteBranchNodeAttributes(struct jsonWrite *jw, boolean isRsv, int branchAttrCount, char **branchAttrCols, char **branchAttrVals) /* Write elements of node_attrs for a branch. */ { int i; for (i = 0; i < branchAttrCount; i++) { if (isNotEmpty(branchAttrVals[i])) jsonWriteObjectValue(jw, branchAttrCols[i], branchAttrVals[i]); } } @@ -677,45 +692,30 @@ struct slName *aaMut; for (aaMut = geneAaMut->val; aaMut != NULL; aaMut = aaMut->next) jsonWriteString(jw, NULL, aaMut->name); jsonWriteListEnd(jw); } jsonWriteListStart(jw, "nuc"); struct singleNucChange *snc; for (snc = sncList; snc != NULL; snc = snc->next) jsonWriteStringf(jw, NULL, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); jsonWriteListEnd(jw); jsonWriteObjectEnd(jw); // mutations jsonWriteObjectEnd(jw); // branch_attrs } } -struct auspiceJsonInfo -/* Collection of a bunch of things used when writing out auspice JSON for a subtree, so the - * recursive function doesn't need a dozen args. */ - { - struct jsonWrite *jw; - struct slName *subtreeUserSampleIds; // Subtree node names for user samples (not from big tree) - struct geneInfo *geneInfoList; // Transcript seq & alignment for predicting AA change - struct seqWindow *gSeqWin; // Reference genome seq for predicting AA change - struct hash *sampleMetadata; // Sample metadata for decorating tree - struct hash *sampleUrls; // URLs for samples, if applicable - struct hash *samplePlacements; // Sample placement info e.g. clade/lineage from usher - int nodeNum; // For generating sequential node ID (in absence of name) - char *source; // Source of non-user sequences in tree (GISAID or public) - }; - static int cmpstringp(const void *p1, const void *p2) /* strcmp on pointers to strings, as in 'man qsort' but tolerate NULLs */ { char *s1 = *(char * const *)p1; char *s2 = *(char * const *)p2; if (s1 && s2) return strcmp(s1, s2); else if (s1 && !s2) return 1; else if (s2 && !s1) return -1; return 0; } static char *majorityMaybe(char *array[], int arraySize) @@ -746,31 +746,30 @@ static void rTreeToAuspiceJson(struct phyloTree *node, int depth, struct auspiceJsonInfo *aji, struct singleNucChange *ancestorMuts, boolean isRsv, int branchAttrCount, char **branchAttrCols, char **branchAttrVals) /* Write Augur/Auspice V2 JSON for tree. Enclosing object start and end are written by caller. */ { struct singleNucChange *sncList = node->priv; if (sncList) { depth += slCount(sncList); } boolean isUserSample = FALSE; if (node->ident->name) isUserSample = slNameInList(aji->subtreeUserSampleIds, node->ident->name); char *name = node->ident->name; -struct sampleMetadata *met = name ? metadataForSample(aji->sampleMetadata, name) : NULL; if (name) jsonWriteString(aji->jw, "name", name); else jsonWriteStringf(aji->jw, "name", "NODE%d", aji->nodeNum++); jsonWriteBranchAttrs(aji->jw, node, ancestorMuts, aji->geneInfoList, aji->gSeqWin); if (node->numEdges > 0) { struct singleNucChange *allMuts = ancestorMuts; struct singleNucChange *ancLast = slLastEl(ancestorMuts); if (ancLast != NULL) ancLast->next = sncList; else allMuts = sncList; jsonWriteListStart(aji->jw, "children"); char *kidAttrVals[branchAttrCount][node->numEdges]; @@ -787,32 +786,31 @@ for (j = 0; j < branchAttrCount; j++) kidAttrVals[j][i] = kidNodeAttrVals[j]; } jsonWriteListEnd(aji->jw); if (branchAttrVals) { for (i = 0; i < branchAttrCount; i++) branchAttrVals[i] = majorityMaybe(kidAttrVals[i], node->numEdges); } if (ancLast) ancLast->next = NULL; } jsonWriteObjectStart(aji->jw, "node_attrs"); jsonWriteDouble(aji->jw, "div", depth); if (node->numEdges == 0) - jsonWriteLeafNodeAttributes(aji->jw, name, met, isUserSample, aji->source, aji->sampleUrls, - aji->samplePlacements, isRsv, + jsonWriteLeafNodeAttributes(aji, name, isUserSample, isRsv, branchAttrCount, branchAttrCols, branchAttrVals); else if (branchAttrVals) jsonWriteBranchNodeAttributes(aji->jw, isRsv, branchAttrCount, branchAttrCols, branchAttrVals); jsonWriteObjectEnd(aji->jw); } struct phyloTree *phyloTreeNewNode(char *name) /* Alloc & return a new node with no children. */ { struct phyloTree *node; AllocVar(node); AllocVar(node->ident); node->ident->name = cloneString(name); return node; } @@ -887,31 +885,31 @@ { attrList = slNameListFromComma(branchAttrSetting); branchAttrCount += slCount(attrList); } char **branchAttrCols = NULL; AllocArray(branchAttrCols, branchAttrCount); branchAttrCols[0] = cloneString("userOrOld"); int i; for (i = 1, attr = attrList; i < branchAttrCount && attr != NULL; i++, attr = attr->next) branchAttrCols[i] = cloneString(trimSpaces(attr->name)); *retBranchAttrCols = branchAttrCols; return branchAttrCount; } void treeToAuspiceJson(struct subtreeInfo *sti, char *org, char *db, struct geneInfo *geneInfoList, - struct seqWindow *gSeqWin, struct hash *sampleMetadata, + struct seqWindow *gSeqWin, struct sampleMetadataStore *sampleMetadata, struct hash *sampleUrls, struct hash *samplePlacements, char *jsonFile, char *source) /* Write JSON for tree in Nextstrain's Augur/Auspice V2 JSON format * (https://github.com/nextstrain/augur/blob/master/augur/data/schema-export-v2.json). */ { struct phyloTree *tree = sti->subtree; FILE *outF = mustOpen(jsonFile, "w"); struct jsonWrite *jw = jsonWriteNew(); jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "version", "v2"); boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) || startsWith("RGCC", db)); boolean isFlu = (stringIn("GCF_000865085", db) || stringIn("GCF_001343785", db)); writeAuspiceMeta(jw, sti->subtreeUserSampleIds, source, org, db, geneInfoList, gSeqWin->end, isRsv, isFlu);