2d05d30ed4df1612d72ba84c812d004de935b122 angie Fri May 17 16:08:54 2024 -0700 Add lib module mmHash (memory-mapped hash), util tabToMmHash, and hgPhyloPlace support for using mmHash files instead of tab-separated files for metadata and name lookup. Using mmHash for name lookup saves about 50-55 seconds for SARS-CoV-2 hgPhyloPlace name/ID queries. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 82fa6f1..b19a387 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -7,30 +7,31 @@ #include "cart.h" #include "cheapcgi.h" #include "errCatch.h" #include "fa.h" #include "genePred.h" #include "grp.h" #include "hCommon.h" #include "hash.h" #include "hgConfig.h" #include "htmshell.h" #include "hubConnect.h" #include "hui.h" #include "iupac.h" #include "jsHelper.h" #include "linefile.h" +#include "mmHash.h" #include "obscure.h" #include "parsimonyProto.h" #include "phyloPlace.h" #include "phyloTree.h" #include "pipeline.h" #include "psl.h" #include "pthreadWrap.h" #include "ra.h" #include "regexHelper.h" #include "trackHub.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; @@ -914,154 +915,201 @@ struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) fprintf(f, " > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) fprintf(f, ", "); fprintf(f, "%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } -static struct hash *getSampleMetadata(char *metadataFile) +static void getSampleMetadataHash(char *metadataFile, struct sampleMetadataStore *sms) /* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC * nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */ { -struct hash *sampleMetadata = NULL; if (isNotEmpty(metadataFile) && fileExists(metadataFile)) { - sampleMetadata = hashNew(0); + struct hash *sampleMetadata = hashNew(0); struct lineFile *lf = lineFileOpen(metadataFile, TRUE); int headerWordCount = 0; char **headerWords = NULL; char *line; // Check for header line if (lineFileNext(lf, &line, NULL)) { if (startsWithWord("strain", line)) { char *headerLine = cloneString(line); - headerWordCount = chopString(headerLine, "\t", NULL, 0); + headerWordCount = chopByChar(headerLine, '\t', NULL, 0); AllocArray(headerWords, headerWordCount); - chopString(headerLine, "\t", headerWords, headerWordCount); + chopByChar(headerLine, '\t', headerWords, headerWordCount); } else errAbort("Missing header line from metadataFile %s", metadataFile); } // Look for genbank_accession in header line so we can index by that as well int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount); while (lineFileNext(lf, &line, NULL)) { - struct sampleMetadata *met; - AllocVar(met); - met->columnCount = headerWordCount; - met->columnNames = headerWords; - AllocArray(met->columnValues, headerWordCount); + char **columnValues = NULL; + AllocArray(columnValues, headerWordCount); char *lineCopy = cloneString(line); - int wordCount = chopByChar(lineCopy, '\t', met->columnValues, headerWordCount); + int wordCount = chopByChar(lineCopy, '\t', columnValues, headerWordCount); lineFileExpectWords(lf, headerWordCount, wordCount); - if (genbankIx >= 0 && !isEmpty(met->columnValues[genbankIx]) && - !sameString("?", met->columnValues[genbankIx])) + if (genbankIx >= 0 && !isEmpty(columnValues[genbankIx]) && + !sameString("?", columnValues[genbankIx])) { - if (strchr(met->columnValues[genbankIx], '.')) + if (strchr(columnValues[genbankIx], '.')) { // Index by versionless accession - char copy[strlen(met->columnValues[genbankIx])+1]; - safecpy(copy, sizeof copy, met->columnValues[genbankIx]); + char copy[strlen(columnValues[genbankIx])+1]; + safecpy(copy, sizeof copy, columnValues[genbankIx]); char *dot = strchr(copy, '.'); *dot = '\0'; - hashAdd(sampleMetadata, copy, met); + hashAdd(sampleMetadata, copy, columnValues); } else - hashAdd(sampleMetadata, met->columnValues[genbankIx], met); + hashAdd(sampleMetadata, columnValues[genbankIx], columnValues); } - hashAdd(sampleMetadata, met->columnValues[0], met); + hashAdd(sampleMetadata, columnValues[0], columnValues); } lineFileClose(&lf); + sms->hash = sampleMetadata; + sms->columnCount = headerWordCount; + sms->columnNames = headerWords; + } +} + +static struct sampleMetadataStore *getSampleMetadata(char *metadataFile) +/* If config.ra defines a metadataFile, load its contents into a hash indexed by name (and INSDC + * nucleotide accession sans dot if present) and return the hash; otherwise return NULL. */ +{ +struct sampleMetadataStore *sampleMetadata = NULL; +if (isNotEmpty(metadataFile) && fileExists(metadataFile)) + { + AllocVar(sampleMetadata); + if (endsWith(metadataFile, ".mmh")) + sampleMetadata->mmh = mmHashFromFile(metadataFile); + else + getSampleMetadataHash(metadataFile, sampleMetadata); } return sampleMetadata; } char *gbIdFromSampleName(char *sampleId) /* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */ { char *gbId = NULL; regmatch_t substrs[2]; // If it's preceded by anything, make sure it's a | so we ignore isolate names that glom on the // reference's GenBank accession in them (e.g. ..._MN908947.3/2022|OQ070230.1). if (regexMatchSubstr(sampleId, "^(.*\\|)?([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs))) { // Make sure there is a word boundary at the end of the match too if (!isalnum(sampleId[substrs[1].rm_eo])) gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so); } return gbId; } -struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId) +static char **metadataForSampleHash(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ { -struct sampleMetadata *met = NULL; -if (sampleMetadata == NULL) - return NULL; -met = hashFindVal(sampleMetadata, sampleId); +char **met = hashFindVal(sampleMetadata, sampleId); if (met) return met; if (!met) { char *gbId = gbIdFromSampleName(sampleId); if (gbId) met = hashFindVal(sampleMetadata, gbId); } if (!met && strchr(sampleId, '|')) { char copy[strlen(sampleId)+1]; safecpy(copy, sizeof copy, sampleId); char *words[4]; - int wordCount = chopString(copy, "|", words, ArraySize(words)); + int wordCount = chopByChar(copy, '|', words, ArraySize(words)); if (isNotEmpty(words[0])) met = hashFindVal(sampleMetadata, words[0]); if (met == NULL && wordCount > 1 && isNotEmpty(words[1])) met = hashFindVal(sampleMetadata, words[1]); } -// If it's one of our collapsed node names, dig out the example name and try that. -if (!met && isdigit(sampleId[0]) && strstr(sampleId, "_from_")) +return met; +} + +static char **metadataForSampleMmHash(struct sampleMetadataStore *sms, char *sampleId) +/* Look up sampleId in mmHash and parse string value into struct sampleMetadata, or return NULL if + * not found. */ +{ +char **met = NULL; +const char *metadataLine = mmHashFindVal(sms->mmh, sampleId); +if (metadataLine) { - char *eg = strstr(sampleId, "_eg_"); - if (eg) - met = hashFindVal(sampleMetadata, eg+strlen("_eg_")); + if (sms->columnNames == NULL) + { + const char *headerLine = mmHashFindVal(sms->mmh, "strain"); + if (headerLine == NULL) + errAbort("metadataForSampleMmHash: can't find column definitions in file " + "(no line starts with 'strain')"); + char *headerLineCopy = cloneString(headerLine); + sms->columnCount = chopByChar(headerLineCopy, '\t', NULL, 0); + AllocArray(sms->columnNames, sms->columnCount); + chopByChar(headerLineCopy, '\t', sms->columnNames, sms->columnCount); + } + char *lineCopy = cloneString(metadataLine); + int metWordCount = chopByChar(lineCopy, '\t', NULL, 0); + if (metWordCount != sms->columnCount) + errAbort("metadataForSampleMmHash: found %lu header columns but %d data columns for '%s'" + " ('%s')", + sms->columnCount, metWordCount, sampleId, metadataLine); + AllocArray(met, sms->columnCount); + chopByChar(lineCopy, '\t', met, sms->columnCount); } return met; } -static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) +char **metadataForSample(struct sampleMetadataStore *sampleMetadata, char *sampleId) +/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ +{ +if (sampleMetadata == NULL) + return NULL; +if (sampleMetadata->hash) + return metadataForSampleHash(sampleMetadata->hash, sampleId); +else + return metadataForSampleMmHash(sampleMetadata, sampleId); +} + + +static char *lineageForSample(struct sampleMetadataStore *sms, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; -struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); +char **met = metadataForSample(sms, sampleId); if (met) { int ix; - if ((ix = stringArrayIx("pangolin_lineage", met->columnNames, met->columnCount)) >= 0 || - (ix = stringArrayIx("pango_lineage", met->columnNames, met->columnCount)) >= 0 || - (ix = stringArrayIx("Nextstrain_lineage", met->columnNames, met->columnCount)) >= 0 || - (ix = stringArrayIx("GCC_nextclade", met->columnNames, met->columnCount)) >= 0) - lineage = met->columnValues[ix]; + if ((ix = stringArrayIx("pangolin_lineage", sms->columnNames, sms->columnCount)) >= 0 || + (ix = stringArrayIx("pango_lineage", sms->columnNames, sms->columnCount)) >= 0 || + (ix = stringArrayIx("Nextstrain_lineage", sms->columnNames, sms->columnCount)) >= 0 || + (ix = stringArrayIx("GCC_nextclade", sms->columnNames, sms->columnCount)) >= 0) + lineage = met[ix]; } return lineage; } static void printLineageLink(char *lineage, char *db) /* Print lineage with link to outbreak.info or pango-designation issue if appropriate. * Caller must handle case of empty/NULL lineage. */ { boolean isSarsCov2 = sameString(db, "wuhCor1"); if (isEmpty(lineage)) errAbort("programmer error:printLineageLink called with empty lineage"); else if (isSarsCov2 && startsWith("proposed", lineage)) printf("<a href='"PANGO_DESIGNATION_ISSUE_URLBASE"%s' target=_blank>%s</a>", lineage+strlen("proposed"), lineage); else if (isSarsCov2 && differentString(lineage, "None") && differentString(lineage, "Unassigned")) @@ -1222,31 +1270,31 @@ struct placementInfo *info = hashFindVal(samplePlacements, row->name); if (info) { if (isNotEmpty(info->nearestNeighbor)) neighbor = info->nearestNeighbor; if (isNotEmpty(info->neighborLineage)) lineage = info->neighborLineage; } printf("%-*s %s %s\n", maxLen, asciiRow, neighbor, lineage); } slNameFreeList(&rowList); dyStringFree(&dy); } static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements, - struct phyloTree *subtree, struct hash *sampleMetadata, + struct phyloTree *subtree, char *source, char *refAcc, char *db) /* Report how each sample fits into the big tree. */ { // Sort sample placements by sampleMuts so we can group identical samples. struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements); slSort(&placementRefs, placementInfoRefCmpSampleMuts); int relatedCount = slCount(placementRefs); int clumpSize = countIdentical(placementRefs); if (clumpSize < relatedCount && relatedCount > 2) { // Not all of the related sequences are identical, so they will be broken down into // separate "clumps". List all related samples first to avoid confusion. puts("<pre>"); asciiTreeWithNeighborInfo(subtree, samplePlacements); puts("</pre>"); @@ -1339,47 +1387,47 @@ int *retIx) /* Find the subtree that contains sample name and set *retIx to its index in the list. * If we can't find it, return NULL and set *retIx to -1. */ { struct subtreeInfo *ti; int ix; for (ti = subtreeInfoList, ix = 0; ti != NULL; ti = ti->next, ix++) if (slNameInListUseCase(ti->subtreeUserSampleIds, name)) break; if (ti == NULL) ix = -1; *retIx = ix; return ti; } -static void findNearestNeighbors(struct usherResults *results, struct hash *sampleMetadata) +static void findNearestNeighbors(struct usherResults *results, struct sampleMetadataStore *sms) /* For each placed sample, find the nearest neighbor in the subtree and its assigned lineage, * and fill in those two fields of placementInfo. */ { struct hashCookie cookie = hashFirst(results->samplePlacements); struct hashEl *hel; while ((hel = hashNext(&cookie)) != NULL) { struct placementInfo *info = hel->val; int ix; struct subtreeInfo *ti = subtreeInfoForSample(results->subtreeInfoList, info->sampleId, &ix); if (!ti) continue; info->nearestNeighbor = findNearestNeighbor(ti->subtree, info->sampleId, results->samplePlacements); if (isNotEmpty(info->nearestNeighbor)) - info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor); + info->neighborLineage = lineageForSample(sms, info->nearestNeighbor); } } static void lookForCladesAndLineages(struct hash *samplePlacements, boolean *retGotClades, boolean *retGotLineages) /* See if UShER has annotated any clades and/or lineages for seqs. */ { boolean gotClades = FALSE, gotLineages = FALSE; struct hashEl *hel; struct hashCookie cookie = hashFirst(samplePlacements); while ((hel = hashNext(&cookie)) != NULL) { struct placementInfo *pi = hel->val; if (pi) { @@ -2098,31 +2146,31 @@ { if (gotClades) printf("<td>n/a</td>"); if (gotLineages) printf("<td>n/a</td>"); printf("<td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td><td>n/a</td>"); } printSubtreeTd(ur->subtreeInfoList, jsonTns, si->seq->name, subtreeSize); puts("</tr>"); } puts("</tbody></table><p></p>"); } } static void summarizeSubtrees(struct slName *sampleIds, struct usherResults *results, - struct hash *sampleMetadata, struct tempName *jsonTns[], + struct sampleMetadataStore *sms, struct tempName *jsonTns[], char *db, int subtreeSize) /* Print a summary table of pasted/uploaded identifiers and subtrees */ { boolean gotClades = FALSE, gotLineages = FALSE; lookForCladesAndLineages(results->samplePlacements, &gotClades, &gotLineages); puts("<table class='seqSummary'><tbody>"); puts("<tr><th>Sequence</th>"); boolean isRsv = (stringIn("GCF_000855545", db) || stringIn("GCF_002815475", db) || startsWith("RGCC", db)); if (gotClades) { if (sameString(db, "wuhCor1")) puts("<th>Nextstrain clade (UShER)" TOOLTIP("The <a href='https://nextstrain.org/blog/2021-01-06-updated-SARS-CoV-2-clade-naming' " "target=_blank>Nextstrain clade</a> " @@ -2183,31 +2231,31 @@ if (pi) { if (gotClades) printf("<td>%s</td>", pi->nextClade ? pi->nextClade : "n/a"); if (gotLineages) printLineageTd(pi->pangoLineage, "n/a", db); } else { if (gotClades) printf("<td>n/a</td>"); if (gotLineages) printf("<td>n/a</td>"); } // pangolin-assigned lineage - char *lineage = lineageForSample(sampleMetadata, si->name); + char *lineage = lineageForSample(sms, si->name); printLineageTd(lineage, "n/a", db); // Maybe also #mutations with mouseover to show mutation path? printSubtreeTd(results->subtreeInfoList, jsonTns, si->name, subtreeSize); } puts("</tbody></table><p></p>"); } static struct singleNucChange *sncListFromSampleMutsAndImputed(struct slName *sampleMuts, struct baseVal *imputedBases, struct seqWindow *gSeqWin) /* Convert a list of "<ref><pos><alt>" names to struct singleNucChange list. * However, if <alt> is ambiguous, skip it because variantProjector doesn't like it. * Add imputed base predictions. */ { struct singleNucChange *sncList = NULL; @@ -2834,31 +2882,31 @@ static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents) /* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName. * If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */ { char *fullNameHashStored = hashStoreName(nameHash, fullName); // Now that we have hash storage for fullName, make it point to itself. struct hashEl *hel = hashLookup(nameHash, fullName); if (hel == NULL) errAbort("Can't look up '%s' right after adding it", fullName); hel->val = fullNameHashStored; if (addComponents) { char *words[4]; char copy[strlen(fullName)+1]; safecpy(copy, sizeof copy, fullName); - int wordCount = chopString(copy, "|", words, ArraySize(words)); + int wordCount = chopByChar(copy, '|', words, ArraySize(words)); if (wordCount == 3) { // name|ID|date hashAdd(nameHash, words[0], fullNameHashStored); maybeAddIsolate(nameHash, words[0], fullNameHashStored); addModVersion(nameHash, words[1], fullNameHashStored); } else if (wordCount == 2) { // ID|date addModVersion(nameHash, words[0], fullNameHashStored); // ID might be a COG-UK country/isolate/year maybeAddIsolate(nameHash, words[0], fullNameHashStored); } } @@ -2904,208 +2952,239 @@ int wordCount = chopTabs(line, words); lineFileExpectWords(lf, 2, wordCount); char *fullName = hashFindVal(nameHash, words[1]); if (fullName) hashAdd(nameHash, words[0], fullName); else { if (missExample == NULL) missExample = cloneString(words[1]); } } lineFileClose(&lf); } } -static struct hash *getTreeNames(char *nameFile, char *protobufFile, +static struct hash *getTreeNamesHash(char *nameFile, char *protobufFile, struct mutationAnnotatedTree **pBigTree, boolean addComponents, int *pStartTime) /* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise * falling back on the bigTree itself, loading it if necessary. If addComponents, also map * components of those names to the full names in case the user gives us partial names/IDs. */ { struct hash *nameHash = NULL; if (isNotEmpty(nameFile) && fileExists(nameFile)) { nameHash = hashNew(26); struct lineFile *lf = lineFileOpen(nameFile, TRUE); char *line; while (lineFileNext(lf, &line, NULL)) addNameMaybeComponents(nameHash, line, addComponents); lineFileClose(&lf); } else { if (*pBigTree == NULL) { *pBigTree = parseParsimonyProtobuf(protobufFile); reportTiming(pStartTime, "parse protobuf file (at startup, because sample names file was " "not provided)"); } struct mutationAnnotatedTree *bigTree = *pBigTree; int nodeCount = bigTree->nodeHash->elCount; nameHash = hashNew(digitsBaseTwo(nodeCount) + 3); rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents); } -reportTiming(pStartTime, "get tree names"); return nameHash; } -static char *matchName(struct hash *nameHash, char *name) +static struct hashOrMmHash *getTreeNames(char *nameFile, char *protobufFile, + struct mutationAnnotatedTree **pBigTree, + boolean addComponents, int *pStartTime) +/* Make a hash of full names of leaves of bigTree, using nameFile if it exists, otherwise + * falling back on the bigTree itself, loading it if necessary. If addComponents, also map + * components of those names to the full names in case the user gives us partial names/IDs. */ +{ +struct hashOrMmHash *treeNames = NULL; +AllocVar(treeNames); +if (isNotEmpty(nameFile) && fileExists(nameFile) && endsWith(nameFile, ".mmh")) + treeNames->mmh = mmHashFromFile(nameFile); +else + treeNames->hash = getTreeNamesHash(nameFile, protobufFile, pBigTree, addComponents, pStartTime); + +reportTiming(pStartTime, "get tree names"); +return treeNames; +} + +static const char *hashOrMmHashFindVal(struct hashOrMmHash *homh, char *key) +/* Look up string value for key in homh; return NULL if not found. */ +{ +const char *val = NULL; +if (homh) + { + if (homh->hash) + val = hashFindVal(homh->hash, key); + else + val = mmHashFindVal(homh->mmh, key); + } +return val; +} + +static const char *matchName(struct hashOrMmHash *nameHash, char *name) /* Look for a possibly partial name or ID provided by the user in nameHash. Return the result, * possibly NULL. If the full name doesn't match, try components of the name. */ { name = trimSpaces(name); // GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19 // because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building // UCSC's tree. if (startsWithNoCase("hCoV-19/", name)) name += strlen("hCoV-19/"); -char *match = hashFindVal(nameHash, name); +const char *match = hashOrMmHashFindVal(nameHash, name); int minWordSize=5; if (match == NULL && strchr(name, '|')) { // GISAID fasta headers have name|ID|date, and so do our tree IDs; try ID and name separately. char *words[4]; char copy[strlen(name)+1]; safecpy(copy, sizeof copy, name); - int wordCount = chopString(copy, "|", words, ArraySize(words)); + int wordCount = chopByChar(copy, '|', words, ArraySize(words)); if (wordCount == 3) { // name|ID|date; try ID first. if (strlen(words[1]) > minWordSize) - match = hashFindVal(nameHash, words[1]); + match = hashOrMmHashFindVal(nameHash, words[1]); if (match == NULL && strlen(words[0]) > minWordSize) { - match = hashFindVal(nameHash, words[0]); + match = hashOrMmHashFindVal(nameHash, words[0]); // Sometimes country/isolate names have spaces... strip out if present. if (match == NULL && strchr(words[0], ' ')) { stripChar(words[0], ' '); - match = hashFindVal(nameHash, words[0]); + match = hashOrMmHashFindVal(nameHash, words[0]); } } } else if (wordCount == 2) { // ID|date if (strlen(words[0]) > minWordSize) - match = hashFindVal(nameHash, words[0]); + match = hashOrMmHashFindVal(nameHash, words[0]); } } else if (match == NULL && strchr(name, ' ')) { // GISAID sequence names may include spaces, in both country names ("South Korea") and // isolate names. That messes up FASTA headers, so Nextstrain strips out spaces when // making the nextmeta and nextfasta download files for GISAID. Try stripping out spaces: char copy[strlen(name)+1]; safecpy(copy, sizeof copy, name); stripChar(copy, ' '); - match = hashFindVal(nameHash, copy); + match = hashOrMmHashFindVal(nameHash, copy); } return match; } -static boolean tallyMatch(char *match, char *term, +static boolean tallyMatch(const char *match, char *term, struct slName **retMatches, struct slName **retUnmatched) /* If match is non-NULL, add result to retMatches and return TRUE, otherwise add term to * retUnmatched and return FALSE. */ { boolean foundIt = FALSE; if (match) { foundIt = TRUE; slNameAddHead(retMatches, match); } else slNameAddHead(retUnmatched, term); return foundIt; } -static boolean matchIdRange(struct hash *nameHash, char *line, +static boolean matchIdRange(struct hashOrMmHash *nameHash, char *line, struct slName **retMatches, struct slName **retUnmatched) /* If line looks like it might contain a range of IDs, for example EPI_ISL_123-129 from an EPI_SET, * then expand the range(s) into individual IDs, look up the IDs, set retMatches and retUnmatched * to per-ID results, and return TRUE. */ { boolean foundAny = FALSE; *retMatches = *retUnmatched = NULL; regmatch_t substrArr[7]; // Line may contain a list of distinct IDs and/or ID ranges #define oneIdExp "([A-Z_]+)([0-9]+)" #define rangeEndExp "- *([A-Z_]*)([0-9]+)" #define rangeListExp "^("oneIdExp",? *("rangeEndExp")?),? *" while (regexMatchSubstr(line, rangeListExp, substrArr, ArraySize(substrArr))) { char *prefixA = regexSubstringClone(line, substrArr[2]); char *numberA = regexSubstringClone(line, substrArr[3]); if (regexSubstrMatched(substrArr[4])) { // Looks like a well-formed ID range char *prefixB = regexSubstringClone(line, substrArr[5]); char *numberB = regexSubstringClone(line, substrArr[6]); int start = atol(numberA); int end = atol(numberB); if ((isEmpty(prefixB) || sameString(prefixA, prefixB)) && end >= start) { char oneId[strlen(line)+1]; int num; for (num = start; num <= end; num++) { safef(oneId, sizeof oneId, "%s%d", prefixA, num); - char *match = hashFindVal(nameHash, oneId); + const char *match = hashOrMmHashFindVal(nameHash, oneId); foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched); } } else { // It matched the regex but the prefixes don't match and/or numbers are out of order // so we don't know what to do with it -- try matchName just in case. char *regMatch = regexSubstringClone(line, substrArr[1]); - char *match = matchName(nameHash, regMatch); + const char *match = matchName(nameHash, regMatch); foundAny |= tallyMatch(match, regMatch, retMatches, retUnmatched); } } else { // Just one ID char oneId[strlen(line)+1]; safef(oneId, sizeof oneId, "%s%s", prefixA, numberA); - char *match = hashFindVal(nameHash, oneId); + const char *match = hashOrMmHashFindVal(nameHash, oneId); foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched); } // Skip past this match to see if the line has another range next. line += (substrArr[0].rm_eo - substrArr[0].rm_so); } return foundAny; } -static struct slName *readSampleIds(struct lineFile *lf, struct hash *nameHash) +static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash) /* Read a file of sample names/IDs from the user; typically these will not be exactly the same * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt * to find component matches if an exact match isn't found. */ { struct slName *sampleIds = NULL; struct slName *unmatched = NULL; char *line; while (lineFileNext(lf, &line, NULL)) { // If tab-sep, just try first word in line char *tab = strchr(line, '\t'); if (tab) *tab = '\0'; - char *match = matchName(nameHash, line); + const char *match = matchName(nameHash, line); if (match) slNameAddHead(&sampleIds, match); else { struct slName *rangeMatches = NULL, *rangeUnmatched = NULL; if (matchIdRange(nameHash, line, &rangeMatches, &rangeUnmatched)) { sampleIds = slCat(rangeMatches, sampleIds); unmatched = slCat(rangeUnmatched, unmatched); } else { // If comma-sep, just try first word in line char *comma = strchr(line, ','); if (comma) @@ -3137,31 +3216,31 @@ warn("Unable to find %d of your sequences in the tree, e.g. %s", slCount(unmatched), firstFew->string); dyStringFree(&firstFew); } else if (sampleIds == NULL) warn("Could not find any names in input; empty file uploaded?"); slNameFreeList(&unmatched); return sampleIds; } void *loadMetadataWorker(void *arg) /* pthread worker function to read a tab-sep metadata file and return it hashed by name. */ { char *metadataFile = arg; int startTime = clock1000(); -struct hash *sampleMetadata = getSampleMetadata(metadataFile); +struct sampleMetadataStore *sampleMetadata = getSampleMetadata(metadataFile); reportTiming(&startTime, "read sample metadata (in a pthread)"); return sampleMetadata; } static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *)) /* Fork off a child process that parses a file and returns the resulting data structure. */ { pthread_t *pt; AllocVar(pt); if (! pthreadMayCreate(pt, NULL, workerFunction, filename)) pt = NULL; return pt; } static struct dnaSeq *getChromSeq(char *org, char *ref, char *db) @@ -3243,31 +3322,31 @@ if (0) { bigTree = parseParsimonyProtobuf(protobufPath); reportTiming(&startTime, "parse protobuf file (at startup, for excluding informativeBases " "from maskSites)"); informativeBasesFromTree(bigTree->tree, maskSites, refGenome->size); reportTiming(&startTime, "remove any informative bases in tree from maskSites"); } boolean isFasta = FALSE; boolean subtreesOnly = FALSE; struct seqInfo *seqInfoList = NULL; if (lfLooksLikeFasta(lf)) { struct slPair *failedSeqs; struct slPair *failedPsls; - struct hash *treeNames = NULL; + struct hashOrMmHash *treeNames = NULL; // We need to check uploaded names in fasta only for original usher, not usher-sampled(-server). if (!serverIsConfigured(org) && !endsWith(usherPath, "-sampled")) treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, FALSE, &startTime); vcfTn = vcfFromFasta(lf, org, refName, refGenome, maskSites, treeNames, &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime); if (failedSeqs) { puts("<p>"); struct slPair *fail; for (fail = failedSeqs; fail != NULL; fail = fail->next) printf("%s<br>\n", fail->name); puts("</p>"); } if (failedPsls) { @@ -3278,60 +3357,64 @@ puts("</p>"); } if (seqInfoList == NULL) printf("<p>Sorry, could not align any sequences to reference well enough to place in " "the phylogenetic tree.</p>\n"); isFasta = TRUE; } else if (lfLooksLikeVcf(lf)) { vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { subtreesOnly = TRUE; - struct hash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, &startTime); - addAliases(treeNames, aliasFile); + struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, + &startTime); + if (treeNames && treeNames->hash) + { + addAliases(treeNames->hash, aliasFile); reportTiming(&startTime, "load sample name aliases"); + } sampleIds = readSampleIds(lf, treeNames); reportTiming(&startTime, "look up uploaded sample names"); } lineFileClose(&lf); if (sampleIds == NULL) { return; } // Kick off child thread to load metadata simultaneously with running usher or matUtils. pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker); struct usherResults *results = NULL; if (vcfTn) { fflush(stdout); results = runUsher(org, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds, treeChoices, &startTime); } else if (subtreesOnly) { char *matUtilsPath = getMatUtilsPath(TRUE); results = runMatUtilsExtractSubtrees(refName, matUtilsPath, protobufPath, subtreeSize, sampleIds, treeChoices, &startTime); } -struct hash *sampleMetadata = NULL; +struct sampleMetadataStore *sampleMetadata = NULL; if (metadataPthread) { pthreadJoin(metadataPthread, (void **)(&sampleMetadata)); reportTiming(&startTime, "wait for sample metadata loading thread to finish"); } else { // We really need metadata -- load it the slow way. sampleMetadata = getSampleMetadata(metadataFile); reportTiming(&startTime, "load sample metadata without pthread"); } if (results && results->singleSubtreeInfo) { if (retSuccess) @@ -3446,31 +3529,31 @@ { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); makeNextstrainButtonN("viewNextstrainSub", ix, subtreeUserSampleCount, subtreeSize, jsonTns); puts("<br>"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, - subtree, sampleMetadata, source, refAcc, refName); + subtree, source, refAcc, refName); } reportTiming(&startTime, "describe placements"); } else printf("<p>(Skipping details; " "you uploaded %d sequences, and details are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); } puts("<h3>Downloads</h3>"); if (! subtreesOnly) { puts("<ul>"); // Offer big tree w/new samples for download