0b186effb2d2b536d2c280ec31bec6e153e6bd7e angie Tue May 21 13:26:39 2024 -0700 Add support for uploaded names/IDs for the multi-ref/tree organism.ra case. Add new config option anchorSamples, pass to usher-sampled/matUtils/server if present. anchorSamples is a file with names of sequences that should always be included in the subtree to provide some larger-scale context, e.g. well-known vaccine or reference material strains. Influenza user request. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index b19a387..55a9244 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -3146,31 +3146,32 @@ } else { // Just one ID char oneId[strlen(line)+1]; safef(oneId, sizeof oneId, "%s%s", prefixA, numberA); const char *match = hashOrMmHashFindVal(nameHash, oneId); foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched); } // Skip past this match to see if the line has another range next. line += (substrArr[0].rm_eo - substrArr[0].rm_so); } return foundAny; } -static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash) +static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash, + struct slName **retUnmatched) /* Read a file of sample names/IDs from the user; typically these will not be exactly the same * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt * to find component matches if an exact match isn't found. */ { struct slName *sampleIds = NULL; struct slName *unmatched = NULL; char *line; while (lineFileNext(lf, &line, NULL)) { // If tab-sep, just try first word in line char *tab = strchr(line, '\t'); if (tab) *tab = '\0'; const char *match = matchName(nameHash, line); if (match) @@ -3189,50 +3190,60 @@ char *comma = strchr(line, ','); if (comma) { *comma = '\0'; match = matchName(nameHash, line); if (match) slNameAddHead(&sampleIds, match); else slNameAddHead(&unmatched, line); } else slNameAddHead(&unmatched, line); } } } +if (retUnmatched != NULL) + *retUnmatched = unmatched; +else + slNameFreeList(&unmatched); +return sampleIds; +} + +static void reportUnmatched(struct slName *unmatched, boolean foundNone) +/* Warn the user that some of their names/IDs could not be found in the tree. If it's a long list, + * show only the first handful. If nothing was unmatched but nothing was found either, warn about + * empty input. */ +{ if (unmatched) { struct dyString *firstFew = dyStringNew(0); int maxExamples = 5; struct slName *example; int i; for (i = 0, example = unmatched; example != NULL && i < maxExamples; i++, example = example->next) { dyStringAppendSep(firstFew, ", "); dyStringPrintf(firstFew, "'%s'", example->name); } - warn("Unable to find %d of your sequences in the tree, e.g. %s", + warn("Unable to find %d of your sequence names/IDs, e.g. %s", slCount(unmatched), firstFew->string); dyStringFree(&firstFew); } -else if (sampleIds == NULL) +else if (foundNone) warn("Could not find any names in input; empty file uploaded?"); -slNameFreeList(&unmatched); -return sampleIds; } void *loadMetadataWorker(void *arg) /* pthread worker function to read a tab-sep metadata file and return it hashed by name. */ { char *metadataFile = arg; int startTime = clock1000(); struct sampleMetadataStore *sampleMetadata = getSampleMetadata(metadataFile); reportTiming(&startTime, "read sample metadata (in a pthread)"); return sampleMetadata; } static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *)) /* Fork off a child process that parses a file and returns the resulting data structure. */ { @@ -3364,54 +3375,58 @@ else if (lfLooksLikeVcf(lf)) { vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { subtreesOnly = TRUE; struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, &startTime); if (treeNames && treeNames->hash) { addAliases(treeNames->hash, aliasFile); reportTiming(&startTime, "load sample name aliases"); } - sampleIds = readSampleIds(lf, treeNames); + struct slName *unmatched = NULL; + sampleIds = readSampleIds(lf, treeNames, &unmatched); + reportUnmatched(unmatched, (sampleIds == NULL)); + slNameFreeList(&unmatched); reportTiming(&startTime, "look up uploaded sample names"); } lineFileClose(&lf); if (sampleIds == NULL) { return; } // Kick off child thread to load metadata simultaneously with running usher or matUtils. pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker); struct usherResults *results = NULL; +char *anchorFile = phyloPlaceRefSettingPath(org, refName, "anchorSamples"); if (vcfTn) { fflush(stdout); results = runUsher(org, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds, - treeChoices, &startTime); + treeChoices, anchorFile, &startTime); } else if (subtreesOnly) { char *matUtilsPath = getMatUtilsPath(TRUE); - results = runMatUtilsExtractSubtrees(refName, matUtilsPath, protobufPath, subtreeSize, - sampleIds, treeChoices, &startTime); + results = runMatUtilsExtractSubtrees(org, matUtilsPath, protobufPath, subtreeSize, + sampleIds, treeChoices, anchorFile, &startTime); } struct sampleMetadataStore *sampleMetadata = NULL; if (metadataPthread) { pthreadJoin(metadataPthread, (void **)(&sampleMetadata)); reportTiming(&startTime, "wait for sample metadata loading thread to finish"); } else { // We really need metadata -- load it the slow way. sampleMetadata = getSampleMetadata(metadataFile); reportTiming(&startTime, "load sample metadata without pthread"); } @@ -3720,31 +3735,31 @@ // Check header line... the nextclade guys change output columns frequently if (lineFileRow(nlf, row)) { if (differentString(row[1], "seqName") || differentString(row[2], "dataset")) errAbort("nextclade sort output header format has changed"); } else errAbort("nextclade sort output is empty"); // Build up a hash of sample names to the best matching ref (the first one in the nextclade output // file; ignore subsequent matches for the same sample), a list of samples that match no ref, and // a hash of ref to open file handle to which we will write the sequences that matched that ref. struct slName *noMatches = NULL; struct hash *sampleRef = hashNew(0); struct hash *refOpenFileHandles = hashNew(0); struct dyString *dyRefDesc = dyStringNew(0); -while (lineFileRow(nlf, row)) +while (lineFileRowTab(nlf, row)) { char *sample = row[1]; char *ref = row[2]; if (isEmpty(ref)) slNameAddHead(&noMatches, sample); else if (! hashFindVal(sampleRef, sample)) { hashAdd(sampleRef, sample, cloneString(ref)); if (! hashFindVal(refOpenFileHandles, ref)) { struct tempName tnRefSamples; trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt"); hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w")); describeRef(org, ref, dyRefDesc); slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi)); @@ -3783,61 +3798,141 @@ { carefulClose((FILE **)&(hel->val)); } // Clean up hashFree(&refOpenFileHandles); freeHashAndVals(&sampleRef); unlink(uploadedFile); unlink(nextcladeOut); if (retNoMatches) *retNoMatches = noMatches; reportTiming(pStartTime, "collated refs and samples"); return refFiles; } +static char *dumpNamesToTrashFile(struct slName *nameList) +/* Write each item in nameList to a trashfile, one per line, and return the trash file name. */ +{ +struct tempName tmp; +trashDirFile(&tmp, "ct", "usher_tmp", ".txt"); +FILE *f = mustOpen(tmp.forCgi, "w"); +struct slName *sln; +for (sln = nameList; sln != NULL; sln = sln->next) + { + fputs(sln->name, f); + fputc('\n', f); + } +carefulClose(&f); +return cloneString(tmp.forCgi); +} + +static struct refMatch *matchNamesWithReference(char *org, struct lineFile *lf, + struct slName **retNoMatches, int *pStartTime) +/* For each name in lf, find all trees that contain a sample matching that name. Make a temporary + * file per reference with matching sample names from that reference. */ +{ +struct refMatch *refFiles = NULL; +// We need to search for names in lf in every ref/tree, and the searching code expects a lineFile, +// but we can't necessarily rewind lf. So copy lf to trash file that we can read repeatedly. +char *uploadedFile = dumpLfToTrashFile(lf); +// Find all subdirectories for org that contain reference.ra files and search uploaded names/IDs +// in each ref/tree. +struct hash *unmatchedCounts = hashNew(0); +int refCount = 0; +int totalFound = 0; +struct dyString *dyRefDesc = dyStringNew(0); +char *orgDir = catThreeStrings(PHYLOPLACE_DATA_DIR, "/", org); +struct slName *dataDirPaths = pathsInDirAndSubdirs(orgDir, "*"); +struct slName *path; +for (path = dataDirPaths; path != NULL; path = path->next) + { + if (endsWith(path->name, "/reference.ra")) + { + char *ref = finalDirName(path->name); + char *protobufPath = NULL; + char *source = NULL; + char *metadataFile = NULL; + char *aliasFile = NULL; + char *sampleNameFile = NULL; + struct treeChoices *treeChoices = loadTreeChoices(org, ref); + getProtobufMetadataSource(treeChoices, NULL, + &protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile); + struct mutationAnnotatedTree *bigTree = NULL; + struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE, + pStartTime); + struct lineFile *lf2 = lineFileOpen(uploadedFile, TRUE); + struct slName *unmatched = NULL; + struct slName *sampleIds = readSampleIds(lf2, treeNames, &unmatched); + lineFileClose(&lf2); + if (sampleIds) + { + describeRef(org, ref, dyRefDesc); + slAddHead(&refFiles, + refMatchNew(ref, dyRefDesc->string, dumpNamesToTrashFile(sampleIds))); + totalFound += slCount(sampleIds); + } + struct slName *id; + for (id = unmatched; id != NULL; id = id->next) + { + hashIncInt(unmatchedCounts, id->name); + } + slNameFreeList(&unmatched); + refCount++; + } + } +slNameFreeList(&dataDirPaths); +// Now find ids that were not matched for any ref/tree. +struct slName *unmatchedInAll = NULL; +struct hashEl *hel; +struct hashCookie cookie = hashFirst(unmatchedCounts); +while ((hel = hashNext(&cookie)) != NULL) + { + if (ptToInt(hel->val) == refCount) + slNameAddHead(&unmatchedInAll, hel->name); + } +reportUnmatched(unmatchedInAll, (totalFound == 0)); +slNameFreeList(&unmatchedInAll); +hashFree(&unmatchedCounts); +reportTiming(pStartTime, "looked up names in all trees"); +return refFiles; +} + boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf, boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl, struct cart *cart) /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, * make custom track files. * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils * to make subtrees, show subtree results. * Return TRUE if we were able to get at least some results for the user's input. */ { boolean success = FALSE; char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex"); if (isNotEmpty(nextcladeIndex)) { if (! fileExists(nextcladeIndex)) errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist", org, nextcladeIndex); - if (! lfLooksLikeFasta(lf)) - { - char *name = phyloPlaceOrgSetting(org, "name"); - if (isEmpty(name)) - name = org; - char *description = emptyForNull(phyloPlaceOrgSetting(org, "description")); - errAbort("Sorry, only fasta input is supported for %s %s", - name, description); - } + struct refMatch *refFiles = NULL; struct slName *noMatches = NULL; int startTime = clock1000(); - struct refMatch *refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches, - &startTime); - lineFileClose(&lf); + if (lfLooksLikeFasta(lf)) + refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches, &startTime); + else + refFiles = matchNamesWithReference(org, lf, &noMatches, &startTime); if (noMatches != NULL) { printf("
No reference was found for the following sequences:\n"); } int refCount = slCount(refFiles); boolean doNav = (refCount > 1); struct refMatch *ref; if (doNav) { // Make some navigation links at the top puts("");