0b186effb2d2b536d2c280ec31bec6e153e6bd7e
angie
Tue May 21 13:26:39 2024 -0700
Add support for uploaded names/IDs for the multi-ref/tree organism.ra case. Add new config option anchorSamples, pass to usher-sampled/matUtils/server if present.
anchorSamples is a file with names of sequences that should always be included in the subtree to provide some larger-scale context, e.g. well-known vaccine or reference material strains. Influenza user request.
diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index b19a387..55a9244 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -3146,31 +3146,32 @@
}
else
{
// Just one ID
char oneId[strlen(line)+1];
safef(oneId, sizeof oneId, "%s%s", prefixA, numberA);
const char *match = hashOrMmHashFindVal(nameHash, oneId);
foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
}
// Skip past this match to see if the line has another range next.
line += (substrArr[0].rm_eo - substrArr[0].rm_so);
}
return foundAny;
}
-static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash)
+static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash,
+ struct slName **retUnmatched)
/* Read a file of sample names/IDs from the user; typically these will not be exactly the same
* as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
* to find component matches if an exact match isn't found. */
{
struct slName *sampleIds = NULL;
struct slName *unmatched = NULL;
char *line;
while (lineFileNext(lf, &line, NULL))
{
// If tab-sep, just try first word in line
char *tab = strchr(line, '\t');
if (tab)
*tab = '\0';
const char *match = matchName(nameHash, line);
if (match)
@@ -3189,50 +3190,60 @@
char *comma = strchr(line, ',');
if (comma)
{
*comma = '\0';
match = matchName(nameHash, line);
if (match)
slNameAddHead(&sampleIds, match);
else
slNameAddHead(&unmatched, line);
}
else
slNameAddHead(&unmatched, line);
}
}
}
+if (retUnmatched != NULL)
+ *retUnmatched = unmatched;
+else
+ slNameFreeList(&unmatched);
+return sampleIds;
+}
+
+static void reportUnmatched(struct slName *unmatched, boolean foundNone)
+/* Warn the user that some of their names/IDs could not be found in the tree. If it's a long list,
+ * show only the first handful. If nothing was unmatched but nothing was found either, warn about
+ * empty input. */
+{
if (unmatched)
{
struct dyString *firstFew = dyStringNew(0);
int maxExamples = 5;
struct slName *example;
int i;
for (i = 0, example = unmatched; example != NULL && i < maxExamples;
i++, example = example->next)
{
dyStringAppendSep(firstFew, ", ");
dyStringPrintf(firstFew, "'%s'", example->name);
}
- warn("Unable to find %d of your sequences in the tree, e.g. %s",
+ warn("Unable to find %d of your sequence names/IDs, e.g. %s",
slCount(unmatched), firstFew->string);
dyStringFree(&firstFew);
}
-else if (sampleIds == NULL)
+else if (foundNone)
warn("Could not find any names in input; empty file uploaded?");
-slNameFreeList(&unmatched);
-return sampleIds;
}
void *loadMetadataWorker(void *arg)
/* pthread worker function to read a tab-sep metadata file and return it hashed by name. */
{
char *metadataFile = arg;
int startTime = clock1000();
struct sampleMetadataStore *sampleMetadata = getSampleMetadata(metadataFile);
reportTiming(&startTime, "read sample metadata (in a pthread)");
return sampleMetadata;
}
static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *))
/* Fork off a child process that parses a file and returns the resulting data structure. */
{
@@ -3364,54 +3375,58 @@
else if (lfLooksLikeVcf(lf))
{
vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds);
reportTiming(&startTime, "check uploaded VCF");
}
else
{
subtreesOnly = TRUE;
struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE,
&startTime);
if (treeNames && treeNames->hash)
{
addAliases(treeNames->hash, aliasFile);
reportTiming(&startTime, "load sample name aliases");
}
- sampleIds = readSampleIds(lf, treeNames);
+ struct slName *unmatched = NULL;
+ sampleIds = readSampleIds(lf, treeNames, &unmatched);
+ reportUnmatched(unmatched, (sampleIds == NULL));
+ slNameFreeList(&unmatched);
reportTiming(&startTime, "look up uploaded sample names");
}
lineFileClose(&lf);
if (sampleIds == NULL)
{
return;
}
// Kick off child thread to load metadata simultaneously with running usher or matUtils.
pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker);
struct usherResults *results = NULL;
+char *anchorFile = phyloPlaceRefSettingPath(org, refName, "anchorSamples");
if (vcfTn)
{
fflush(stdout);
results = runUsher(org, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds,
- treeChoices, &startTime);
+ treeChoices, anchorFile, &startTime);
}
else if (subtreesOnly)
{
char *matUtilsPath = getMatUtilsPath(TRUE);
- results = runMatUtilsExtractSubtrees(refName, matUtilsPath, protobufPath, subtreeSize,
- sampleIds, treeChoices, &startTime);
+ results = runMatUtilsExtractSubtrees(org, matUtilsPath, protobufPath, subtreeSize,
+ sampleIds, treeChoices, anchorFile, &startTime);
}
struct sampleMetadataStore *sampleMetadata = NULL;
if (metadataPthread)
{
pthreadJoin(metadataPthread, (void **)(&sampleMetadata));
reportTiming(&startTime, "wait for sample metadata loading thread to finish");
}
else
{
// We really need metadata -- load it the slow way.
sampleMetadata = getSampleMetadata(metadataFile);
reportTiming(&startTime, "load sample metadata without pthread");
}
@@ -3720,31 +3735,31 @@
// Check header line... the nextclade guys change output columns frequently
if (lineFileRow(nlf, row))
{
if (differentString(row[1], "seqName") || differentString(row[2], "dataset"))
errAbort("nextclade sort output header format has changed");
}
else
errAbort("nextclade sort output is empty");
// Build up a hash of sample names to the best matching ref (the first one in the nextclade output
// file; ignore subsequent matches for the same sample), a list of samples that match no ref, and
// a hash of ref to open file handle to which we will write the sequences that matched that ref.
struct slName *noMatches = NULL;
struct hash *sampleRef = hashNew(0);
struct hash *refOpenFileHandles = hashNew(0);
struct dyString *dyRefDesc = dyStringNew(0);
-while (lineFileRow(nlf, row))
+while (lineFileRowTab(nlf, row))
{
char *sample = row[1];
char *ref = row[2];
if (isEmpty(ref))
slNameAddHead(&noMatches, sample);
else if (! hashFindVal(sampleRef, sample))
{
hashAdd(sampleRef, sample, cloneString(ref));
if (! hashFindVal(refOpenFileHandles, ref))
{
struct tempName tnRefSamples;
trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt");
hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w"));
describeRef(org, ref, dyRefDesc);
slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi));
@@ -3783,61 +3798,141 @@
{
carefulClose((FILE **)&(hel->val));
}
// Clean up
hashFree(&refOpenFileHandles);
freeHashAndVals(&sampleRef);
unlink(uploadedFile);
unlink(nextcladeOut);
if (retNoMatches)
*retNoMatches = noMatches;
reportTiming(pStartTime, "collated refs and samples");
return refFiles;
}
+static char *dumpNamesToTrashFile(struct slName *nameList)
+/* Write each item in nameList to a trashfile, one per line, and return the trash file name. */
+{
+struct tempName tmp;
+trashDirFile(&tmp, "ct", "usher_tmp", ".txt");
+FILE *f = mustOpen(tmp.forCgi, "w");
+struct slName *sln;
+for (sln = nameList; sln != NULL; sln = sln->next)
+ {
+ fputs(sln->name, f);
+ fputc('\n', f);
+ }
+carefulClose(&f);
+return cloneString(tmp.forCgi);
+}
+
+static struct refMatch *matchNamesWithReference(char *org, struct lineFile *lf,
+ struct slName **retNoMatches, int *pStartTime)
+/* For each name in lf, find all trees that contain a sample matching that name. Make a temporary
+ * file per reference with matching sample names from that reference. */
+{
+struct refMatch *refFiles = NULL;
+// We need to search for names in lf in every ref/tree, and the searching code expects a lineFile,
+// but we can't necessarily rewind lf. So copy lf to trash file that we can read repeatedly.
+char *uploadedFile = dumpLfToTrashFile(lf);
+// Find all subdirectories for org that contain reference.ra files and search uploaded names/IDs
+// in each ref/tree.
+struct hash *unmatchedCounts = hashNew(0);
+int refCount = 0;
+int totalFound = 0;
+struct dyString *dyRefDesc = dyStringNew(0);
+char *orgDir = catThreeStrings(PHYLOPLACE_DATA_DIR, "/", org);
+struct slName *dataDirPaths = pathsInDirAndSubdirs(orgDir, "*");
+struct slName *path;
+for (path = dataDirPaths; path != NULL; path = path->next)
+ {
+ if (endsWith(path->name, "/reference.ra"))
+ {
+ char *ref = finalDirName(path->name);
+ char *protobufPath = NULL;
+ char *source = NULL;
+ char *metadataFile = NULL;
+ char *aliasFile = NULL;
+ char *sampleNameFile = NULL;
+ struct treeChoices *treeChoices = loadTreeChoices(org, ref);
+ getProtobufMetadataSource(treeChoices, NULL,
+ &protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile);
+ struct mutationAnnotatedTree *bigTree = NULL;
+ struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE,
+ pStartTime);
+ struct lineFile *lf2 = lineFileOpen(uploadedFile, TRUE);
+ struct slName *unmatched = NULL;
+ struct slName *sampleIds = readSampleIds(lf2, treeNames, &unmatched);
+ lineFileClose(&lf2);
+ if (sampleIds)
+ {
+ describeRef(org, ref, dyRefDesc);
+ slAddHead(&refFiles,
+ refMatchNew(ref, dyRefDesc->string, dumpNamesToTrashFile(sampleIds)));
+ totalFound += slCount(sampleIds);
+ }
+ struct slName *id;
+ for (id = unmatched; id != NULL; id = id->next)
+ {
+ hashIncInt(unmatchedCounts, id->name);
+ }
+ slNameFreeList(&unmatched);
+ refCount++;
+ }
+ }
+slNameFreeList(&dataDirPaths);
+// Now find ids that were not matched for any ref/tree.
+struct slName *unmatchedInAll = NULL;
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(unmatchedCounts);
+while ((hel = hashNext(&cookie)) != NULL)
+ {
+ if (ptToInt(hel->val) == refCount)
+ slNameAddHead(&unmatchedInAll, hel->name);
+ }
+reportUnmatched(unmatchedInAll, (totalFound == 0));
+slNameFreeList(&unmatchedInAll);
+hashFree(&unmatchedCounts);
+reportTiming(pStartTime, "looked up names in all trees");
+return refFiles;
+}
+
boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf,
boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl,
struct cart *cart)
/* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids:
* If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
* make custom track files.
* If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils
* to make subtrees, show subtree results.
* Return TRUE if we were able to get at least some results for the user's input. */
{
boolean success = FALSE;
char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex");
if (isNotEmpty(nextcladeIndex))
{
if (! fileExists(nextcladeIndex))
errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist",
org, nextcladeIndex);
- if (! lfLooksLikeFasta(lf))
- {
- char *name = phyloPlaceOrgSetting(org, "name");
- if (isEmpty(name))
- name = org;
- char *description = emptyForNull(phyloPlaceOrgSetting(org, "description"));
- errAbort("Sorry, only fasta input is supported for %s %s",
- name, description);
- }
+ struct refMatch *refFiles = NULL;
struct slName *noMatches = NULL;
int startTime = clock1000();
- struct refMatch *refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches,
- &startTime);
- lineFileClose(&lf);
+ if (lfLooksLikeFasta(lf))
+ refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches, &startTime);
+ else
+ refFiles = matchNamesWithReference(org, lf, &noMatches, &startTime);
if (noMatches != NULL)
{
printf("
No reference was found for the following sequences:\n