src/hg/hgPhyloPlace/phyloPlace.c 0b186effb2d2b536d2c280ec31bec6e153e6bd7e

0b186effb2d2b536d2c280ec31bec6e153e6bd7e
angie
  Tue May 21 13:26:39 2024 -0700
Add support for uploaded names/IDs for the multi-ref/tree organism.ra case.  Add new config option anchorSamples, pass to usher-sampled/matUtils/server if present.
anchorSamples is a file with names of sequences that should always be included in the subtree to provide some larger-scale context, e.g. well-known vaccine or reference material strains.  Influenza user request.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index b19a387..55a9244 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -3146,31 +3146,32 @@
         }
     else
         {
         // Just one ID
         char oneId[strlen(line)+1];
         safef(oneId, sizeof oneId, "%s%s", prefixA, numberA);
         const char *match = hashOrMmHashFindVal(nameHash, oneId);
         foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
         }
     // Skip past this match to see if the line has another range next.
     line += (substrArr[0].rm_eo - substrArr[0].rm_so);
     }
 return foundAny;
 }
 
-static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash)
+static struct slName *readSampleIds(struct lineFile *lf, struct hashOrMmHash *nameHash,
+                                    struct slName **retUnmatched)
 /* Read a file of sample names/IDs from the user; typically these will not be exactly the same
  * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
  * to find component matches if an exact match isn't found. */
 {
 struct slName *sampleIds = NULL;
 struct slName *unmatched = NULL;
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     // If tab-sep, just try first word in line
     char *tab = strchr(line, '\t');
     if (tab)
         *tab = '\0';
     const char *match = matchName(nameHash, line);
     if (match)
@@ -3189,50 +3190,60 @@
             char *comma = strchr(line, ',');
             if (comma)
                 {
                 *comma = '\0';
                 match = matchName(nameHash, line);
                 if (match)
                     slNameAddHead(&sampleIds, match);
                 else
                     slNameAddHead(&unmatched, line);
                 }
             else
                 slNameAddHead(&unmatched, line);
             }
         }
     }
+if (retUnmatched != NULL)
+    *retUnmatched = unmatched;
+else
+    slNameFreeList(&unmatched);
+return sampleIds;
+}
+
+static void reportUnmatched(struct slName *unmatched, boolean foundNone)
+/* Warn the user that some of their names/IDs could not be found in the tree.  If it's a long list,
+ * show only the first handful.  If nothing was unmatched but nothing was found either, warn about
+ * empty input. */
+{
 if (unmatched)
     {
     struct dyString *firstFew = dyStringNew(0);
     int maxExamples = 5;
     struct slName *example;
     int i;
     for (i = 0, example = unmatched;  example != NULL && i < maxExamples;
          i++, example = example->next)
         {
         dyStringAppendSep(firstFew, ", ");
         dyStringPrintf(firstFew, "'%s'", example->name);
         }
-    warn("Unable to find %d of your sequences in the tree, e.g. %s",
+    warn("Unable to find %d of your sequence names/IDs, e.g. %s",
          slCount(unmatched), firstFew->string);
     dyStringFree(&firstFew);
     }
-else if (sampleIds == NULL)
+else if (foundNone)
     warn("Could not find any names in input; empty file uploaded?");
-slNameFreeList(&unmatched);
-return sampleIds;
 }
 
 void *loadMetadataWorker(void *arg)
 /* pthread worker function to read a tab-sep metadata file and return it hashed by name.  */
 {
 char *metadataFile = arg;
 int startTime = clock1000();
 struct sampleMetadataStore *sampleMetadata = getSampleMetadata(metadataFile);
 reportTiming(&startTime, "read sample metadata (in a pthread)");
 return sampleMetadata;
 }
 
 static pthread_t *mayStartLoaderPthread(char *filename, void *(*workerFunction)(void *))
 /* Fork off a child process that parses a file and returns the resulting data structure. */
 {
@@ -3364,54 +3375,58 @@
 else if (lfLooksLikeVcf(lf))
     {
     vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds);
     reportTiming(&startTime, "check uploaded VCF");
     }
 else
     {
     subtreesOnly = TRUE;
     struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE,
                                                   &startTime);
     if (treeNames && treeNames->hash)
         {
         addAliases(treeNames->hash, aliasFile);
         reportTiming(&startTime, "load sample name aliases");
         }
-    sampleIds = readSampleIds(lf, treeNames);
+    struct slName *unmatched = NULL;
+    sampleIds = readSampleIds(lf, treeNames, &unmatched);
+    reportUnmatched(unmatched, (sampleIds == NULL));
+    slNameFreeList(&unmatched);
     reportTiming(&startTime, "look up uploaded sample names");
     }
 lineFileClose(&lf);
 if (sampleIds == NULL)
     {
     return;
     }
 
 // Kick off child thread to load metadata simultaneously with running usher or matUtils.
 pthread_t *metadataPthread = mayStartLoaderPthread(metadataFile, loadMetadataWorker);
 
 struct usherResults *results = NULL;
+char *anchorFile = phyloPlaceRefSettingPath(org, refName, "anchorSamples");
 if (vcfTn)
     {
     fflush(stdout);
     results = runUsher(org, usherPath, protobufPath, vcfTn->forCgi, subtreeSize, &sampleIds,
-                       treeChoices, &startTime);
+                       treeChoices, anchorFile, &startTime);
     }
 else if (subtreesOnly)
     {
     char *matUtilsPath = getMatUtilsPath(TRUE);
-    results = runMatUtilsExtractSubtrees(refName, matUtilsPath, protobufPath, subtreeSize,
-                                         sampleIds, treeChoices, &startTime);
+    results = runMatUtilsExtractSubtrees(org, matUtilsPath, protobufPath, subtreeSize,
+                                         sampleIds, treeChoices, anchorFile, &startTime);
     }
 
 struct sampleMetadataStore *sampleMetadata = NULL;
 if (metadataPthread)
     {
     pthreadJoin(metadataPthread, (void **)(&sampleMetadata));
     reportTiming(&startTime, "wait for sample metadata loading thread to finish");
     }
 else
     {
     // We really need metadata -- load it the slow way.
     sampleMetadata = getSampleMetadata(metadataFile);
     reportTiming(&startTime, "load sample metadata without pthread");
     }
 
@@ -3720,31 +3735,31 @@
 // Check header line... the nextclade guys change output columns frequently
 if (lineFileRow(nlf, row))
     {
     if (differentString(row[1], "seqName") || differentString(row[2], "dataset"))
         errAbort("nextclade sort output header format has changed");
     }
 else
     errAbort("nextclade sort output is empty");
 // Build up a hash of sample names to the best matching ref (the first one in the nextclade output
 // file; ignore subsequent matches for the same sample), a list of samples that match no ref, and
 // a hash of ref to open file handle to which we will write the sequences that matched that ref.
 struct slName *noMatches = NULL;
 struct hash *sampleRef = hashNew(0);
 struct hash *refOpenFileHandles = hashNew(0);
 struct dyString *dyRefDesc = dyStringNew(0);
-while (lineFileRow(nlf, row))
+while (lineFileRowTab(nlf, row))
     {
     char *sample = row[1];
     char *ref = row[2];
     if (isEmpty(ref))
         slNameAddHead(&noMatches, sample);
     else if (! hashFindVal(sampleRef, sample))
         {
         hashAdd(sampleRef, sample, cloneString(ref));
         if (! hashFindVal(refOpenFileHandles, ref))
             {
             struct tempName tnRefSamples;
             trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt");
             hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w"));
             describeRef(org, ref, dyRefDesc);
             slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi));
@@ -3783,61 +3798,141 @@
     {
     carefulClose((FILE **)&(hel->val));
     }
 
 // Clean up
 hashFree(&refOpenFileHandles);
 freeHashAndVals(&sampleRef);
 unlink(uploadedFile);
 unlink(nextcladeOut);
 if (retNoMatches)
     *retNoMatches = noMatches;
 reportTiming(pStartTime, "collated refs and samples");
 return refFiles;
 }
 
+static char *dumpNamesToTrashFile(struct slName *nameList)
+/* Write each item in nameList to a trashfile, one per line, and return the trash file name. */
+{
+struct tempName tmp;
+trashDirFile(&tmp, "ct", "usher_tmp", ".txt");
+FILE *f = mustOpen(tmp.forCgi, "w");
+struct slName *sln;
+for (sln = nameList;  sln != NULL;  sln = sln->next)
+    {
+    fputs(sln->name, f);
+    fputc('\n', f);
+    }
+carefulClose(&f);
+return cloneString(tmp.forCgi);
+}
+
+static struct refMatch *matchNamesWithReference(char *org, struct lineFile *lf,
+                                                struct slName **retNoMatches, int *pStartTime)
+/* For each name in lf, find all trees that contain a sample matching that name.  Make a temporary
+ * file per reference with matching sample names from that reference. */
+{
+struct refMatch *refFiles = NULL;
+// We need to search for names in lf in every ref/tree, and the searching code expects a lineFile,
+// but we can't necessarily rewind lf.  So copy lf to trash file that we can read repeatedly.
+char *uploadedFile = dumpLfToTrashFile(lf);
+// Find all subdirectories for org that contain reference.ra files and search uploaded names/IDs
+// in each ref/tree.
+struct hash *unmatchedCounts = hashNew(0);
+int refCount = 0;
+int totalFound = 0;
+struct dyString *dyRefDesc = dyStringNew(0);
+char *orgDir = catThreeStrings(PHYLOPLACE_DATA_DIR, "/", org);
+struct slName *dataDirPaths = pathsInDirAndSubdirs(orgDir, "*");
+struct slName *path;
+for (path = dataDirPaths;  path != NULL;  path = path->next)
+    {
+    if (endsWith(path->name, "/reference.ra"))
+        {
+        char *ref = finalDirName(path->name);
+        char *protobufPath = NULL;
+        char *source = NULL;
+        char *metadataFile = NULL;
+        char *aliasFile = NULL;
+        char *sampleNameFile = NULL;
+        struct treeChoices *treeChoices = loadTreeChoices(org, ref);
+        getProtobufMetadataSource(treeChoices, NULL,
+                              &protobufPath, &metadataFile, &source, &aliasFile, &sampleNameFile);
+        struct mutationAnnotatedTree *bigTree = NULL;
+        struct hashOrMmHash *treeNames = getTreeNames(sampleNameFile, protobufPath, &bigTree, TRUE,
+                                                      pStartTime);
+        struct lineFile *lf2 = lineFileOpen(uploadedFile, TRUE);
+        struct slName *unmatched = NULL;
+        struct slName *sampleIds = readSampleIds(lf2, treeNames, &unmatched);
+        lineFileClose(&lf2);
+        if (sampleIds)
+            {
+            describeRef(org, ref, dyRefDesc);
+            slAddHead(&refFiles,
+                      refMatchNew(ref, dyRefDesc->string, dumpNamesToTrashFile(sampleIds)));
+            totalFound += slCount(sampleIds);
+            }
+        struct slName *id;
+        for (id = unmatched;  id != NULL;  id = id->next)
+            {
+            hashIncInt(unmatchedCounts, id->name);
+            }
+        slNameFreeList(&unmatched);
+        refCount++;
+        }
+    }
+slNameFreeList(&dataDirPaths);
+// Now find ids that were not matched for any ref/tree.
+struct slName *unmatchedInAll = NULL;
+struct hashEl *hel;
+struct hashCookie cookie = hashFirst(unmatchedCounts);
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    if (ptToInt(hel->val) == refCount)
+        slNameAddHead(&unmatchedInAll, hel->name);
+    }
+reportUnmatched(unmatchedInAll, (totalFound == 0));
+slNameFreeList(&unmatchedInAll);
+hashFree(&unmatchedCounts);
+reportTiming(pStartTime, "looked up names in all trees");
+return refFiles;
+}
+
 boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf,
                           boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl,
                           struct cart *cart)
 /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids:
  * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
  * make custom track files.
  * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils
  * to make subtrees, show subtree results.
  * Return TRUE if we were able to get at least some results for the user's input. */
 {
 boolean success = FALSE;
 char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex");
 if (isNotEmpty(nextcladeIndex))
     {
     if (! fileExists(nextcladeIndex))
         errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist",
                  org, nextcladeIndex);
-    if (! lfLooksLikeFasta(lf))
-        {
-        char *name = phyloPlaceOrgSetting(org, "name");
-        if (isEmpty(name))
-            name = org;
-        char *description = emptyForNull(phyloPlaceOrgSetting(org, "description"));
-        errAbort("Sorry, only fasta input is supported for %s %s",
-                 name, description);
-        }
+    struct refMatch *refFiles = NULL;
     struct slName *noMatches = NULL;
     int startTime = clock1000();
-    struct refMatch *refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches,
-                                                           &startTime);
-    lineFileClose(&lf);
+    if (lfLooksLikeFasta(lf))
+        refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches, &startTime);
+    else
+        refFiles = matchNamesWithReference(org, lf, &noMatches, &startTime);
     if (noMatches != NULL)
         {
         printf("<br>No reference was found for the following sequences:\n<ul>\n");
         struct slName *noMatch;
         for (noMatch = noMatches;  noMatch != NULL;  noMatch = noMatch->next)
             printf("<li>%s\n", noMatch->name);
         puts("</ul>");
         }
     int refCount = slCount(refFiles);
     boolean doNav = (refCount > 1);
     struct refMatch *ref;
     if (doNav)
         {
         // Make some navigation links at the top
         puts("<a name='resultNavTop'></a>");