src/hg/hgPhyloPlace/phyloPlace.c eed992e6dfb41dffe0102882d47300c904aaf1de

eed992e6dfb41dffe0102882d47300c904aaf1de
angie
  Mon Apr 29 19:17:50 2024 -0700
When multiple reference sequences are matched, sort them alphabetically by description.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 37c94d7..291012d 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -3553,80 +3553,137 @@
 char *runNextcladeSort(char *seqFile, char *nextcladeIndex)
 /* Run the nextclade sort command on uploaded seqs, return output TSV file name. */
 {
 char *nextcladePath = getNextcladePath();
 struct tempName tnNextcladeOut;
 trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_sort", ".tsv");
 #define NEXTCLADE_NUM_THREADS "4"
 char *cmd[] = { nextcladePath, "sort", seqFile, "-m", nextcladeIndex,
                 "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, NULL };
 char **cmds[] = { cmd, NULL };
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0);
 pipelineClose(&pl);
 return cloneString(tnNextcladeOut.forCgi);
 }
 
-static struct slPair *matchSamplesWithReferences(char *nextcladeIndex, struct lineFile *lf,
+struct refMatch
+/* A reference sequence that has been matched with some uploaded sequence(s) */
+{
+    struct refMatch *next;
+    char *acc;                  // reference sequence accession
+    char *description;          // readable description of reference
+    char *seqFile;              // file containing uploaded sequence(s) matched to reference
+};
+
+static struct refMatch *refMatchNew(char *acc, char *description, char *seqFile)
+/* Alloc and return a new refMatch. */
+{
+struct refMatch *refMatch = NULL;
+AllocVar(refMatch);
+refMatch->acc = cloneString(acc);
+refMatch->description = cloneString(description);
+refMatch->seqFile = cloneString(seqFile);
+return refMatch;
+}
+
+static int refMatchCmp(const void *va, const void *vb)
+/* Compare two refMatch descriptions. */
+{
+const struct refMatch *a = *((struct refMatch **)va);
+const struct refMatch *b = *((struct refMatch **)vb);
+return strcmp(a->description, b->description);
+}
+
+static void describeRef(char *org, char *ref, struct dyString *dyRefDesc)
+/* Depending on whether name and description are specified in config, overwrite dyRefDesc with
+ * a description of the reference. */
+{
+char *name = phyloPlaceRefSetting(org, ref, "name");
+char *description = phyloPlaceRefSetting(org, ref, "description");
+dyStringClear(dyRefDesc);
+if (isNotEmpty(description))
+    {
+    if (isNotEmpty(name))
+        dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref);
+    else
+        dyStringPrintf(dyRefDesc, "%s %s", ref, description);
+    }
+else
+    {
+    if (isNotEmpty(name))
+        dyStringPrintf(dyRefDesc, "%s (%s)", name, ref);
+    else
+        dyStringAppend(dyRefDesc, ref);
+    }
+}
+
+static struct refMatch *matchSamplesWithReferences(char *org, char *nextcladeIndex,
+                                                   struct lineFile *lf,
                                                    struct slName **retNoMatches, int *pStartTime)
 /* Save lf's content (in memory from CGI post) to a temporary file so we can run nextclade sort
  * to identify the best reference match for each uploaded sequence in lf.
  * For each reference identified as the best match for at least one sequence, write all sequences
  * for that reference to a different temporary file; return a list of pairs of {reference accession,
  * filename that holds the uploaded sequences matched with that reference}. */
 {
-struct slPair *refFiles = NULL;
+struct refMatch *refFiles = NULL;
 char *uploadedFile = dumpLfToTrashFile(lf);
 reportTiming(pStartTime, "dump uploaded seqs to file");
 
 char *nextcladeOut = runNextcladeSort(uploadedFile, nextcladeIndex);
 reportTiming(pStartTime, "run nextclade sort");
 
 struct lineFile *nlf = lineFileOpen(nextcladeOut, TRUE);
 char *row[5];
 // Check header line... the nextclade guys change output columns frequently
 if (lineFileRow(nlf, row))
     {
     if (differentString(row[1], "seqName") || differentString(row[2], "dataset"))
         errAbort("nextclade sort output header format has changed");
     }
 else
     errAbort("nextclade sort output is empty");
 // Build up a hash of sample names to the best matching ref (the first one in the nextclade output
 // file; ignore subsequent matches for the same sample), a list of samples that match no ref, and
 // a hash of ref to open file handle to which we will write the sequences that matched that ref.
 struct slName *noMatches = NULL;
 struct hash *sampleRef = hashNew(0);
 struct hash *refOpenFileHandles = hashNew(0);
+struct dyString *dyRefDesc = dyStringNew(0);
 while (lineFileRow(nlf, row))
     {
     char *sample = row[1];
     char *ref = row[2];
     if (isEmpty(ref))
         slNameAddHead(&noMatches, sample);
     else if (! hashFindVal(sampleRef, sample))
         {
         hashAdd(sampleRef, sample, cloneString(ref));
         if (! hashFindVal(refOpenFileHandles, ref))
             {
             struct tempName tnRefSamples;
             trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt");
             hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w"));
-            slPairAdd(&refFiles, ref, cloneString(tnRefSamples.forCgi));
+            describeRef(org, ref, dyRefDesc);
+            slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi));
             }
         }
     }
+dyStringFree(&dyRefDesc);
 lineFileClose(&nlf);
+// Sort refFiles alphabetically by description
+slSort(&refFiles, refMatchCmp);
 // Now go through the uploaded seqs again to write them out into separate files per ref.
 FILE *f = mustOpen(uploadedFile, "r");
 char *nameLine = NULL;
 struct dnaSeq *seq = NULL;
 while (faReadMixedNext(f, FALSE, "uploaded_seq", TRUE, &nameLine, &seq))
     {
     char *seqName = nameLine;
     if (nameLine[0] == '>')
         seqName = trimSpaces(nameLine+1);
     char *ref = hashFindVal(sampleRef, seqName);
     if (ref)
         {
         FILE *f = hashFindVal(refOpenFileHandles, ref);
         if (f == NULL)
             errAbort("matchSamplesWithReferences: can't find file handle for ref '%s' "
@@ -3643,98 +3700,74 @@
     {
     carefulClose((FILE **)&(hel->val));
     }
 
 // Clean up
 hashFree(&refOpenFileHandles);
 freeHashAndVals(&sampleRef);
 unlink(uploadedFile);
 unlink(nextcladeOut);
 if (retNoMatches)
     *retNoMatches = noMatches;
 reportTiming(pStartTime, "collated refs and samples");
 return refFiles;
 }
 
-static void describeRef(char *org, char *ref, struct dyString *dyRefDesc)
-/* Depending on whether name and description are specified in config, overwrite dyRefDesc with
- * a description of the reference. */
-{
-char *name = phyloPlaceRefSetting(org, ref, "name");
-char *description = phyloPlaceRefSetting(org, ref, "description");
-dyStringClear(dyRefDesc);
-if (isNotEmpty(description))
-    {
-    if (isNotEmpty(name))
-        dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref);
-    else
-        dyStringPrintf(dyRefDesc, "%s %s", ref, description);
-    }
-else
-    {
-    if (isNotEmpty(name))
-        dyStringPrintf(dyRefDesc, "%s (%s)", name, ref);
-    else
-        dyStringAppend(dyRefDesc, ref);
-    }
-}
-
 boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf,
                           boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl,
                           struct cart *cart)
 /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids:
  * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
  * make custom track files.
  * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils
  * to make subtrees, show subtree results.
  * Return TRUE if we were able to get at least some results for the user's input. */
 {
 boolean success = FALSE;
 char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex");
 if (isNotEmpty(nextcladeIndex))
     {
     if (! fileExists(nextcladeIndex))
         errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist",
                  org, nextcladeIndex);
     if (! lfLooksLikeFasta(lf))
         {
         char *name = phyloPlaceOrgSetting(org, "name");
         if (isEmpty(name))
             name = org;
         char *description = emptyForNull(phyloPlaceOrgSetting(org, "description"));
         errAbort("Sorry, only fasta input is supported for %s %s",
                  name, description);
         }
     struct slName *noMatches = NULL;
     int startTime = clock1000();
-    struct slPair *refFiles = matchSamplesWithReferences(nextcladeIndex, lf, &noMatches, &startTime);
+    struct refMatch *refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches,
+                                                           &startTime);
     lineFileClose(&lf);
     if (noMatches != NULL)
         {
         printf("<br>No reference was found for the following sequences:\n<ul>\n");
         struct slName *noMatch;
         for (noMatch = noMatches;  noMatch != NULL;  noMatch = noMatch->next)
             printf("<li>%s\n", noMatch->name);
         puts("</ul>");
         }
-    struct dyString *dyRefDesc = dyStringNew(0);
-    struct slPair *ref;
+    struct refMatch *ref;
     for (ref = refFiles;  ref != NULL;  ref = ref->next)
         {
         if (ref != refFiles)
             puts("<hr>");
-        describeRef(org, ref->name, dyRefDesc);
-        printf("<h2>Sequence(s) matched reference %s</h2>\n", dyRefDesc->string);
-        struct lineFile *rlf = lineFileOpen(ref->val, TRUE);
-        phyloPlaceSamplesOneRef(rlf, db, org, ref->name, defaultProtobuf,
+        printf("<h2>Sequence(s) matched reference %s</h2>\n", ref->description);
+        struct lineFile *rlf = lineFileOpen(ref->seqFile, TRUE);
+        phyloPlaceSamplesOneRef(rlf, db, org, ref->acc, defaultProtobuf,
                                 doMeasureTiming, subtreeSize, tl, cart, &success);
         }
     puts("<br>");
     }
 else
     {
     // No nextcladeIndex means this is the old-style single-reference setup (i.e. SARS-CoV-2).
     phyloPlaceSamplesOneRef(lf, db, org, org, defaultProtobuf, doMeasureTiming, subtreeSize,
                             tl, cart, &success);
     }
 return success;
 }