src/hg/hgPhyloPlace/phyloPlace.c 466d876c3308a789b3abd3e2ae559d1413ea5fc5

466d876c3308a789b3abd3e2ae559d1413ea5fc5
angie
  Thu Sep 12 10:01:15 2024 -0700
nextclade sort's scoring function, for short similar reference sequences, gives too much weight to reference sequence length and not enough to numHits.  So, instead of using nextclade sort's top-scoring match for each input sequence, run nextclade sort --all-matches and choose the ref with the highest (score * numHits).  This does a better job when distinguishing between 7 RefSeq assemblies for influenza A segments.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index d00d9d4..589725b 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -3644,31 +3644,31 @@
     fputs(line, f);
     fputc('\n', f);
     }
 carefulClose(&f);
 return cloneString(tmp.forCgi);
 }
 
 char *runNextcladeSort(char *seqFile, char *nextcladeIndex)
 /* Run the nextclade sort command on uploaded seqs, return output TSV file name. */
 {
 char *nextcladePath = getNextcladePath();
 struct tempName tnNextcladeOut;
 trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_sort", ".tsv");
 #define NEXTCLADE_NUM_THREADS "4"
 char *cmd[] = { nextcladePath, "sort", seqFile, "-m", nextcladeIndex,
-                "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, NULL };
+                "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, "--all-matches", NULL };
 char **cmds[] = { cmd, NULL };
 struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0);
 pipelineClose(&pl);
 return cloneString(tnNextcladeOut.forCgi);
 }
 
 struct refMatch
 /* A reference sequence that has been matched with some uploaded sequence(s) */
 {
     struct refMatch *next;
     char *acc;                  // reference sequence accession
     char *description;          // readable description of reference
     char *seqFile;              // file containing uploaded sequence(s) matched to reference
 };
 
@@ -3702,82 +3702,121 @@
     {
     if (isNotEmpty(name))
         dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref);
     else
         dyStringPrintf(dyRefDesc, "%s %s", ref, description);
     }
 else
     {
     if (isNotEmpty(name))
         dyStringPrintf(dyRefDesc, "%s (%s)", name, ref);
     else
         dyStringAppend(dyRefDesc, ref);
     }
 }
 
+static void storeRefMatch(char *org, char *sample, char *ref, struct hash *sampleRef,
+                          struct hash *refOpenFileHandles, struct dyString *dyRefDesc,
+                          struct refMatch **pRefFiles)
+/* sample has been matched to ref -- save the mapping in sampleRef, build up refOpenFileHandles
+ * for writing per-ref fasta files, and add new struct refMatch to head of *pRefFiles. */
+{
+if (! hashFindVal(sampleRef, sample))
+    {
+    hashAdd(sampleRef, sample, cloneString(ref));
+    if (! hashFindVal(refOpenFileHandles, ref))
+        {
+        struct tempName tnRefSamples;
+        trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt");
+        hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w"));
+        describeRef(org, ref, dyRefDesc);
+        slAddHead(pRefFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi));
+        }
+    }
+}
+
 static struct refMatch *matchSamplesWithReferences(char *org, char *nextcladeIndex,
                                                    struct lineFile *lf,
                                                    struct slName **retNoMatches, int *pStartTime)
 /* Save lf's content (in memory from CGI post) to a temporary file so we can run nextclade sort
  * to identify the best reference match for each uploaded sequence in lf.
  * For each reference identified as the best match for at least one sequence, write all sequences
  * for that reference to a different temporary file; return a list of pairs of {reference accession,
  * filename that holds the uploaded sequences matched with that reference}. */
 {
 struct refMatch *refFiles = NULL;
 char *uploadedFile = dumpLfToTrashFile(lf);
 reportTiming(pStartTime, "dump uploaded seqs to file");
 
 char *nextcladeOut = runNextcladeSort(uploadedFile, nextcladeIndex);
 reportTiming(pStartTime, "run nextclade sort");
 
 struct lineFile *nlf = lineFileOpen(nextcladeOut, TRUE);
 char *row[5];
 // Check header line... the nextclade guys change output columns frequently
 if (lineFileRow(nlf, row))
     {
-    if (differentString(row[1], "seqName") || differentString(row[2], "dataset"))
+    if (differentString(row[1], "seqName") || differentString(row[2], "dataset") ||
+        differentString(row[3], "score") || differentString(row[4], "numHits"))
         errAbort("nextclade sort output header format has changed");
     }
 else
     errAbort("nextclade sort output is empty");
 // Build up a hash of sample names to the best matching ref (the first one in the nextclade output
 // file; ignore subsequent matches for the same sample), a list of samples that match no ref, and
 // a hash of ref to open file handle to which we will write the sequences that matched that ref.
 struct slName *noMatches = NULL;
 struct hash *sampleRef = hashNew(0);
 struct hash *refOpenFileHandles = hashNew(0);
 struct dyString *dyRefDesc = dyStringNew(0);
+char *prevSample = NULL, *bestRef = NULL;
+double maxAdjustedScore = 0.0;
 while (lineFileRowTab(nlf, row))
     {
     char *sample = row[1];
     char *ref = row[2];
+    double score = atof(row[3]);
+    int numHits = atol(row[4]);
     if (isEmpty(ref))
         slNameAddHead(&noMatches, sample);
-    else if (! hashFindVal(sampleRef, sample))
+    else
         {
-        hashAdd(sampleRef, sample, cloneString(ref));
-        if (! hashFindVal(refOpenFileHandles, ref))
+        double adjustedScore = score * numHits;
+        if (prevSample == NULL || differentString(sample, prevSample))
             {
-            struct tempName tnRefSamples;
-            trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt");
-            hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w"));
-            describeRef(org, ref, dyRefDesc);
-            slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi));
+            if (prevSample != NULL)
+                storeRefMatch(org, prevSample, bestRef, sampleRef, refOpenFileHandles, dyRefDesc,
+                              &refFiles);
+            freez(&bestRef);
+            bestRef = cloneString(ref);
+            maxAdjustedScore = adjustedScore;
+            freez(&prevSample);
+            prevSample = cloneString(sample);
+            }
+        else
+            {
+            if (adjustedScore > maxAdjustedScore)
+                {
+                maxAdjustedScore = adjustedScore;
+                freez(&bestRef);
+                bestRef = cloneString(ref);
+                }
             }
         }
     }
+if (prevSample)
+    storeRefMatch(org, prevSample, bestRef, sampleRef, refOpenFileHandles, dyRefDesc, &refFiles);
 dyStringFree(&dyRefDesc);
 lineFileClose(&nlf);
 // Sort refFiles alphabetically by description
 slSort(&refFiles, refMatchCmp);
 // Now go through the uploaded seqs again to write them out into separate files per ref.
 FILE *f = mustOpen(uploadedFile, "r");
 char *nameLine = NULL;
 struct dnaSeq *seq = NULL;
 while (faReadMixedNext(f, FALSE, "uploaded_seq", TRUE, &nameLine, &seq))
     {
     char *seqName = nameLine;
     if (nameLine[0] == '>')
         seqName = trimSpaces(nameLine+1);
     char *ref = hashFindVal(sampleRef, seqName);
     if (ref)