eed992e6dfb41dffe0102882d47300c904aaf1de
angie
Mon Apr 29 19:17:50 2024 -0700
When multiple reference sequences are matched, sort them alphabetically by description.
diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 37c94d7..291012d 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -3553,80 +3553,137 @@
char *runNextcladeSort(char *seqFile, char *nextcladeIndex)
/* Run the nextclade sort command on uploaded seqs, return output TSV file name. */
{
char *nextcladePath = getNextcladePath();
struct tempName tnNextcladeOut;
trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_sort", ".tsv");
#define NEXTCLADE_NUM_THREADS "4"
char *cmd[] = { nextcladePath, "sort", seqFile, "-m", nextcladeIndex,
"-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, NULL };
char **cmds[] = { cmd, NULL };
struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0);
pipelineClose(&pl);
return cloneString(tnNextcladeOut.forCgi);
}
-static struct slPair *matchSamplesWithReferences(char *nextcladeIndex, struct lineFile *lf,
+struct refMatch
+/* A reference sequence that has been matched with some uploaded sequence(s) */
+{
+ struct refMatch *next;
+ char *acc; // reference sequence accession
+ char *description; // readable description of reference
+ char *seqFile; // file containing uploaded sequence(s) matched to reference
+};
+
+static struct refMatch *refMatchNew(char *acc, char *description, char *seqFile)
+/* Alloc and return a new refMatch. */
+{
+struct refMatch *refMatch = NULL;
+AllocVar(refMatch);
+refMatch->acc = cloneString(acc);
+refMatch->description = cloneString(description);
+refMatch->seqFile = cloneString(seqFile);
+return refMatch;
+}
+
+static int refMatchCmp(const void *va, const void *vb)
+/* Compare two refMatch descriptions. */
+{
+const struct refMatch *a = *((struct refMatch **)va);
+const struct refMatch *b = *((struct refMatch **)vb);
+return strcmp(a->description, b->description);
+}
+
+static void describeRef(char *org, char *ref, struct dyString *dyRefDesc)
+/* Depending on whether name and description are specified in config, overwrite dyRefDesc with
+ * a description of the reference. */
+{
+char *name = phyloPlaceRefSetting(org, ref, "name");
+char *description = phyloPlaceRefSetting(org, ref, "description");
+dyStringClear(dyRefDesc);
+if (isNotEmpty(description))
+ {
+ if (isNotEmpty(name))
+ dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref);
+ else
+ dyStringPrintf(dyRefDesc, "%s %s", ref, description);
+ }
+else
+ {
+ if (isNotEmpty(name))
+ dyStringPrintf(dyRefDesc, "%s (%s)", name, ref);
+ else
+ dyStringAppend(dyRefDesc, ref);
+ }
+}
+
+static struct refMatch *matchSamplesWithReferences(char *org, char *nextcladeIndex,
+ struct lineFile *lf,
struct slName **retNoMatches, int *pStartTime)
/* Save lf's content (in memory from CGI post) to a temporary file so we can run nextclade sort
* to identify the best reference match for each uploaded sequence in lf.
* For each reference identified as the best match for at least one sequence, write all sequences
* for that reference to a different temporary file; return a list of pairs of {reference accession,
* filename that holds the uploaded sequences matched with that reference}. */
{
-struct slPair *refFiles = NULL;
+struct refMatch *refFiles = NULL;
char *uploadedFile = dumpLfToTrashFile(lf);
reportTiming(pStartTime, "dump uploaded seqs to file");
char *nextcladeOut = runNextcladeSort(uploadedFile, nextcladeIndex);
reportTiming(pStartTime, "run nextclade sort");
struct lineFile *nlf = lineFileOpen(nextcladeOut, TRUE);
char *row[5];
// Check header line... the nextclade guys change output columns frequently
if (lineFileRow(nlf, row))
{
if (differentString(row[1], "seqName") || differentString(row[2], "dataset"))
errAbort("nextclade sort output header format has changed");
}
else
errAbort("nextclade sort output is empty");
// Build up a hash of sample names to the best matching ref (the first one in the nextclade output
// file; ignore subsequent matches for the same sample), a list of samples that match no ref, and
// a hash of ref to open file handle to which we will write the sequences that matched that ref.
struct slName *noMatches = NULL;
struct hash *sampleRef = hashNew(0);
struct hash *refOpenFileHandles = hashNew(0);
+struct dyString *dyRefDesc = dyStringNew(0);
while (lineFileRow(nlf, row))
{
char *sample = row[1];
char *ref = row[2];
if (isEmpty(ref))
slNameAddHead(&noMatches, sample);
else if (! hashFindVal(sampleRef, sample))
{
hashAdd(sampleRef, sample, cloneString(ref));
if (! hashFindVal(refOpenFileHandles, ref))
{
struct tempName tnRefSamples;
trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt");
hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w"));
- slPairAdd(&refFiles, ref, cloneString(tnRefSamples.forCgi));
+ describeRef(org, ref, dyRefDesc);
+ slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi));
}
}
}
+dyStringFree(&dyRefDesc);
lineFileClose(&nlf);
+// Sort refFiles alphabetically by description
+slSort(&refFiles, refMatchCmp);
// Now go through the uploaded seqs again to write them out into separate files per ref.
FILE *f = mustOpen(uploadedFile, "r");
char *nameLine = NULL;
struct dnaSeq *seq = NULL;
while (faReadMixedNext(f, FALSE, "uploaded_seq", TRUE, &nameLine, &seq))
{
char *seqName = nameLine;
if (nameLine[0] == '>')
seqName = trimSpaces(nameLine+1);
char *ref = hashFindVal(sampleRef, seqName);
if (ref)
{
FILE *f = hashFindVal(refOpenFileHandles, ref);
if (f == NULL)
errAbort("matchSamplesWithReferences: can't find file handle for ref '%s' "
@@ -3643,98 +3700,74 @@
{
carefulClose((FILE **)&(hel->val));
}
// Clean up
hashFree(&refOpenFileHandles);
freeHashAndVals(&sampleRef);
unlink(uploadedFile);
unlink(nextcladeOut);
if (retNoMatches)
*retNoMatches = noMatches;
reportTiming(pStartTime, "collated refs and samples");
return refFiles;
}
-static void describeRef(char *org, char *ref, struct dyString *dyRefDesc)
-/* Depending on whether name and description are specified in config, overwrite dyRefDesc with
- * a description of the reference. */
-{
-char *name = phyloPlaceRefSetting(org, ref, "name");
-char *description = phyloPlaceRefSetting(org, ref, "description");
-dyStringClear(dyRefDesc);
-if (isNotEmpty(description))
- {
- if (isNotEmpty(name))
- dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref);
- else
- dyStringPrintf(dyRefDesc, "%s %s", ref, description);
- }
-else
- {
- if (isNotEmpty(name))
- dyStringPrintf(dyRefDesc, "%s (%s)", name, ref);
- else
- dyStringAppend(dyRefDesc, ref);
- }
-}
-
boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf,
boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl,
struct cart *cart)
/* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids:
* If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results,
* make custom track files.
* If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils
* to make subtrees, show subtree results.
* Return TRUE if we were able to get at least some results for the user's input. */
{
boolean success = FALSE;
char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex");
if (isNotEmpty(nextcladeIndex))
{
if (! fileExists(nextcladeIndex))
errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist",
org, nextcladeIndex);
if (! lfLooksLikeFasta(lf))
{
char *name = phyloPlaceOrgSetting(org, "name");
if (isEmpty(name))
name = org;
char *description = emptyForNull(phyloPlaceOrgSetting(org, "description"));
errAbort("Sorry, only fasta input is supported for %s %s",
name, description);
}
struct slName *noMatches = NULL;
int startTime = clock1000();
- struct slPair *refFiles = matchSamplesWithReferences(nextcladeIndex, lf, &noMatches, &startTime);
+ struct refMatch *refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches,
+ &startTime);
lineFileClose(&lf);
if (noMatches != NULL)
{
printf("
No reference was found for the following sequences:\n