eed992e6dfb41dffe0102882d47300c904aaf1de angie Mon Apr 29 19:17:50 2024 -0700 When multiple reference sequences are matched, sort them alphabetically by description. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 37c94d7..291012d 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -3553,80 +3553,137 @@ char *runNextcladeSort(char *seqFile, char *nextcladeIndex) /* Run the nextclade sort command on uploaded seqs, return output TSV file name. */ { char *nextcladePath = getNextcladePath(); struct tempName tnNextcladeOut; trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_sort", ".tsv"); #define NEXTCLADE_NUM_THREADS "4" char *cmd[] = { nextcladePath, "sort", seqFile, "-m", nextcladeIndex, "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, NULL }; char **cmds[] = { cmd, NULL }; struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0); pipelineClose(&pl); return cloneString(tnNextcladeOut.forCgi); } -static struct slPair *matchSamplesWithReferences(char *nextcladeIndex, struct lineFile *lf, +struct refMatch +/* A reference sequence that has been matched with some uploaded sequence(s) */ +{ + struct refMatch *next; + char *acc; // reference sequence accession + char *description; // readable description of reference + char *seqFile; // file containing uploaded sequence(s) matched to reference +}; + +static struct refMatch *refMatchNew(char *acc, char *description, char *seqFile) +/* Alloc and return a new refMatch. */ +{ +struct refMatch *refMatch = NULL; +AllocVar(refMatch); +refMatch->acc = cloneString(acc); +refMatch->description = cloneString(description); +refMatch->seqFile = cloneString(seqFile); +return refMatch; +} + +static int refMatchCmp(const void *va, const void *vb) +/* Compare two refMatch descriptions. */ +{ +const struct refMatch *a = *((struct refMatch **)va); +const struct refMatch *b = *((struct refMatch **)vb); +return strcmp(a->description, b->description); +} + +static void describeRef(char *org, char *ref, struct dyString *dyRefDesc) +/* Depending on whether name and description are specified in config, overwrite dyRefDesc with + * a description of the reference. */ +{ +char *name = phyloPlaceRefSetting(org, ref, "name"); +char *description = phyloPlaceRefSetting(org, ref, "description"); +dyStringClear(dyRefDesc); +if (isNotEmpty(description)) + { + if (isNotEmpty(name)) + dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref); + else + dyStringPrintf(dyRefDesc, "%s %s", ref, description); + } +else + { + if (isNotEmpty(name)) + dyStringPrintf(dyRefDesc, "%s (%s)", name, ref); + else + dyStringAppend(dyRefDesc, ref); + } +} + +static struct refMatch *matchSamplesWithReferences(char *org, char *nextcladeIndex, + struct lineFile *lf, struct slName **retNoMatches, int *pStartTime) /* Save lf's content (in memory from CGI post) to a temporary file so we can run nextclade sort * to identify the best reference match for each uploaded sequence in lf. * For each reference identified as the best match for at least one sequence, write all sequences * for that reference to a different temporary file; return a list of pairs of {reference accession, * filename that holds the uploaded sequences matched with that reference}. */ { -struct slPair *refFiles = NULL; +struct refMatch *refFiles = NULL; char *uploadedFile = dumpLfToTrashFile(lf); reportTiming(pStartTime, "dump uploaded seqs to file"); char *nextcladeOut = runNextcladeSort(uploadedFile, nextcladeIndex); reportTiming(pStartTime, "run nextclade sort"); struct lineFile *nlf = lineFileOpen(nextcladeOut, TRUE); char *row[5]; // Check header line... the nextclade guys change output columns frequently if (lineFileRow(nlf, row)) { if (differentString(row[1], "seqName") || differentString(row[2], "dataset")) errAbort("nextclade sort output header format has changed"); } else errAbort("nextclade sort output is empty"); // Build up a hash of sample names to the best matching ref (the first one in the nextclade output // file; ignore subsequent matches for the same sample), a list of samples that match no ref, and // a hash of ref to open file handle to which we will write the sequences that matched that ref. struct slName *noMatches = NULL; struct hash *sampleRef = hashNew(0); struct hash *refOpenFileHandles = hashNew(0); +struct dyString *dyRefDesc = dyStringNew(0); while (lineFileRow(nlf, row)) { char *sample = row[1]; char *ref = row[2]; if (isEmpty(ref)) slNameAddHead(&noMatches, sample); else if (! hashFindVal(sampleRef, sample)) { hashAdd(sampleRef, sample, cloneString(ref)); if (! hashFindVal(refOpenFileHandles, ref)) { struct tempName tnRefSamples; trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt"); hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w")); - slPairAdd(&refFiles, ref, cloneString(tnRefSamples.forCgi)); + describeRef(org, ref, dyRefDesc); + slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi)); } } } +dyStringFree(&dyRefDesc); lineFileClose(&nlf); +// Sort refFiles alphabetically by description +slSort(&refFiles, refMatchCmp); // Now go through the uploaded seqs again to write them out into separate files per ref. FILE *f = mustOpen(uploadedFile, "r"); char *nameLine = NULL; struct dnaSeq *seq = NULL; while (faReadMixedNext(f, FALSE, "uploaded_seq", TRUE, &nameLine, &seq)) { char *seqName = nameLine; if (nameLine[0] == '>') seqName = trimSpaces(nameLine+1); char *ref = hashFindVal(sampleRef, seqName); if (ref) { FILE *f = hashFindVal(refOpenFileHandles, ref); if (f == NULL) errAbort("matchSamplesWithReferences: can't find file handle for ref '%s' " @@ -3643,98 +3700,74 @@ { carefulClose((FILE **)&(hel->val)); } // Clean up hashFree(&refOpenFileHandles); freeHashAndVals(&sampleRef); unlink(uploadedFile); unlink(nextcladeOut); if (retNoMatches) *retNoMatches = noMatches; reportTiming(pStartTime, "collated refs and samples"); return refFiles; } -static void describeRef(char *org, char *ref, struct dyString *dyRefDesc) -/* Depending on whether name and description are specified in config, overwrite dyRefDesc with - * a description of the reference. */ -{ -char *name = phyloPlaceRefSetting(org, ref, "name"); -char *description = phyloPlaceRefSetting(org, ref, "description"); -dyStringClear(dyRefDesc); -if (isNotEmpty(description)) - { - if (isNotEmpty(name)) - dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref); - else - dyStringPrintf(dyRefDesc, "%s %s", ref, description); - } -else - { - if (isNotEmpty(name)) - dyStringPrintf(dyRefDesc, "%s (%s)", name, ref); - else - dyStringAppend(dyRefDesc, ref); - } -} - boolean phyloPlaceSamples(struct lineFile *lf, char *db, char *org, char *defaultProtobuf, boolean doMeasureTiming, int subtreeSize, struct trackLayout *tl, struct cart *cart) /* Given a lineFile that contains either FASTA, VCF, or a list of sequence names/ids: * If FASTA/VCF, then prepare VCF for usher; if that goes well then run usher, report results, * make custom track files. * If list of seq names/ids, then attempt to find their full names in the protobuf, run matUtils * to make subtrees, show subtree results. * Return TRUE if we were able to get at least some results for the user's input. */ { boolean success = FALSE; char *nextcladeIndex = phyloPlaceOrgSettingPath(org, "nextcladeIndex"); if (isNotEmpty(nextcladeIndex)) { if (! fileExists(nextcladeIndex)) errAbort("config for '%s' specifies nextcladeIndex file '%s' but it does not exist", org, nextcladeIndex); if (! lfLooksLikeFasta(lf)) { char *name = phyloPlaceOrgSetting(org, "name"); if (isEmpty(name)) name = org; char *description = emptyForNull(phyloPlaceOrgSetting(org, "description")); errAbort("Sorry, only fasta input is supported for %s %s", name, description); } struct slName *noMatches = NULL; int startTime = clock1000(); - struct slPair *refFiles = matchSamplesWithReferences(nextcladeIndex, lf, &noMatches, &startTime); + struct refMatch *refFiles = matchSamplesWithReferences(org, nextcladeIndex, lf, &noMatches, + &startTime); lineFileClose(&lf); if (noMatches != NULL) { printf("
No reference was found for the following sequences:\n"); } - struct dyString *dyRefDesc = dyStringNew(0); - struct slPair *ref; + struct refMatch *ref; for (ref = refFiles; ref != NULL; ref = ref->next) { if (ref != refFiles) puts("
"); - describeRef(org, ref->name, dyRefDesc); - printf("

Sequence(s) matched reference %s

\n", dyRefDesc->string); - struct lineFile *rlf = lineFileOpen(ref->val, TRUE); - phyloPlaceSamplesOneRef(rlf, db, org, ref->name, defaultProtobuf, + printf("

Sequence(s) matched reference %s

\n", ref->description); + struct lineFile *rlf = lineFileOpen(ref->seqFile, TRUE); + phyloPlaceSamplesOneRef(rlf, db, org, ref->acc, defaultProtobuf, doMeasureTiming, subtreeSize, tl, cart, &success); } puts("
"); } else { // No nextcladeIndex means this is the old-style single-reference setup (i.e. SARS-CoV-2). phyloPlaceSamplesOneRef(lf, db, org, org, defaultProtobuf, doMeasureTiming, subtreeSize, tl, cart, &success); } return success; }