466d876c3308a789b3abd3e2ae559d1413ea5fc5 angie Thu Sep 12 10:01:15 2024 -0700 nextclade sort's scoring function, for short similar reference sequences, gives too much weight to reference sequence length and not enough to numHits. So, instead of using nextclade sort's top-scoring match for each input sequence, run nextclade sort --all-matches and choose the ref with the highest (score * numHits). This does a better job when distinguishing between 7 RefSeq assemblies for influenza A segments. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index d00d9d4..589725b 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -3644,31 +3644,31 @@ fputs(line, f); fputc('\n', f); } carefulClose(&f); return cloneString(tmp.forCgi); } char *runNextcladeSort(char *seqFile, char *nextcladeIndex) /* Run the nextclade sort command on uploaded seqs, return output TSV file name. */ { char *nextcladePath = getNextcladePath(); struct tempName tnNextcladeOut; trashDirFile(&tnNextcladeOut, "ct", "usher_nextclade_sort", ".tsv"); #define NEXTCLADE_NUM_THREADS "4" char *cmd[] = { nextcladePath, "sort", seqFile, "-m", nextcladeIndex, - "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, NULL }; + "-j", NEXTCLADE_NUM_THREADS, "-r", tnNextcladeOut.forCgi, "--all-matches", NULL }; char **cmds[] = { cmd, NULL }; struct pipeline *pl = pipelineOpen(cmds, pipelineRead, NULL, NULL, 0); pipelineClose(&pl); return cloneString(tnNextcladeOut.forCgi); } struct refMatch /* A reference sequence that has been matched with some uploaded sequence(s) */ { struct refMatch *next; char *acc; // reference sequence accession char *description; // readable description of reference char *seqFile; // file containing uploaded sequence(s) matched to reference }; @@ -3702,82 +3702,121 @@ { if (isNotEmpty(name)) dyStringPrintf(dyRefDesc, "%s %s (%s)", name, description, ref); else dyStringPrintf(dyRefDesc, "%s %s", ref, description); } else { if (isNotEmpty(name)) dyStringPrintf(dyRefDesc, "%s (%s)", name, ref); else dyStringAppend(dyRefDesc, ref); } } +static void storeRefMatch(char *org, char *sample, char *ref, struct hash *sampleRef, + struct hash *refOpenFileHandles, struct dyString *dyRefDesc, + struct refMatch **pRefFiles) +/* sample has been matched to ref -- save the mapping in sampleRef, build up refOpenFileHandles + * for writing per-ref fasta files, and add new struct refMatch to head of *pRefFiles. */ +{ +if (! hashFindVal(sampleRef, sample)) + { + hashAdd(sampleRef, sample, cloneString(ref)); + if (! hashFindVal(refOpenFileHandles, ref)) + { + struct tempName tnRefSamples; + trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt"); + hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w")); + describeRef(org, ref, dyRefDesc); + slAddHead(pRefFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi)); + } + } +} + static struct refMatch *matchSamplesWithReferences(char *org, char *nextcladeIndex, struct lineFile *lf, struct slName **retNoMatches, int *pStartTime) /* Save lf's content (in memory from CGI post) to a temporary file so we can run nextclade sort * to identify the best reference match for each uploaded sequence in lf. * For each reference identified as the best match for at least one sequence, write all sequences * for that reference to a different temporary file; return a list of pairs of {reference accession, * filename that holds the uploaded sequences matched with that reference}. */ { struct refMatch *refFiles = NULL; char *uploadedFile = dumpLfToTrashFile(lf); reportTiming(pStartTime, "dump uploaded seqs to file"); char *nextcladeOut = runNextcladeSort(uploadedFile, nextcladeIndex); reportTiming(pStartTime, "run nextclade sort"); struct lineFile *nlf = lineFileOpen(nextcladeOut, TRUE); char *row[5]; // Check header line... the nextclade guys change output columns frequently if (lineFileRow(nlf, row)) { - if (differentString(row[1], "seqName") || differentString(row[2], "dataset")) + if (differentString(row[1], "seqName") || differentString(row[2], "dataset") || + differentString(row[3], "score") || differentString(row[4], "numHits")) errAbort("nextclade sort output header format has changed"); } else errAbort("nextclade sort output is empty"); // Build up a hash of sample names to the best matching ref (the first one in the nextclade output // file; ignore subsequent matches for the same sample), a list of samples that match no ref, and // a hash of ref to open file handle to which we will write the sequences that matched that ref. struct slName *noMatches = NULL; struct hash *sampleRef = hashNew(0); struct hash *refOpenFileHandles = hashNew(0); struct dyString *dyRefDesc = dyStringNew(0); +char *prevSample = NULL, *bestRef = NULL; +double maxAdjustedScore = 0.0; while (lineFileRowTab(nlf, row)) { char *sample = row[1]; char *ref = row[2]; + double score = atof(row[3]); + int numHits = atol(row[4]); if (isEmpty(ref)) slNameAddHead(&noMatches, sample); - else if (! hashFindVal(sampleRef, sample)) + else { - hashAdd(sampleRef, sample, cloneString(ref)); - if (! hashFindVal(refOpenFileHandles, ref)) + double adjustedScore = score * numHits; + if (prevSample == NULL || differentString(sample, prevSample)) { - struct tempName tnRefSamples; - trashDirFile(&tnRefSamples, "ct", "usher_ref_samples", ".txt"); - hashAdd(refOpenFileHandles, ref, mustOpen(tnRefSamples.forCgi, "w")); - describeRef(org, ref, dyRefDesc); - slAddHead(&refFiles, refMatchNew(ref, dyRefDesc->string, tnRefSamples.forCgi)); + if (prevSample != NULL) + storeRefMatch(org, prevSample, bestRef, sampleRef, refOpenFileHandles, dyRefDesc, + &refFiles); + freez(&bestRef); + bestRef = cloneString(ref); + maxAdjustedScore = adjustedScore; + freez(&prevSample); + prevSample = cloneString(sample); + } + else + { + if (adjustedScore > maxAdjustedScore) + { + maxAdjustedScore = adjustedScore; + freez(&bestRef); + bestRef = cloneString(ref); + } } } } +if (prevSample) + storeRefMatch(org, prevSample, bestRef, sampleRef, refOpenFileHandles, dyRefDesc, &refFiles); dyStringFree(&dyRefDesc); lineFileClose(&nlf); // Sort refFiles alphabetically by description slSort(&refFiles, refMatchCmp); // Now go through the uploaded seqs again to write them out into separate files per ref. FILE *f = mustOpen(uploadedFile, "r"); char *nameLine = NULL; struct dnaSeq *seq = NULL; while (faReadMixedNext(f, FALSE, "uploaded_seq", TRUE, &nameLine, &seq)) { char *seqName = nameLine; if (nameLine[0] == '>') seqName = trimSpaces(nameLine+1); char *ref = hashFindVal(sampleRef, seqName); if (ref)