f22f0b5f1d467e570dad2407813748889408d9bd angie Tue Sep 14 09:38:11 2021 -0700 Don't rely on outputs printed only when usher is compiled with -DDEBUG=1; those are going away in server mode. We already weren't showing the 'Best nodes' output, so just remove that code. We already have sample mutations in seqInfo->sncList, so just reformat that into placementInfo->sampleMuts. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index a00a4c0..d583c05 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -24,31 +24,30 @@ #include "psl.h" #include "ra.h" #include "regexHelper.h" #include "trashDir.h" #include "vcf.h" // Globals: static boolean measureTiming = FALSE; // wuhCor1-specific: char *chrom = "NC_045512v2"; int chromSize = 29903; // Parameter constants: int maxGenotypes = 1000; // Upper limit on number of samples user can upload at once. -boolean showBestNodePaths = FALSE; boolean showParsimonyScore = FALSE; char *phyloPlaceDbSetting(char *db, char *settingName) /* Return a setting from hgPhyloPlaceData/<db>/config.ra or NULL if not found. */ { static struct hash *configHash = NULL; static char *configDb = NULL; if (!sameOk(db, configDb)) { char configFile[1024]; safef(configFile, sizeof configFile, PHYLOPLACE_DATA_DIR "/%s/config.ra", db); if (fileExists(configFile)) { configHash = raReadSingle(configFile); @@ -451,30 +450,59 @@ if (errCatch->gotError) { warn("%s", errCatch->message->string); unlink(tn->forCgi); freez(&tn); } errCatchFree(&errCatch); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) slReverse(&si->sncList); *retSeqInfoList = seqInfoList; *retSampleIds = sampleIds; return tn; } +static void addSampleMutsFromSeqInfo(struct hash *samplePlacements, struct hash *seqInfoHash) +/* We used to get placementInfo->sampleMuts from DEBUG mode usher output in runUsher.c, but + * the DEBUG mode output went away with the change to server-mode usher. Instead, now we fill + * it in from seqInfo->sncList which has the same info. */ +{ +struct hashEl *hel; +struct hashCookie cookie = hashFirst(samplePlacements); +while ((hel = hashNext(&cookie)) != NULL) + { + struct placementInfo *pi = hel->val; + struct seqInfo *si = hashFindVal(seqInfoHash, pi->sampleId); + if (si) + { + struct slName *sampleMuts = NULL; + struct singleNucChange *snc; + for (snc = si->sncList; snc != NULL; snc = snc->next) + { + char mutStr[64]; + safef(mutStr, sizeof mutStr, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); + slNameAddHead(&sampleMuts, mutStr); + } + slReverse(&sampleMuts); + pi->sampleMuts = sampleMuts; + } + else + errAbort("addSampleMutsFromSeqInfo: no seqInfo for placed sequence '%s'", pi->sampleId); + } +} + static void displaySampleMuts(struct placementInfo *info) { printf("<p>Differences from the reference genome " "(<a href='https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2' target=_blank>" "NC_045512.2</a>): "); if (info->sampleMuts == NULL) printf("(None; identical to reference)"); else { struct slName *sln; for (sln = info->sampleMuts; sln != NULL; sln = sln->next) { if (sln != info->sampleMuts) printf(", "); printf("%s", sln->name); @@ -869,65 +897,30 @@ } } static void displayNearestNeighbors(struct placementInfo *info, char *source) /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ { if (isNotEmpty(info->nearestNeighbor)) { printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: %s", source, info->nearestNeighbor); printLineageUrl(info->neighborLineage); puts("</p>"); } } -static void displayBestNodes(struct placementInfo *info, struct hash *sampleMetadata) -/* Show the node(s) most closely related to sample. */ -{ -if (info->bestNodeCount == 1) - printf("<p>The placement in the tree is unambiguous; " - "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n"); -else if (info->bestNodeCount > 1) - printf("<p>This placement is not the only parsimony-optimal placement in the tree; " - "%d other placements exist.</p>\n", info->bestNodeCount - 1); -if (showBestNodePaths && info->bestNodes) - { - if (info->bestNodeCount != slCount(info->bestNodes)) - errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)", - info->bestNodeCount, slCount(info->bestNodes)); - if (info->bestNodeCount > 1) - printf("<ul><li><b>used for placement</b>: "); - if (differentString(info->bestNodes->name, "?") && !isInternalNodeName(info->bestNodes->name, 0)) - printf("%s ", info->bestNodes->name); - printVariantPathNoNodeNames(stdout, info->bestNodes->variantPath); - struct bestNodeInfo *bn; - for (bn = info->bestNodes->next; bn != NULL; bn = bn->next) - { - printf("\n<li>"); - if (differentString(bn->name, "?") && !isInternalNodeName(bn->name, 0)) - printf("%s ", bn->name); - printVariantPathNoNodeNames(stdout, bn->variantPath); - char *lineage = lineageForSample(sampleMetadata, bn->name); - printLineageUrl(lineage); - } - if (info->bestNodeCount > 1) - puts("</ul>"); - puts("</p>"); - } -} - static int placementInfoRefCmpSampleMuts(const void *va, const void *vb) /* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort * to distinguish between different sampleMuts contents arbitrarily; the purpose is to * clump samples with identical lists. */ { struct slRef * const *rra = va; struct slRef * const *rrb = vb; struct placementInfo *pa = (*rra)->val; struct placementInfo *pb = (*rrb)->val; int diff = slCount(pa->sampleMuts) - slCount(pb->sampleMuts); if (diff == 0) { struct slName *slnA, *slnB; for (slnA = pa->sampleMuts, slnB = pb->sampleMuts; slnA != NULL; slnA = slnA->next, slnB = slnB->next) @@ -1085,32 +1078,30 @@ printf("<b>%s</b>\n", info->sampleId); ref = ref->next; } refsToGo = ref; displaySampleMuts(info); if (info->imputedBases) { puts("<p>Base values imputed by parsimony:\n<ul>"); struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) printf("<li>%d: %s\n", bv->chromStart+1, bv->val); puts("</ul>"); puts("</p>"); } displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples"); - displayBestNodes(info, sampleMetadata); - if (!showBestNodePaths) displayNearestNeighbors(info, source); if (showParsimonyScore && info->parsimonyScore > 0) printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore); //#*** TODO: explain parsimony score } } struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds) /* Prune all descendants of node that have no leaf descendants in sampleIds. */ { if (node->numEdges) { struct phyloTree *prunedKids = NULL; int i; for (i = 0; i < node->numEdges; i++) @@ -2180,60 +2171,58 @@ } } static struct hash *hashFromSeqInfoList(struct seqInfo *seqInfoList) /* Hash sequence name to seqInfo for quick lookup. */ { struct hash *hash = hashNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) hashAdd(hash, si->seq->name, si); return hash; } static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, - struct slName *sampleIds, struct seqInfo *seqInfoList, + struct slName *sampleIds, struct hash *seqInfoHash, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges, int *pStartTime) /* Write a tab-separated summary file for download. If the user uploaded enough samples to make * a tree, then write out samples in tree order; otherwise use sampleIds list. * Accumulate S gene changes. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher_samples", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path" "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases" "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations" "\tnearest_neighbor\tneighbor_lineage\tnextstrain_clade\tpango_lineage" "\n"); -struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleTree) { rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } else { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } carefulClose(&f); -hashFree(&seqInfoHash); reportTiming(pStartTime, "write tsv summary"); return tsvTn; } static struct tempName *writeSpikeChangeSummary(struct hash *spikeChanges, int totalSampleCount) /* Write a tab-separated summary of S (Spike) gene changes for download. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher_S_muts", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); fprintf(f, "aa_mutation\tsample_count\tsample_frequency\tsample_ids\tcategory" "\n"); struct aaMutInfo *sChanges[spikeChanges->elCount]; struct hashCookie cookie = hashFirst(spikeChanges); @@ -2682,41 +2671,43 @@ printf("<p>Sorry, could not align any sequences to reference well enough to place in " "the phylogenetic tree.</p>\n"); isFasta = TRUE; } else if (lfLooksLikeVcf(lf)) { vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { subtreesOnly = TRUE; sampleIds = readSampleIds(lf, bigTree, aliasFile); } lineFileClose(&lf); +struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleIds == NULL) { return ctFile; } struct usherResults *results = NULL; if (vcfTn) { fflush(stdout); results = runUsher(usherPath, protobufPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); + addSampleMutsFromSeqInfo(results->samplePlacements, seqInfoHash); } else if (subtreesOnly) { char *matUtilsPath = getMatUtilsPath(TRUE); results = runMatUtilsExtractSubtrees(matUtilsPath, protobufPath, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); } if (results && results->singleSubtreeInfo) { if (retSuccess) *retSuccess = TRUE; puts("<p></p>"); readQcThresholds(db); int subtreeCount = slCount(results->subtreeInfoList); @@ -2769,31 +2760,31 @@ { summarizeSubtrees(sampleIds, results, sampleMetadata, jsonTns, bigTree); reportTiming(&startTime, "describe subtrees"); } else { findNearestNeighbors(results->samplePlacements, sampleMetadata, bigTree); // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree, source, fontHeight, &sampleTree, &startTime); // Make a sample summary TSV file and accumulate S gene changes struct hash *spikeChanges = hashNew(0); - tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList, + tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoHash, geneInfoList, gSeqWin, spikeChanges, &startTime); sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml, zipTn->forHtml); int seqCount = slCount(seqInfoList); if (seqCount <= MAX_SEQ_DETAILS) { summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, refGenome); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1)