cbf33d2dcdd87c41ee8e1afe0ff738c62422bf77 angie Tue Mar 30 16:01:54 2021 -0700 Only search for nearest neighbor and neighboring lineage once; then use it in summary table, details output, and now also TSV output. Also set parBase in sncList for TSV output (oops). diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 0583fc3..3b61a80 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -520,90 +520,102 @@ while (variantPath->next && isInternalNodeName(variantPath->next->nodeName, minNewNode)) variantPath = variantPath->next; if (variantPath && isInternalNodeName(variantPath->nodeName, minNewNode)) return variantPath; return NULL; } static int mutCountCmp(const void *a, const void *b) /* Compare number of mutations of phyloTree nodes a and b. */ { const struct phyloTree *nodeA = *(struct phyloTree * const *)a; const struct phyloTree *nodeB = *(struct phyloTree * const *)b; return slCount(nodeA->priv) - slCount(nodeB->priv); } -static struct slName *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId, - struct variantPathNode *variantPath) -/* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree, - * then look for most similar leaf sibling(s). */ -{ -struct slName *neighbors = NULL; -int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree); -int minNewNode = bigTreeINodeCount + 1; // 1-based -struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode); -struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) : - bigTree->tree; -if (lastOldNode && !node) - errAbort("Can't find last internal node for sample %s", sampleId); -// Look for a leaf kid with no mutations relative to the parent, should be closest. -if (node->numEdges == 0) - { - struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name); - if (nodeList) - slNameAddHead(&neighbors, nodeList->name); - else - slNameAddHead(&neighbors, node->ident->name); - } -else +static char *leafWithLeastMuts(struct phyloTree *node, struct mutationAnnotatedTree *bigTree) +/* If node has a leaf child with no mutations of its own, return the name of that leaf. + * Otherwise, if node has leaf children, return the name of the leaf with the fewest mutations. + * Otherwise return NULL. */ { +char *leafName = NULL; int leafCount = 0; int i; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; if (kid->numEdges == 0) { leafCount++; struct singleNucChange *kidMuts = kid->priv; if (!kidMuts) { struct slName *nodeList = hashFindVal(bigTree->condensedNodes, kid->ident->name); if (nodeList) - slNameAddHead(&neighbors, nodeList->name); + leafName = cloneString(nodeList->name); else - slNameAddHead(&neighbors, kid->ident->name); + leafName = cloneString(kid->ident->name); break; } } } - if (neighbors == NULL && leafCount) +if (leafName == NULL && leafCount) { // Pick the leaf with the fewest mutations. struct phyloTree *leafKids[leafCount]; int leafIx = 0; for (i = 0; i < node->numEdges; i++) { struct phyloTree *kid = node->edges[i]; if (kid->numEdges == 0) leafKids[leafIx++] = kid; } qsort(leafKids, leafCount, sizeof(leafKids[0]), mutCountCmp); - neighbors = slNameNew(leafKids[0]->ident->name); + leafName = cloneString(leafKids[0]->ident->name); + } +return leafName; +} + +static char *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId, + struct variantPathNode *variantPath) +/* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree, + * then look for most similar leaf sibling(s). */ +{ +char *nearestNeighbor = NULL; +int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree); +int minNewNode = bigTreeINodeCount + 1; // 1-based +struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode); +struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) : + bigTree->tree; +if (lastOldNode && !node) + errAbort("Can't find last internal node for sample %s", sampleId); +// Look for a leaf kid with no mutations relative to the parent, should be closest. +if (node->numEdges == 0) + { + struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name); + if (nodeList) + nearestNeighbor = cloneString(nodeList->name); + else + nearestNeighbor = cloneString(node->ident->name); } +else + { + nearestNeighbor = leafWithLeastMuts(node, bigTree); + if (nearestNeighbor == NULL && node->parent != NULL) + nearestNeighbor = leafWithLeastMuts(node->parent, bigTree); } -return neighbors; +return nearestNeighbor; } static void printVariantPathNoNodeNames(FILE *f, struct variantPathNode *variantPath) /* Print out variant path with no node names (even if non-numeric) to f. */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) fprintf(f, " > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) fprintf(f, ", "); @@ -784,48 +796,57 @@ met = hashFindVal(sampleMetadata, eg+strlen("_eg_")); } return met; } static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ { char *lineage = NULL; struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); if (met) lineage = met->lineage; return lineage; } -static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, char *source, - struct placementInfo *info, struct hash *sampleMetadata) -/* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ +static void findNearestNeighbors(struct hash *samplePlacements, struct hash *sampleMetadata, + struct mutationAnnotatedTree *bigTree) +/* For each placed sample, find the nearest neighbor in the bigTree and its assigned lineage, + * and fill in those two fields of placementInfo. */ { -if (bigTree) - { - printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: ", source); - struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); - struct slName *neighbor; - for (neighbor = neighbors; neighbor != NULL; neighbor = neighbor->next) +if (!bigTree) + return; +struct hashCookie cookie = hashFirst(samplePlacements); +struct hashEl *hel; +while ((hel = hashNext(&cookie)) != NULL) { - if (neighbor != neighbors) - printf(", "); - printf("%s", neighbor->name); - char *lineage = lineageForSample(sampleMetadata, neighbor->name); - if (isNotEmpty(lineage)) - printf(": lineage %s", lineage); + struct placementInfo *info = hel->val; + info->nearestNeighbor = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); + if (isNotEmpty(info->nearestNeighbor)) + info->neighborLineage = lineageForSample(sampleMetadata, info->nearestNeighbor); + } } + +static void displayNearestNeighbors(struct placementInfo *info, char *source) +/* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ +{ +if (isNotEmpty(info->nearestNeighbor)) + { + printf("<p>Nearest neighboring %s sequence already in phylogenetic tree: %s", + source, info->nearestNeighbor); + if (isNotEmpty(info->neighborLineage)) + printf(": lineage %s", info->neighborLineage); puts("</p>"); } } static void displayBestNodes(struct placementInfo *info, struct hash *sampleMetadata) /* Show the node(s) most closely related to sample. */ { if (info->bestNodeCount == 1) printf("<p>The placement in the tree is unambiguous; " "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n"); else if (info->bestNodeCount > 1) printf("<p>This placement is not the only parsimony-optimal placement in the tree; " "%d other placements exist.</p>\n", info->bestNodeCount - 1); if (showBestNodePaths && info->bestNodes) { @@ -927,31 +948,31 @@ else safecpy(indentForKids+indentLen - 4, 4 + 1, "| "); } if (node->numEdges > 0) { char kidIndent[strlen(indent)+5]; safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---"); int i; for (i = 0; i < node->numEdges; i++) asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1)); } } static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements, struct phyloTree *subtree, struct hash *sampleMetadata, - struct mutationAnnotatedTree *bigTree, char *source) + char *source) /* Report how each sample fits into the big tree. */ { // Sort sample placements by sampleMuts so we can group identical samples. struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements); slSort(&placementRefs, placementInfoRefCmpSampleMuts); int relatedCount = slCount(placementRefs); int clumpSize = countIdentical(placementRefs); if (clumpSize < relatedCount && relatedCount > 2) { // Not all of the related sequences are identical, so they will be broken down into // separate "clumps". List all related samples first to avoid confusion. puts("<pre>"); asciiTree(subtree, "", TRUE); puts("</pre>"); } @@ -984,31 +1005,31 @@ } refsToGo = ref; displaySampleMuts(info); if (info->imputedBases) { puts("<p>Base values imputed by parsimony:\n<ul>"); struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) printf("<li>%d: %s\n", bv->chromStart+1, bv->val); puts("</ul>"); puts("</p>"); } displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples"); displayBestNodes(info, sampleMetadata); if (!showBestNodePaths) - displayNearestNeighbors(bigTree, source, info, sampleMetadata); + displayNearestNeighbors(info, source); if (showParsimonyScore && info->parsimonyScore > 0) printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore); //#*** TODO: explain parsimony score } } struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds) /* Prune all descendants of node that have no leaf descendants in sampleIds. */ { if (node->numEdges) { struct phyloTree *prunedKids = NULL; int i; for (i = 0; i < node->numEdges; i++) { @@ -1393,32 +1414,31 @@ static void appendExcludingNs(struct dyString *dy, struct seqInfo *si) /* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */ { dyStringAppend(dy, "excluding "); if (si->nCountStart) dyStringPrintf(dy, "%d N bases at start", si->nCountStart); if (si->nCountStart && si->nCountEnd) dyStringAppend(dy, " and "); if (si->nCountEnd) dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, struct usherResults *ur, struct tempName *jsonTns[], - struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree, - struct dnaSeq *refGenome) + struct hash *sampleMetadata, struct dnaSeq *refGenome) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts("<table class='seqSummary'>"); boolean gotClades = FALSE, gotLineages = FALSE; lookForCladesAndLineages(seqInfoList, ur->samplePlacements, &gotClades, &gotLineages); printSummaryHeader(isFasta, gotClades, gotLineages); puts("<tbody>"); struct dyString *dy = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts("<tr>"); printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | ")); @@ -1535,35 +1555,33 @@ replaceChar(reason->name, '_', ' '); dyStringPrintf(dy, ", %s", reason->name); } dyStringAppendC(dy, ')'); } printTooltip(dy->string); } printf("</td>"); struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name); if (pi) { if (gotClades) printf("<td>%s</td>", pi->nextClade ? pi->nextClade : "n/a"); if (gotLineages) printf("<td>%s</td>", pi->pangoLineage ? pi->pangoLineage : "n/a"); - struct slName *neighbor = findNearestNeighbor(bigTree, pi->sampleId, pi->variantPath); - char *lineage = neighbor ? lineageForSample(sampleMetadata, neighbor->name) : "?"; printf("<td>%s</td><td>%s</td>", - neighbor ? replaceChars(neighbor->name, "|", " | ") : "?", - lineage ? lineage : "?"); + pi->nearestNeighbor ? replaceChars(pi->nearestNeighbor, "|", " | ") : "?", + pi->neighborLineage ? pi->neighborLineage : "?"); int imputedCount = slCount(pi->imputedBases); printf("<td class='%s'>%d", qcClassForImputedBases(imputedCount), imputedCount); if (imputedCount > 0) { dyStringClear(dy); struct baseVal *bv; for (bv = pi->imputedBases; bv != NULL; bv = bv->next) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val); } printTooltip(dy->string); } printf("</td><td class='%s'>%d", @@ -1616,31 +1634,31 @@ errAbort("sncListFromSampleMuts: expected ref base value, got '%c' in '%s'", ref, mut->name); int pos = atoi(&(mut->name[1])); if (pos < 1 || pos > chromSize) errAbort("sncListFromSampleMuts: expected pos between 1 and %d, got %d in '%s'", chromSize, pos, mut->name); char alt = mut->name[strlen(mut->name)-1]; if (alt < 'A' || alt > 'Z') errAbort("sncListFromSampleMuts: expected alt base value, got '%c' in '%s'", alt, mut->name); if (isIupacAmbiguous(alt)) continue; struct singleNucChange *snc; AllocVar(snc); snc->chromStart = pos-1; - snc->refBase = ref; + snc->refBase = snc->parBase = ref; snc->newBase = alt; slAddHead(&sncList, snc); } struct baseVal *bv; for (bv = imputedBases; bv != NULL; bv = bv->next) { struct singleNucChange *snc; AllocVar(snc); snc->chromStart = bv->chromStart; snc->refBase = '?'; snc->newBase = bv->val[0]; slAddHead(&sncList, snc); } slReverse(&sncList); return sncList; @@ -1886,30 +1904,32 @@ // SNVs that were masked (Problematic Sites track), not used in placement fputc('\t', f); struct singleNucChange *snc; for (snc = si->maskedSncList; snc != NULL; snc = snc->next) { if (snc != si->maskedSncList) fputc(',', f); fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); } } else { warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId); fprintf(f, "\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a"); } + fprintf(f, "\t%s", isNotEmpty(info->nearestNeighbor) ? info->nearestNeighbor : "n/a"); + fprintf(f, "\t%s", isNotEmpty(info->neighborLineage) ? info->neighborLineage : "n/a"); fputc('\n', f); } } static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results, struct hash *seqInfoHash, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges) /* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line * of TSV summary for each one. */ { if (node->numEdges) { int i; for (i = 0; i < node->numEdges; i++) rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin, @@ -1935,30 +1955,31 @@ static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, struct slName *sampleIds, struct seqInfo *seqInfoList, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges, int *pStartTime) /* Write a tab-separated summary file for download. If the user uploaded enough samples to make * a tree, then write out samples in tree order; otherwise use sampleIds list. * Accumulate S gene changes. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher_samples", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path" "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases" "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations" + "\tnearest_neighbor\tneighbor_lineage" "\n"); struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleTree) { rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } else { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } carefulClose(&f); hashFree(&seqInfoHash); @@ -2169,33 +2190,30 @@ vcfTn = checkAndSaveVcf(lf, refGenome, maskSites, &seqInfoList, &sampleIds); reportTiming(&startTime, "check uploaded VCF"); } else { if (isNotEmpty(lf->fileName)) warn("Sorry, can't recognize your file %s as FASTA or VCF.\n", lf->fileName); else warn("Sorry, can't recognize your uploaded data as FASTA or VCF.\n"); } lineFileClose(&lf); if (vcfTn) { fflush(stdout); int seqCount = slCount(seqInfoList); - // Don't make smaller subtrees when a large number of sequences are uploaded. - if (seqCount > MAX_SEQ_DETAILS) - subtreeSize = 0; struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); if (results->singleSubtreeInfo) { puts("<p></p>"); readQcThresholds(db); int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); struct geneInfo *geneInfoList = getGeneInfoList(bigGenePredFile, refGenome); struct seqWindow *gSeqWin = chromSeqWindowNew(db, chrom, 0, chromSize); struct hash *sampleMetadata = getSampleMetadata(metadataFile); @@ -2212,81 +2230,82 @@ treeToAuspiceJson(ti, db, geneInfoList, gSeqWin, sampleMetadata, NULL, jsonTns[ix]->forCgi, source); // Add a link for every sample to this subtree, so the single-subtree JSON can // link to subtree JSONs char *subtreeUrl = nextstrainUrlFromTn(jsonTns[ix]); struct slName *sample; for (sample = ti->subtreeUserSampleIds; sample != NULL; sample = sample->next) hashAdd(sampleUrls, sample->name, subtreeUrl); } struct tempName *singleSubtreeJsonTn; AllocVar(singleSubtreeJsonTn); trashDirFile(singleSubtreeJsonTn, "ct", "singleSubtreeAuspice", ".json"); treeToAuspiceJson(results->singleSubtreeInfo, db, geneInfoList, gSeqWin, sampleMetadata, sampleUrls, singleSubtreeJsonTn->forCgi, source); struct subtreeInfo *subtreeInfoForButtons = results->subtreeInfoList; - if (seqCount > MAX_SEQ_DETAILS || subtreeCount > MAX_SUBTREE_BUTTONS) + if (subtreeCount > MAX_SUBTREE_BUTTONS) subtreeInfoForButtons = NULL; makeButtonRow(singleSubtreeJsonTn, jsonTns, subtreeInfoForButtons, subtreeSize, isFasta); printf("<p>If you have metadata you wish to display, click a 'view subtree in " "Nextstrain' button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); + findNearestNeighbors(results->samplePlacements, sampleMetadata, bigTree); + // Make custom tracks for uploaded samples and subtree(s). struct phyloTree *sampleTree = NULL; struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, source, fontHeight, &sampleTree, &startTime); // Make a sample summary TSV file and accumulate S gene changes struct hash *spikeChanges = hashNew(0); struct tempName *tsvTn = writeTsvSummary(results, sampleTree, sampleIds, seqInfoList, geneInfoList, gSeqWin, spikeChanges, &startTime); struct tempName *sTsvTn = writeSpikeChangeSummary(spikeChanges, slCount(sampleIds)); struct tempName *zipTn = makeSubtreeZipFile(results, jsonTns, singleSubtreeJsonTn, &startTime); downloadsRow(results->bigTreePlusTn->forHtml, tsvTn->forHtml, sTsvTn->forHtml, zipTn->forHtml); if (seqCount <= MAX_SEQ_DETAILS) { - summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree, - refGenome); + summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, refGenome); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); makeNextstrainButtonN("viewNextstrainSub", ix, subtreeUserSampleCount, subtreeSize, jsonTns); puts("<br>"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, - subtree, sampleMetadata, bigTree, source); + subtree, sampleMetadata, source); } reportTiming(&startTime, "describe placements"); } else - printf("<p>(Skipping details and subtrees; " - "you uploaded %d sequences, and details/subtrees are shown only when " + printf("<p>(Skipping details; " + "you uploaded %d sequences, and details are shown only when " "you upload at most %d sequences.)</p>\n", seqCount, MAX_SEQ_DETAILS); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); printf("<li><a href='%s' download>TSV summary of sequences and placements</a>\n", tsvTn->forHtml); printf("<li><a href='%s' download>TSV summary of S (Spike) gene changes</a>\n", sTsvTn->forHtml); printf("<li><a href='%s' download>ZIP archive of subtree Newick and JSON files</a>\n", zipTn->forHtml); // For now, leave in the individual links so I don't break anybody's pipeline that's