68f95eec388eb3895787fc885fa01fe7ee3447ac angie Thu Aug 26 11:36:26 2021 -0700 Recently usher started prepending node_ to numeric internal node names stored in the protobuf, so node names in usher outputs are now node_<number> instead of just <number>. However, the protobuf still has numeric names, so it may be necessary to strip the prefix when looking up bigTree nodes. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 2308f95..abe222f 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -470,73 +470,75 @@ if (info->sampleMuts == NULL) printf("(None; identical to reference)"); else { struct slName *sln; for (sln = info->sampleMuts; sln != NULL; sln = sln->next) { if (sln != info->sampleMuts) printf(", "); printf("%s", sln->name); } } puts("</p>"); } +boolean isInternalNodeName(char *nodeName, int minNewNode) +/* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric + * or <USHER_NODE_PREFIX>_<number> and, if minNewNode > 0, number is less than minNewNode. */ +{ +if (startsWith(USHER_NODE_PREFIX, nodeName)) + nodeName += strlen(USHER_NODE_PREFIX); +return isAllDigits(nodeName) && (minNewNode <= 0 || (atoi(nodeName) < minNewNode)); +} + static void variantPathPrint(struct variantPathNode *variantPath) /* Print out a variantPath; print nodeName only if non-numeric * (i.e. a sample ID not internal node) */ { struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) printf(" > "); - if (!isAllDigits(vpn->nodeName)) + if (!isInternalNodeName(vpn->nodeName, 0)) printf("%s: ", vpn->nodeName); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) printf(", "); printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } static void displayVariantPath(struct variantPathNode *variantPath, char *sampleId) /* Display mutations on the path to this sample. */ { printf("<p>Mutations along the path from the root of the phylogenetic tree to %s:<br>", sampleId); if (variantPath) { variantPathPrint(variantPath); puts("<br>"); } else puts("(None; your sample was placed at the root of the phylogenetic tree)"); puts("</p>"); } -static boolean isInternalNodeName(char *nodeName, int minNewNode) -/* Return TRUE if nodeName looks like an internal node ID from the protobuf tree, i.e. is numeric - * and less than minNewNode. */ -{ -return isAllDigits(nodeName) && (atoi(nodeName) < minNewNode); -} - static struct variantPathNode *findLastInternalNode(struct variantPathNode *variantPath, int minNewNode) /* Return the last node in variantPath with a numeric name less than minNewNode, or NULL. */ { if (!variantPath || !isInternalNodeName(variantPath->nodeName, minNewNode)) return NULL; while (variantPath->next && isInternalNodeName(variantPath->next->nodeName, minNewNode)) variantPath = variantPath->next; if (variantPath && isInternalNodeName(variantPath->nodeName, minNewNode)) return variantPath; return NULL; } static int mutCountCmp(const void *a, const void *b) /* Compare number of mutations of phyloTree nodes a and b. */ @@ -589,31 +591,37 @@ return leafName; } static char *findNearestNeighbor(struct mutationAnnotatedTree *bigTree, char *sampleId, struct variantPathNode *variantPath) /* Use the sequence of mutations in variantPath to find sampleId's parent node in bigTree, * then look for most similar leaf sibling(s). */ { char *nearestNeighbor = NULL; int bigTreeINodeCount = phyloCountInternalNodes(bigTree->tree); int minNewNode = bigTreeINodeCount + 1; // 1-based struct variantPathNode *lastOldNode = findLastInternalNode(variantPath, minNewNode); struct phyloTree *node = lastOldNode ? hashFindVal(bigTree->nodeHash, lastOldNode->nodeName) : bigTree->tree; if (lastOldNode && !node) - errAbort("Can't find last internal node for sample %s", sampleId); + { + if (startsWith(USHER_NODE_PREFIX, lastOldNode->nodeName)) + // protobuf still has number even if usher prepends USHER_NODE_PREFIX when parsing. + node = hashFindVal(bigTree->nodeHash, lastOldNode->nodeName+strlen(USHER_NODE_PREFIX)); + if (node == NULL) + errAbort("Can't find last internal node %s for sample %s", lastOldNode->nodeName, sampleId); + } // Look for a leaf kid with no mutations relative to the parent, should be closest. if (node->numEdges == 0) { struct slName *nodeList = hashFindVal(bigTree->condensedNodes, node->ident->name); if (nodeList) nearestNeighbor = cloneString(nodeList->name); else nearestNeighbor = cloneString(node->ident->name); } else { nearestNeighbor = leafWithLeastMuts(node, bigTree); if (nearestNeighbor == NULL && node->parent != NULL) nearestNeighbor = leafWithLeastMuts(node->parent, bigTree); } @@ -870,38 +878,38 @@ /* Show the node(s) most closely related to sample. */ { if (info->bestNodeCount == 1) printf("<p>The placement in the tree is unambiguous; " "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n"); else if (info->bestNodeCount > 1) printf("<p>This placement is not the only parsimony-optimal placement in the tree; " "%d other placements exist.</p>\n", info->bestNodeCount - 1); if (showBestNodePaths && info->bestNodes) { if (info->bestNodeCount != slCount(info->bestNodes)) errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)", info->bestNodeCount, slCount(info->bestNodes)); if (info->bestNodeCount > 1) printf("<ul><li><b>used for placement</b>: "); - if (differentString(info->bestNodes->name, "?") && !isAllDigits(info->bestNodes->name)) + if (differentString(info->bestNodes->name, "?") && !isInternalNodeName(info->bestNodes->name, 0)) printf("%s ", info->bestNodes->name); printVariantPathNoNodeNames(stdout, info->bestNodes->variantPath); struct bestNodeInfo *bn; for (bn = info->bestNodes->next; bn != NULL; bn = bn->next) { printf("\n<li>"); - if (differentString(bn->name, "?") && !isAllDigits(bn->name)) + if (differentString(bn->name, "?") && !isInternalNodeName(bn->name, 0)) printf("%s ", bn->name); printVariantPathNoNodeNames(stdout, bn->variantPath); char *lineage = lineageForSample(sampleMetadata, bn->name); printLineageUrl(lineage); } if (info->bestNodeCount > 1) puts("</ul>"); puts("</p>"); } } static int placementInfoRefCmpSampleMuts(const void *va, const void *vb) /* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort * to distinguish between different sampleMuts contents arbitrarily; the purpose is to * clump samples with identical lists. */ @@ -951,31 +959,31 @@ { clumpCount++; if (ref->next == NULL || placementInfoRefCmpSampleMuts(&ref, &ref->next)) break; } return clumpCount; } static void asciiTree(struct phyloTree *node, char *indent, boolean isLast, struct dyString *dyLine, struct slPair **pRowList) /* Until we can make a real graphic, at least print an ascii tree build up a (reversed) list of * lines so that we can add some text to the right later. */ { if (isNotEmpty(indent) || isNotEmpty(node->ident->name)) { - if (node->ident->name && !isAllDigits(node->ident->name)) + if (node->ident->name && !isInternalNodeName(node->ident->name, 0)) { dyStringPrintf(dyLine, "%s %s", indent, node->ident->name); slPairAdd(pRowList, node->ident->name, cloneString(dyLine->string)); dyStringClear(dyLine); } } int indentLen = strlen(indent); char indentForKids[indentLen+1]; safecpy(indentForKids, sizeof indentForKids, indent); if (indentLen >= 4) { if (isLast) safecpy(indentForKids+indentLen - 4, 4 + 1, " "); else safecpy(indentForKids+indentLen - 4, 4 + 1, "| "); @@ -2128,30 +2136,32 @@ struct singleNucChange *snc; for (snc = si->maskedSncList; snc != NULL; snc = snc->next) { if (snc != si->maskedSncList) fputc(',', f); fprintf(f, "%c%d%c", snc->refBase, snc->chromStart+1, snc->newBase); } } else { warn("writeOneTsvRow: no sequenceInfo for sample '%s'", sampleId); fprintf(f, "\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a\tn/a"); } fprintf(f, "\t%s", isNotEmpty(info->nearestNeighbor) ? info->nearestNeighbor : "n/a"); fprintf(f, "\t%s", isNotEmpty(info->neighborLineage) ? info->neighborLineage : "n/a"); + fprintf(f, "\t%s", isNotEmpty(info->nextClade) ? info->nextClade : "n/a"); + fprintf(f, "\t%s", isNotEmpty(info->pangoLineage) ? info->pangoLineage : "n/a"); fputc('\n', f); } } static void rWriteTsvSummaryTreeOrder(struct phyloTree *node, FILE *f, struct usherResults *results, struct hash *seqInfoHash, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges) /* As we encounter leaves (user-uploaded samples) in depth-first search order, write out a line * of TSV summary for each one. */ { if (node->numEdges) { int i; for (i = 0; i < node->numEdges; i++) rWriteTsvSummaryTreeOrder(node->edges[i], f, results, seqInfoHash, geneInfoList, gSeqWin, @@ -2177,31 +2187,31 @@ static struct tempName *writeTsvSummary(struct usherResults *results, struct phyloTree *sampleTree, struct slName *sampleIds, struct seqInfo *seqInfoList, struct geneInfo *geneInfoList, struct seqWindow *gSeqWin, struct hash *spikeChanges, int *pStartTime) /* Write a tab-separated summary file for download. If the user uploaded enough samples to make * a tree, then write out samples in tree order; otherwise use sampleIds list. * Accumulate S gene changes. */ { struct tempName *tsvTn = NULL; AllocVar(tsvTn); trashDirFile(tsvTn, "ct", "usher_samples", ".tsv"); FILE *f = mustOpen(tsvTn->forCgi, "w"); fprintf(f, "name\tnuc_mutations\taa_mutations\timputed_bases\tmutation_path" "\tplacement_count\tparsimony_score_increase\tlength\taligned_bases" "\tins_bases\tins_ranges\tdel_bases\tdel_ranges\tmasked_mutations" - "\tnearest_neighbor\tneighbor_lineage" + "\tnearest_neighbor\tneighbor_lineage\tnextstrain_clade\tpango_lineage" "\n"); struct hash *seqInfoHash = hashFromSeqInfoList(seqInfoList); if (sampleTree) { rWriteTsvSummaryTreeOrder(sampleTree, f, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } else { struct slName *sample; for (sample = sampleIds; sample != NULL; sample = sample->next) writeOneTsvRow(f, sample->name, results, seqInfoHash, geneInfoList, gSeqWin, spikeChanges); } carefulClose(&f); hashFree(&seqInfoHash); @@ -2375,31 +2385,31 @@ { *p = '\0'; hashAdd(nameHash, id, fullName); } } static void maybeAddIsolate(struct hash *nameHash, char *name, char *fullName) /* If name looks like country/isolate/year and isolate looks sufficiently distinctive * then also map isolate to fullName. */ { regmatch_t substrs[2]; if (regexMatchSubstr(name, "^[A-Za-z _]+/(.*)/[0-9]{4}$", substrs, ArraySize(substrs))) { char isolate[substrs[1].rm_eo - substrs[1].rm_so + 1]; regexSubstringCopy(name, substrs[1], isolate, sizeof isolate); - if (! isAllDigits(isolate) && + if (! isInternalNodeName(isolate, 0) && (regexMatch(isolate, "[A-Za-z0-9]{4}") || regexMatch(isolate, "[A-Za-z0-9][A-Za-z0-9]+[-_][A-Za-z0-9][A-Za-z0-9]+"))) { hashAdd(nameHash, isolate, fullName); } } } static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents) /* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName. * If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */ { char *fullNameHashStored = hashStoreName(nameHash, fullName); // Now that we have hash storage for fullName, make it point to itself. struct hashEl *hel = hashLookup(nameHash, fullName); if (hel == NULL)