7c0d73b5b9eb8303ee04ba236d31a6703552581c angie Wed Dec 2 16:32:15 2020 -0800 hgPhyloPlace metadata: add Nextstrain_clade and genbank_accession columns, refactor in prep for public data. * Move the metadatafile-reading code into phyloPlace.c so metadata file can be used for looking up lineage (no need for separate idToLineage file). * Support looking up metadata by GenBank ID in addition to EPI_ISL ID, in preparation for using protobuf and metadata from public sequences instead of GISAID. diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 4f16792..f2071b8 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -539,112 +539,241 @@ struct variantPathNode *vpn; for (vpn = variantPath; vpn != NULL; vpn = vpn->next) { if (vpn != variantPath) printf(" > "); struct singleNucChange *snc; for (snc = vpn->sncList; snc != NULL; snc = snc->next) { if (snc != vpn->sncList) printf(", "); printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase); } } } +static struct hash *getSampleMetadata(char *metadataFile) +/* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return; + * otherwise return NULL. */ +{ +struct hash *sampleMetadata = NULL; +if (isNotEmpty(metadataFile) && fileExists(metadataFile)) + { + sampleMetadata = hashNew(0); + struct lineFile *lf = lineFileOpen(metadataFile, TRUE); + int headerWordCount = 0; + char **headerWords = NULL; + char *line; + // Check for header line + if (lineFileNext(lf, &line, NULL)) + { + if (startsWithWord("strain", line)) + { + char *headerLine = cloneString(line); + headerWordCount = chopString(headerLine, "\t", NULL, 0); + AllocArray(headerWords, headerWordCount); + chopString(headerLine, "\t", headerWords, headerWordCount); + } + else + errAbort("Missing header line from metadataFile %s", metadataFile); + } + int strainIx = stringArrayIx("strain", headerWords, headerWordCount); + int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount); + int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount); + int dateIx = stringArrayIx("date", headerWords, headerWordCount); + int authorIx = stringArrayIx("authors", headerWords, headerWordCount); + int nCladeIx = stringArrayIx("Nextstrain_clade", headerWords, headerWordCount); + int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount); + int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount); + int countryIx = stringArrayIx("country", headerWords, headerWordCount); + int divisionIx = stringArrayIx("division", headerWords, headerWordCount); + int locationIx = stringArrayIx("location", headerWords, headerWordCount); + int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount); + int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount); + int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount); + int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount); + int regionIx = stringArrayIx("region", headerWords, headerWordCount); + while (lineFileNext(lf, &line, NULL)) + { + char *words[headerWordCount]; + int wordCount = chopTabs(line, words); + lineFileExpectWords(lf, headerWordCount, wordCount); + struct sampleMetadata *met; + AllocVar(met); + if (strainIx >= 0) + met->strain = cloneString(words[strainIx]); + if (epiIdIx >= 0) + met->epiId = cloneString(words[epiIdIx]); + if (genbankIx >= 0 && !sameString("?", words[genbankIx])) + met->gbAcc = cloneString(words[genbankIx]); + if (dateIx >= 0) + met->date = cloneString(words[dateIx]); + if (authorIx >= 0) + met->author = cloneString(words[authorIx]); + if (nCladeIx >= 0) + met->nClade = cloneString(words[nCladeIx]); + if (gCladeIx >= 0) + met->gClade = cloneString(words[gCladeIx]); + if (lineageIx >= 0) + met->lineage = cloneString(words[lineageIx]); + if (countryIx >= 0) + met->country = cloneString(words[countryIx]); + if (divisionIx >= 0) + met->division = cloneString(words[divisionIx]); + if (locationIx >= 0) + met->location = cloneString(words[locationIx]); + if (countryExpIx >= 0) + met->countryExp = cloneString(words[countryExpIx]); + if (divExpIx >= 0) + met->divExp = cloneString(words[divExpIx]); + if (origLabIx >= 0) + met->origLab = cloneString(words[origLabIx]); + if (subLabIx >= 0) + met->subLab = cloneString(words[subLabIx]); + if (regionIx >= 0) + met->region = cloneString(words[regionIx]); + // If epiId and/or genbank ID is included, we'll probably be using that to look up items. + if (epiIdIx >= 0 && !isEmpty(words[epiIdIx])) + hashAdd(sampleMetadata, words[epiIdIx], met); + if (genbankIx >= 0 && !isEmpty(words[genbankIx]) && !sameString("?", words[genbankIx])) + { + if (strchr(words[genbankIx], '.')) + { + // Index by versionless accession + char copy[strlen(words[genbankIx])+1]; + safecpy(copy, sizeof copy, words[genbankIx]); + char *dot = strchr(copy, '.'); + *dot = '\0'; + hashAdd(sampleMetadata, copy, met); + } + else + hashAdd(sampleMetadata, words[genbankIx], met); + } + if (strainIx >= 0 && !isEmpty(words[strainIx])) + hashAdd(sampleMetadata, words[strainIx], met); + } + lineFileClose(&lf); + } +return sampleMetadata; +} + char *epiIdFromSampleName(char *sampleId) /* If an EPI_ISL_# ID is present somewhere in sampleId, extract and return it, otherwise NULL. */ { char *epiId = cloneString(strstr(sampleId, "EPI_ISL_")); if (epiId) { char *p = epiId + strlen("EPI_ISL_"); while (isdigit(*p)) p++; *p = '\0'; } return epiId; } -char *lineageForSample(char *db, char *sampleId) -/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ +char *gbIdFromSampleName(char *sampleId) +/* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */ { -static struct hash *sampleToLineage = NULL; -if (sampleToLineage == NULL) +char *gbId = NULL; +regmatch_t substrs[2]; +if (regexMatchSubstr(sampleId, "([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs))) { - char *epiToLineagePath = phyloPlaceDbSettingPath(db, "idToLineageFile"); - if (epiToLineagePath) - sampleToLineage = hashTwoColumnFile(epiToLineagePath); - else - return NULL; + // Make sure there are word boundaries around the match + if ((substrs[1].rm_so == 0 || !isalnum(sampleId[substrs[1].rm_so-1])) && + !isalnum(sampleId[substrs[1].rm_eo])) + gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so); } -char *lineage = NULL; +return gbId; +} + +struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId) +/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ +{ +struct sampleMetadata *met = NULL; char *epiId = epiIdFromSampleName(sampleId); if (epiId) - lineage = hashFindVal(sampleToLineage, epiId); + met = hashFindVal(sampleMetadata, epiId); +if (!met) + { + char *gbId = gbIdFromSampleName(sampleId); + if (gbId) + met = hashFindVal(sampleMetadata, gbId); + } +if (!met) + met = hashFindVal(sampleMetadata, sampleId); +return met; +} + +static char *lineageForSample(struct hash *sampleMetadata, char *sampleId) +/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */ +{ +char *lineage = NULL; +struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId); +if (met) + lineage = met->lineage; return lineage; } static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree, - struct placementInfo *info, char *db) + struct placementInfo *info, struct hash *sampleMetadata) /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */ { if (bigTree) { printf("<p>Nearest neighboring GISAID sequence already in phylogenetic tree: "); struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath); struct slName *neighbor; for (neighbor = neighbors; neighbor != NULL; neighbor = neighbor->next) { if (neighbor != neighbors) printf(", "); printf("%s", neighbor->name); - char *lineage = lineageForSample(db, neighbor->name); + char *lineage = lineageForSample(sampleMetadata, neighbor->name); if (isNotEmpty(lineage)) printf(": lineage %s", lineage); } puts("</p>"); } } -static void displayBestNodes(struct placementInfo *info, char *db) +static void displayBestNodes(struct placementInfo *info, struct hash *sampleMetadata) /* Show the node(s) most closely related to sample. */ { if (info->bestNodeCount == 1) printf("<p>The placement in the tree is unambiguous; " "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n"); else if (info->bestNodeCount > 1) printf("<p>This placement is not the only parsimony-optimal placement in the tree; " "%d other placements exist.</p>\n", info->bestNodeCount - 1); if (showBestNodePaths && info->bestNodes) { if (info->bestNodeCount != slCount(info->bestNodes)) errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)", info->bestNodeCount, slCount(info->bestNodes)); if (info->bestNodeCount > 1) printf("<ul><li><b>used for placement</b>: "); if (differentString(info->bestNodes->name, "?") && !isAllDigits(info->bestNodes->name)) printf("%s ", info->bestNodes->name); printVariantPathNoNodeNames(info->bestNodes->variantPath); struct bestNodeInfo *bn; for (bn = info->bestNodes->next; bn != NULL; bn = bn->next) { printf("\n<li>"); if (differentString(bn->name, "?") && !isAllDigits(bn->name)) printf("%s ", bn->name); printVariantPathNoNodeNames(bn->variantPath); - char *lineage = lineageForSample(db, bn->name); + char *lineage = lineageForSample(sampleMetadata, bn->name); if (isNotEmpty(lineage)) printf(": lineage %s", lineage); } if (info->bestNodeCount > 1) puts("</ul>"); puts("</p>"); } } static int placementInfoRefCmpSampleMuts(const void *va, const void *vb) /* Compare slRef->placementInfo->sampleMuts lists. Shorter lists first. Using alpha sort * to distinguish between different sampleMuts contents arbitrarily; the purpose is to * clump samples with identical lists. */ { struct slRef * const *rra = va; @@ -714,32 +843,32 @@ safecpy(indentForKids+indentLen - 4, 4 + 1, " "); else safecpy(indentForKids+indentLen - 4, 4 + 1, "| "); } if (node->numEdges > 0) { char kidIndent[strlen(indent)+5]; safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---"); int i; for (i = 0; i < node->numEdges; i++) asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1)); } } static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements, - struct phyloTree *subtree, - struct mutationAnnotatedTree *bigTree, char *db) + struct phyloTree *subtree, struct hash *sampleMetadata, + struct mutationAnnotatedTree *bigTree) /* Report how each sample fits into the big tree. */ { // Sort sample placements by sampleMuts so we can group identical samples. struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements); slSort(&placementRefs, placementInfoRefCmpSampleMuts); int relatedCount = slCount(placementRefs); int clumpSize = countIdentical(placementRefs); if (clumpSize < relatedCount && relatedCount > 2) { // Not all of the related sequences are identical, so they will be broken down into // separate "clumps". List all related samples first to avoid confusion. puts("<pre>"); asciiTree(subtree, "", TRUE); puts("</pre>"); } @@ -770,33 +899,33 @@ printf("<b>%s</b>\n", info->sampleId); ref = ref->next; } refsToGo = ref; displaySampleMuts(info); if (info->imputedBases) { puts("<p>Base values imputed by parsimony:\n<ul>"); struct baseVal *bv; for (bv = info->imputedBases; bv != NULL; bv = bv->next) printf("<li>%d: %s\n", bv->chromStart+1, bv->val); puts("</ul>"); puts("</p>"); } displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples"); - displayBestNodes(info, db); + displayBestNodes(info, sampleMetadata); if (!showBestNodePaths) - displayNearestNeighbors(bigTree, info, db); + displayNearestNeighbors(bigTree, info, sampleMetadata); if (showParsimonyScore && info->parsimonyScore > 0) printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore); //#*** TODO: explain parsimony score } } struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds) /* Prune all descendants of node that have no leaf descendants in sampleIds. */ { if (node->numEdges) { struct phyloTree *prunedKids = NULL; int i; for (i = 0; i < node->numEdges; i++) { @@ -1059,32 +1188,32 @@ } static void appendExcludingNs(struct dyString *dy, struct seqInfo *si) /* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */ { dyStringAppend(dy, "excluding "); if (si->nCountStart) dyStringPrintf(dy, "%d N bases at start", si->nCountStart); if (si->nCountStart && si->nCountEnd) dyStringAppend(dy, " and "); if (si->nCountEnd) dyStringPrintf(dy, "%d N bases at end", si->nCountEnd); } static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta, - struct usherResults *ur, struct tempName *jsonTns[], char *db, - struct mutationAnnotatedTree *bigTree) + struct usherResults *ur, struct tempName *jsonTns[], + struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree) /* Show a table with composition & alignment stats for each sequence that passed basic QC. */ { if (seqInfoList) { puts("<table class='seqSummary'>"); printSummaryHeader(isFasta); puts("<tbody>"); struct dyString *dy = dyStringNew(0); struct seqInfo *si; for (si = seqInfoList; si != NULL; si = si->next) { puts("<tr>"); printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | ")); if (isFasta) { @@ -1202,31 +1331,31 @@ snc->refBase, snc->chromStart+1, snc->newBase, reasonList->name); for (reason = reasonList->next; reason != NULL; reason = reason->next) { replaceChar(reason->name, '_', ' '); dyStringPrintf(dy, ", %s", reason->name); } dyStringAppendC(dy, ')'); } printTooltip(dy->string); } printf("</td>"); struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name); if (pi) { struct slName *neighbor = findNearestNeighbor(bigTree, pi->sampleId, pi->variantPath); - char *lineage = neighbor ? lineageForSample(db, neighbor->name) : "?"; + char *lineage = neighbor ? lineageForSample(sampleMetadata, neighbor->name) : "?"; printf("<td>%s</td><td>%s</td>", neighbor ? replaceChars(neighbor->name, "|", " | ") : "?", lineage ? lineage : "?"); int imputedCount = slCount(pi->imputedBases); printf("<td class='%s'>%d", qcClassForImputedBases(imputedCount), imputedCount); if (imputedCount > 0) { dyStringClear(dy); struct baseVal *bv; for (bv = pi->imputedBases; bv != NULL; bv = bv->next) { dyStringAppendSep(dy, ", "); dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val); } @@ -1360,63 +1489,65 @@ } lineFileClose(&lf); if (vcfTn) { struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi, subtreeSize, sampleIds, bigTree->condensedNodes, &startTime); if (results->subtreeInfoList) { int subtreeCount = slCount(results->subtreeInfoList); // Sort subtrees by number of user samples (largest first). slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp); // Make Nextstrain/auspice JSON file for each subtree. char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile"); char *metadataFile = phyloPlaceDbSettingPath(db, "metadataFile"); + struct hash *sampleMetadata = getSampleMetadata(metadataFile); struct tempName *jsonTns[subtreeCount]; struct subtreeInfo *ti; int ix; for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { AllocVar(jsonTns[ix]); trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json"); - treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, metadataFile, jsonTns[ix]->forCgi); + treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata, + jsonTns[ix]->forCgi); } puts("<p></p>"); makeButtonRow(jsonTns, subtreeCount, isFasta); printf("<p>If you have metadata you wish to display, click a 'view subtree in Nextstrain' " "button, and then you can drag on a CSV file to " "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>." "</p>\n"); - summarizeSequences(seqInfoList, isFasta, results, jsonTns, db, bigTree); + summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree); reportTiming(&startTime, "write summary table (including reading in lineages)"); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds); printf("<h3>Subtree %d: ", ix+1); if (subtreeUserSampleCount > 1) printf("%d related samples", subtreeUserSampleCount); else if (subtreeCount > 1) printf("Unrelated sample"); printf("</h3>\n"); makeNextstrainButton("viewNextstrainSub", ix, jsonTns); puts("<br>"); // Make a sub-subtree with only user samples for display: struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi); subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds); describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree, - bigTree, db); + sampleMetadata, bigTree); } reportTiming(&startTime, "describe placements"); // Make custom tracks for uploaded samples and subtree(s). struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree, fontHeight, &startTime); // Offer big tree w/new samples for download puts("<h3>Downloads</h3>"); puts("<ul>"); printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree " "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml); for (ix = 0, ti = results->subtreeInfoList; ti != NULL; ti = ti->next, ix++) { printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml,