src/hg/hgPhyloPlace/phyloPlace.c 7c0d73b5b9eb8303ee04ba236d31a6703552581c

7c0d73b5b9eb8303ee04ba236d31a6703552581c
angie
  Wed Dec 2 16:32:15 2020 -0800
hgPhyloPlace metadata: add Nextstrain_clade and genbank_accession columns, refactor in prep for public data.
* Move the metadatafile-reading code into phyloPlace.c so metadata file can be used for looking up lineage (no need for separate idToLineage file).
* Support looking up metadata by GenBank ID in addition to EPI_ISL ID, in preparation for using protobuf and metadata from public sequences instead of GISAID.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 4f16792..f2071b8 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -539,112 +539,241 @@
 struct variantPathNode *vpn;
 for (vpn = variantPath;  vpn != NULL;  vpn = vpn->next)
     {
     if (vpn != variantPath)
         printf(" > ");
     struct singleNucChange *snc;
     for (snc = vpn->sncList;  snc != NULL;  snc = snc->next)
         {
         if (snc != vpn->sncList)
             printf(", ");
         printf("%c%d%c", snc->parBase, snc->chromStart+1, snc->newBase);
         }
     }
 }
 
+static struct hash *getSampleMetadata(char *metadataFile)
+/* If config.ra defines a metadataFile, load its contents into a hash indexed by EPI ID and return;
+ * otherwise return NULL. */
+{
+struct hash *sampleMetadata = NULL;
+if (isNotEmpty(metadataFile) && fileExists(metadataFile))
+    {
+    sampleMetadata = hashNew(0);
+    struct lineFile *lf = lineFileOpen(metadataFile, TRUE);
+    int headerWordCount = 0;
+    char **headerWords = NULL;
+    char *line;
+    // Check for header line
+    if (lineFileNext(lf, &line, NULL))
+        {
+        if (startsWithWord("strain", line))
+            {
+            char *headerLine = cloneString(line);
+            headerWordCount = chopString(headerLine, "\t", NULL, 0);
+            AllocArray(headerWords, headerWordCount);
+            chopString(headerLine, "\t", headerWords, headerWordCount);
+            }
+        else
+            errAbort("Missing header line from metadataFile %s", metadataFile);
+        }
+    int strainIx = stringArrayIx("strain", headerWords, headerWordCount);
+    int epiIdIx = stringArrayIx("gisaid_epi_isl", headerWords, headerWordCount);
+    int genbankIx = stringArrayIx("genbank_accession", headerWords, headerWordCount);
+    int dateIx = stringArrayIx("date", headerWords, headerWordCount);
+    int authorIx = stringArrayIx("authors", headerWords, headerWordCount);
+    int nCladeIx = stringArrayIx("Nextstrain_clade", headerWords, headerWordCount);
+    int gCladeIx = stringArrayIx("GISAID_clade", headerWords, headerWordCount);
+    int lineageIx = stringArrayIx("pangolin_lineage", headerWords, headerWordCount);
+    int countryIx = stringArrayIx("country", headerWords, headerWordCount);
+    int divisionIx = stringArrayIx("division", headerWords, headerWordCount);
+    int locationIx = stringArrayIx("location", headerWords, headerWordCount);
+    int countryExpIx = stringArrayIx("country_exposure", headerWords, headerWordCount);
+    int divExpIx = stringArrayIx("division_exposure", headerWords, headerWordCount);
+    int origLabIx = stringArrayIx("originating_lab", headerWords, headerWordCount);
+    int subLabIx = stringArrayIx("submitting_lab", headerWords, headerWordCount);
+    int regionIx = stringArrayIx("region", headerWords, headerWordCount);
+    while (lineFileNext(lf, &line, NULL))
+        {
+        char *words[headerWordCount];
+        int wordCount = chopTabs(line, words);
+        lineFileExpectWords(lf, headerWordCount, wordCount);
+        struct sampleMetadata *met;
+        AllocVar(met);
+        if (strainIx >= 0)
+            met->strain = cloneString(words[strainIx]);
+        if (epiIdIx >= 0)
+            met->epiId = cloneString(words[epiIdIx]);
+        if (genbankIx >= 0 && !sameString("?", words[genbankIx]))
+            met->gbAcc = cloneString(words[genbankIx]);
+        if (dateIx >= 0)
+            met->date = cloneString(words[dateIx]);
+        if (authorIx >= 0)
+            met->author = cloneString(words[authorIx]);
+        if (nCladeIx >= 0)
+            met->nClade = cloneString(words[nCladeIx]);
+        if (gCladeIx >= 0)
+            met->gClade = cloneString(words[gCladeIx]);
+        if (lineageIx >= 0)
+            met->lineage = cloneString(words[lineageIx]);
+        if (countryIx >= 0)
+            met->country = cloneString(words[countryIx]);
+        if (divisionIx >= 0)
+            met->division = cloneString(words[divisionIx]);
+        if (locationIx >= 0)
+            met->location = cloneString(words[locationIx]);
+        if (countryExpIx >= 0)
+            met->countryExp = cloneString(words[countryExpIx]);
+        if (divExpIx >= 0)
+            met->divExp = cloneString(words[divExpIx]);
+        if (origLabIx >= 0)
+            met->origLab = cloneString(words[origLabIx]);
+        if (subLabIx >= 0)
+            met->subLab = cloneString(words[subLabIx]);
+        if (regionIx >= 0)
+            met->region = cloneString(words[regionIx]);
+        // If epiId and/or genbank ID is included, we'll probably be using that to look up items.
+        if (epiIdIx >= 0 && !isEmpty(words[epiIdIx]))
+            hashAdd(sampleMetadata, words[epiIdIx], met);
+        if (genbankIx >= 0 && !isEmpty(words[genbankIx]) && !sameString("?", words[genbankIx]))
+            {
+            if (strchr(words[genbankIx], '.'))
+                {
+                // Index by versionless accession
+                char copy[strlen(words[genbankIx])+1];
+                safecpy(copy, sizeof copy, words[genbankIx]);
+                char *dot = strchr(copy, '.');
+                *dot = '\0';
+                hashAdd(sampleMetadata, copy, met);
+                }
+            else
+                hashAdd(sampleMetadata, words[genbankIx], met);
+            }
+        if (strainIx >= 0 && !isEmpty(words[strainIx]))
+            hashAdd(sampleMetadata, words[strainIx], met);
+        }
+    lineFileClose(&lf);
+    }
+return sampleMetadata;
+}
+
 char *epiIdFromSampleName(char *sampleId)
 /* If an EPI_ISL_# ID is present somewhere in sampleId, extract and return it, otherwise NULL. */
 {
 char *epiId = cloneString(strstr(sampleId, "EPI_ISL_"));
 if (epiId)
     {
     char *p = epiId + strlen("EPI_ISL_");
     while (isdigit(*p))
         p++;
     *p = '\0';
     }
 return epiId;
 }
 
-char *lineageForSample(char *db, char *sampleId)
-/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
+char *gbIdFromSampleName(char *sampleId)
+/* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */
 {
-static struct hash *sampleToLineage = NULL;
-if (sampleToLineage == NULL)
+char *gbId = NULL;
+regmatch_t substrs[2];
+if (regexMatchSubstr(sampleId, "([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs)))
     {
-    char *epiToLineagePath = phyloPlaceDbSettingPath(db, "idToLineageFile");
-    if (epiToLineagePath)
-        sampleToLineage = hashTwoColumnFile(epiToLineagePath);
-    else
-        return NULL;
+    // Make sure there are word boundaries around the match
+    if ((substrs[1].rm_so == 0 || !isalnum(sampleId[substrs[1].rm_so-1])) &&
+        !isalnum(sampleId[substrs[1].rm_eo]))
+        gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so);
     }
-char *lineage = NULL;
+return gbId;
+}
+
+struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId)
+/* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
+{
+struct sampleMetadata *met = NULL;
 char *epiId = epiIdFromSampleName(sampleId);
 if (epiId)
-    lineage = hashFindVal(sampleToLineage, epiId);
+    met = hashFindVal(sampleMetadata, epiId);
+if (!met)
+    {
+    char *gbId = gbIdFromSampleName(sampleId);
+    if (gbId)
+        met = hashFindVal(sampleMetadata, gbId);
+    }
+if (!met)
+    met = hashFindVal(sampleMetadata, sampleId);
+return met;
+}
+
+static char *lineageForSample(struct hash *sampleMetadata, char *sampleId)
+/* Look up sampleId's lineage in epiToLineage file. Return NULL if we don't find a match. */
+{
+char *lineage = NULL;
+struct sampleMetadata *met = metadataForSample(sampleMetadata, sampleId);
+if (met)
+    lineage = met->lineage;
 return lineage;
 }
 
 static void displayNearestNeighbors(struct mutationAnnotatedTree *bigTree,
-                                    struct placementInfo *info, char *db)
+                                    struct placementInfo *info, struct hash *sampleMetadata)
 /* Use info->variantPaths to find sample's nearest neighbor(s) in tree. */
 {
 if (bigTree)
     {
     printf("<p>Nearest neighboring GISAID sequence already in phylogenetic tree: ");
     struct slName *neighbors = findNearestNeighbor(bigTree, info->sampleId, info->variantPath);
     struct slName *neighbor;
     for (neighbor = neighbors;  neighbor != NULL;  neighbor = neighbor->next)
         {
         if (neighbor != neighbors)
             printf(", ");
         printf("%s", neighbor->name);
-        char *lineage = lineageForSample(db, neighbor->name);
+        char *lineage = lineageForSample(sampleMetadata, neighbor->name);
         if (isNotEmpty(lineage))
             printf(": lineage %s", lineage);
         }
     puts("</p>");
     }
 }
 
-static void displayBestNodes(struct placementInfo *info, char *db)
+static void displayBestNodes(struct placementInfo *info, struct hash *sampleMetadata)
 /* Show the node(s) most closely related to sample. */
 {
 if (info->bestNodeCount == 1)
     printf("<p>The placement in the tree is unambiguous; "
            "there are no other parsimony-optimal placements in the phylogenetic tree.</p>\n");
 else if (info->bestNodeCount > 1)
     printf("<p>This placement is not the only parsimony-optimal placement in the tree; "
            "%d other placements exist.</p>\n", info->bestNodeCount - 1);
 if (showBestNodePaths && info->bestNodes)
     {
     if (info->bestNodeCount != slCount(info->bestNodes))
         errAbort("Inconsistent bestNodeCount (%d) and number of bestNodes (%d)",
                  info->bestNodeCount, slCount(info->bestNodes));
     if (info->bestNodeCount > 1)
         printf("<ul><li><b>used for placement</b>: ");
     if (differentString(info->bestNodes->name, "?") && !isAllDigits(info->bestNodes->name))
         printf("%s ", info->bestNodes->name);
     printVariantPathNoNodeNames(info->bestNodes->variantPath);
     struct bestNodeInfo *bn;
     for (bn = info->bestNodes->next;  bn != NULL;  bn = bn->next)
         {
         printf("\n<li>");
         if (differentString(bn->name, "?") && !isAllDigits(bn->name))
             printf("%s ", bn->name);
         printVariantPathNoNodeNames(bn->variantPath);
-        char *lineage = lineageForSample(db, bn->name);
+        char *lineage = lineageForSample(sampleMetadata, bn->name);
         if (isNotEmpty(lineage))
             printf(": lineage %s", lineage);
         }
     if (info->bestNodeCount > 1)
         puts("</ul>");
     puts("</p>");
     }
 }
 
 static int placementInfoRefCmpSampleMuts(const void *va, const void *vb)
 /* Compare slRef->placementInfo->sampleMuts lists.  Shorter lists first.  Using alpha sort
  * to distinguish between different sampleMuts contents arbitrarily; the purpose is to
  * clump samples with identical lists. */
 {
 struct slRef * const *rra = va;
@@ -714,32 +843,32 @@
         safecpy(indentForKids+indentLen - 4, 4 + 1, "    ");
     else
         safecpy(indentForKids+indentLen - 4, 4 + 1, "|   ");
     }
 if (node->numEdges > 0)
     {
     char kidIndent[strlen(indent)+5];
     safef(kidIndent, sizeof kidIndent, "%s%s", indentForKids, "+---");
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         asciiTree(node->edges[i], kidIndent, (i == node->numEdges - 1));
     }
 }
 
 static void describeSamplePlacements(struct slName *sampleIds, struct hash *samplePlacements,
-                                     struct phyloTree *subtree,
-                                     struct mutationAnnotatedTree *bigTree, char *db)
+                                     struct phyloTree *subtree, struct hash *sampleMetadata,
+                                     struct mutationAnnotatedTree *bigTree)
 /* Report how each sample fits into the big tree. */
 {
 // Sort sample placements by sampleMuts so we can group identical samples.
 struct slRef *placementRefs = getPlacementRefList(sampleIds, samplePlacements);
 slSort(&placementRefs, placementInfoRefCmpSampleMuts);
 int relatedCount = slCount(placementRefs);
 int clumpSize = countIdentical(placementRefs);
 if (clumpSize < relatedCount && relatedCount > 2)
     {
     // Not all of the related sequences are identical, so they will be broken down into
     // separate "clumps".  List all related samples first to avoid confusion.
     puts("<pre>");
     asciiTree(subtree, "", TRUE);
     puts("</pre>");
     }
@@ -770,33 +899,33 @@
         printf("<b>%s</b>\n", info->sampleId);
         ref = ref->next;
         }
     refsToGo = ref;
     displaySampleMuts(info);
     if (info->imputedBases)
         {
         puts("<p>Base values imputed by parsimony:\n<ul>");
         struct baseVal *bv;
         for (bv = info->imputedBases;  bv != NULL;  bv = bv->next)
             printf("<li>%d: %s\n", bv->chromStart+1, bv->val);
         puts("</ul>");
         puts("</p>");
         }
     displayVariantPath(info->variantPath, clumpSize == 1 ? info->sampleId : "samples");
-    displayBestNodes(info, db);
+    displayBestNodes(info, sampleMetadata);
     if (!showBestNodePaths)
-        displayNearestNeighbors(bigTree, info, db);
+        displayNearestNeighbors(bigTree, info, sampleMetadata);
     if (showParsimonyScore && info->parsimonyScore > 0)
         printf("<p>Parsimony score added by your sample: %d</p>\n", info->parsimonyScore);
         //#*** TODO: explain parsimony score
     }
 }
 
 struct phyloTree *phyloPruneToIds(struct phyloTree *node, struct slName *sampleIds)
 /* Prune all descendants of node that have no leaf descendants in sampleIds. */
 {
 if (node->numEdges)
     {
     struct phyloTree *prunedKids = NULL;
     int i;
     for (i = 0;  i < node->numEdges;  i++)
         {
@@ -1059,32 +1188,32 @@
 }
 
 static void appendExcludingNs(struct dyString *dy, struct seqInfo *si)
 /* Append a note to dy about how many N bases and start and/or end are excluded from statistic. */
 {
 dyStringAppend(dy, "excluding ");
 if (si->nCountStart)
     dyStringPrintf(dy, "%d N bases at start", si->nCountStart);
 if (si->nCountStart && si->nCountEnd)
     dyStringAppend(dy, " and ");
 if (si->nCountEnd)
     dyStringPrintf(dy, "%d N bases at end", si->nCountEnd);
 }
 
 static void summarizeSequences(struct seqInfo *seqInfoList, boolean isFasta,
-                               struct usherResults *ur, struct tempName *jsonTns[], char *db,
-                               struct mutationAnnotatedTree *bigTree)
+                               struct usherResults *ur, struct tempName *jsonTns[],
+                               struct hash *sampleMetadata, struct mutationAnnotatedTree *bigTree)
 /* Show a table with composition & alignment stats for each sequence that passed basic QC. */
 {
 if (seqInfoList)
     {
     puts("<table class='seqSummary'>");
     printSummaryHeader(isFasta);
     puts("<tbody>");
     struct dyString *dy = dyStringNew(0);
     struct seqInfo *si;
     for (si = seqInfoList;  si != NULL;  si = si->next)
         {
         puts("<tr>");
         printf("<th>%s</td>", replaceChars(si->seq->name, "|", " | "));
         if (isFasta)
             {
@@ -1202,31 +1331,31 @@
                                snc->refBase, snc->chromStart+1, snc->newBase, reasonList->name);
                 for (reason = reasonList->next;  reason != NULL;  reason = reason->next)
                     {
                     replaceChar(reason->name, '_', ' ');
                     dyStringPrintf(dy, ", %s", reason->name);
                     }
                 dyStringAppendC(dy, ')');
                 }
             printTooltip(dy->string);
             }
         printf("</td>");
         struct placementInfo *pi = hashFindVal(ur->samplePlacements, si->seq->name);
         if (pi)
             {
             struct slName *neighbor = findNearestNeighbor(bigTree, pi->sampleId, pi->variantPath);
-            char *lineage = neighbor ?  lineageForSample(db, neighbor->name) : "?";
+            char *lineage = neighbor ?  lineageForSample(sampleMetadata, neighbor->name) : "?";
             printf("<td>%s</td><td>%s</td>",
                    neighbor ? replaceChars(neighbor->name, "|", " | ") : "?",
                    lineage ? lineage : "?");
             int imputedCount = slCount(pi->imputedBases);
             printf("<td class='%s'>%d",
                    qcClassForImputedBases(imputedCount), imputedCount);
             if (imputedCount > 0)
                 {
                 dyStringClear(dy);
                 struct baseVal *bv;
                 for (bv = pi->imputedBases;  bv != NULL;  bv = bv->next)
                     {
                     dyStringAppendSep(dy, ", ");
                     dyStringPrintf(dy, "%d: %s", bv->chromStart+1, bv->val);
                     }
@@ -1360,63 +1489,65 @@
     }
 lineFileClose(&lf);
 if (vcfTn)
     {
     struct usherResults *results = runUsher(usherPath, usherAssignmentsPath, vcfTn->forCgi,
                                             subtreeSize, sampleIds, bigTree->condensedNodes,
                                             &startTime);
     if (results->subtreeInfoList)
         {
         int subtreeCount = slCount(results->subtreeInfoList);
         // Sort subtrees by number of user samples (largest first).
         slSort(&results->subtreeInfoList, subTreeInfoUserSampleCmp);
         // Make Nextstrain/auspice JSON file for each subtree.
         char *bigGenePredFile = phyloPlaceDbSettingPath(db, "bigGenePredFile");
         char *metadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
+        struct hash *sampleMetadata = getSampleMetadata(metadataFile);
         struct tempName *jsonTns[subtreeCount];
         struct subtreeInfo *ti;
         int ix;
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
             {
             AllocVar(jsonTns[ix]);
             trashDirFile(jsonTns[ix], "ct", "subtreeAuspice", ".json");
-            treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, metadataFile, jsonTns[ix]->forCgi);
+            treeToAuspiceJson(ti, db, refGenome, bigGenePredFile, sampleMetadata,
+                              jsonTns[ix]->forCgi);
             }
         puts("<p></p>");
         makeButtonRow(jsonTns, subtreeCount, isFasta);
         printf("<p>If you have metadata you wish to display, click a 'view subtree in Nextstrain' "
                "button, and then you can drag on a CSV file to "
                "<a href='"NEXTSTRAIN_DRAG_DROP_DOC"' target=_blank>add it to the tree view</a>."
                "</p>\n");
-        summarizeSequences(seqInfoList, isFasta, results, jsonTns, db, bigTree);
+        summarizeSequences(seqInfoList, isFasta, results, jsonTns, sampleMetadata, bigTree);
         reportTiming(&startTime, "write summary table (including reading in lineages)");
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
             {
             int subtreeUserSampleCount = slCount(ti->subtreeUserSampleIds);
             printf("<h3>Subtree %d: ", ix+1);
             if (subtreeUserSampleCount > 1)
                 printf("%d related samples", subtreeUserSampleCount);
             else if (subtreeCount > 1)
                 printf("Unrelated sample");
             printf("</h3>\n");
             makeNextstrainButton("viewNextstrainSub", ix, jsonTns);
             puts("<br>");
             // Make a sub-subtree with only user samples for display:
             struct phyloTree *subtree = phyloOpenTree(ti->subtreeTn->forCgi);
             subtree = phyloPruneToIds(subtree, ti->subtreeUserSampleIds);
             describeSamplePlacements(ti->subtreeUserSampleIds, results->samplePlacements, subtree,
-                                     bigTree, db);
+                                     sampleMetadata, bigTree);
             }
         reportTiming(&startTime, "describe placements");
 
         // Make custom tracks for uploaded samples and subtree(s).
         struct tempName *ctTn = writeCustomTracks(vcfTn, results, sampleIds, bigTree->tree,
                                                   fontHeight, &startTime);
                                
         // Offer big tree w/new samples for download
         puts("<h3>Downloads</h3>");
         puts("<ul>");
         printf("<li><a href='%s' download>SARS-CoV-2 phylogenetic tree "
                "with your samples (Newick file)</a>\n", results->bigTreePlusTn->forHtml);
         for (ix = 0, ti = results->subtreeInfoList;  ti != NULL;  ti = ti->next, ix++)
            {
             printf("<li><a href='%s' download>Subtree with %s", ti->subtreeTn->forHtml,