src/hg/hgPhyloPlace/phyloPlace.c 0e66591e44d669ef83fa016069b1ceb283c01800

0e66591e44d669ef83fa016069b1ceb283c01800
angie
  Thu Jul 15 12:40:43 2021 -0700
Support more flexible ID search: accessions without dot-versions and isolate names.  When loading FASTA, detect sequence names that are numeric or already in the tree and add a prefix to prevent usher from rejecting the sequences.

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index fb8b18e..20cb86f 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -2356,78 +2356,116 @@
         *retMetadataFile = treeChoices->metadataFiles[0];
         *retSource = treeChoices->sources[0];
         *retAliasFile = treeChoices->aliasFiles[0];
         }
     }
 else
     {
     // Fall back on old settings
     *retProtobufPath = getUsherAssignmentsPath(db, TRUE);
     *retMetadataFile = phyloPlaceDbSettingPath(db, "metadataFile");
     *retSource = "GISAID";
     *retAliasFile = NULL;
     }
 }
 
-static void addNameAndComponents(struct hash *nameHash, char *fullName)
-/* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName. */
+static void addModVersion(struct hash *nameHash, char *id, char *fullName)
+/* Map id to fullName.  If id has .version, strip that (modifying id!) and map versionless id
+ * to fullName. */
+{
+hashAdd(nameHash, id, fullName);
+char *p = strchr(id, '.');
+if (p)
+    {
+    *p = '\0';
+    hashAdd(nameHash, id, fullName);
+    }
+}
+
+static void maybeAddIsolate(struct hash *nameHash, char *name, char *fullName)
+/* If name looks like country/isolate/year and isolate looks sufficiently distinctive
+ * then also map isolate to fullName. */
+{
+regmatch_t substrs[2];
+if (regexMatchSubstr(name, "^[A-Za-z _]+/(.*)/[0-9]{4}$", substrs, ArraySize(substrs)))
+    {
+    char isolate[substrs[1].rm_eo - substrs[1].rm_so + 1];
+    regexSubstringCopy(name, substrs[1], isolate, sizeof isolate);
+    if (! isAllDigits(isolate) &&
+        (regexMatch(isolate, "[A-Za-z0-9]{4}") ||
+         regexMatch(isolate, "[A-Za-z0-9][A-Za-z0-9]+[-_][A-Za-z0-9][A-Za-z0-9]+")))
+        {
+        hashAdd(nameHash, isolate, fullName);
+        }
+    }
+}
+static void addNameMaybeComponents(struct hash *nameHash, char *fullName, boolean addComponents)
+/* Add entries to nameHash mapping fullName to itself, and components of fullName to fullName.
+ * If addComponents and fullName is |-separated name|ID|date, add name and ID to nameHash. */
 {
 char *fullNameHashStored = hashStoreName(nameHash, fullName);
 // Now that we have hash storage for fullName, make it point to itself.
 struct hashEl *hel = hashLookup(nameHash, fullName);
 if (hel == NULL)
     errAbort("Can't look up '%s' right after adding it", fullName);
 hel->val = fullNameHashStored;
+if (addComponents)
+    {
     char *words[4];
     char copy[strlen(fullName)+1];
     safecpy(copy, sizeof copy, fullName);
     int wordCount = chopString(copy, "|", words, ArraySize(words));
     if (wordCount == 3)
         {
         // name|ID|date
         hashAdd(nameHash, words[0], fullNameHashStored);
-    hashAdd(nameHash, words[1], fullNameHashStored);
+        maybeAddIsolate(nameHash, words[0], fullNameHashStored);
+        addModVersion(nameHash, words[1], fullNameHashStored);
         }
     else if (wordCount == 2)
         {
         // ID|date
-    hashAdd(nameHash, words[0], fullNameHashStored);
+        addModVersion(nameHash, words[0], fullNameHashStored);
+        // ID might be a COG-UK country/isolate/year
+        maybeAddIsolate(nameHash, words[0], fullNameHashStored);
+        }
     }
 }
 
-static void rAddLeafNames(struct phyloTree *node, struct hash *condensedNodes, struct hash *nameHash)
+static void rAddLeafNames(struct phyloTree *node, struct hash *condensedNodes, struct hash *nameHash,
+                          boolean addComponents)
 /* Recursively descend tree, adding leaf node names to nameHash (including all names of condensed
  * leaf nodes).  Also map components of full names (country/isolate/year|ID|date) to full names. */
 {
 if (node->numEdges == 0)
     {
     char *leafName = node->ident->name;
     struct slName *nodeList = hashFindVal(condensedNodes, leafName);
     if (nodeList)
         {
         struct slName *sample;
         for (sample = nodeList;  sample != NULL;  sample = sample->next)
-            addNameAndComponents(nameHash, sample->name);
+            addNameMaybeComponents(nameHash, sample->name, addComponents);
         }
     else
-        addNameAndComponents(nameHash, leafName);
+        addNameMaybeComponents(nameHash, leafName, addComponents);
     }
 else
     {
     int i;
     for (i = 0;  i < node->numEdges;  i++)
-        rAddLeafNames(node->edges[i], condensedNodes, nameHash);
+        rAddLeafNames(node->edges[i], condensedNodes, nameHash, addComponents);
     }
 }
 
 static void addAliases(struct hash *nameHash, char *aliasFile)
 /* If there is an aliasFile, then add its mappings of ID/alias to full tree name to nameHash. */
 {
 if (isNotEmpty(aliasFile) && fileExists(aliasFile))
     {
     struct lineFile *lf = lineFileOpen(aliasFile, TRUE);
     int missCount = 0;
     char *missExample = NULL;
     char *line;
     while (lineFileNextReal(lf, &line))
         {
         char *words[3];
@@ -2438,37 +2476,38 @@
             hashAdd(nameHash, words[0], fullName);
         else
             {
             missCount++;
             if (missExample == NULL)
                 missExample = cloneString(words[1]);
             }
         }
     lineFileClose(&lf);
     if (missCount > 0)
         fprintf(stderr, "aliasFile %s: %d values in second column were not found in tree, "
                 "e.g. '%s'", aliasFile, missCount, missExample);
     }
 }
 
-static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile)
-/* Make a hash of full names of leaves of bigTree; also map components of those names to the
- * full names in case the user gives us partial names. */
+static struct hash *getTreeNames(struct mutationAnnotatedTree *bigTree, char *aliasFile,
+                                 boolean addComponents)
+/* Make a hash of full names of leaves of bigTree; if addComponents, also map components of those
+ * names to the full names in case the user gives us partial names/IDs. */
 {
 int nodeCount = bigTree->nodeHash->elCount;
 struct hash *nameHash = hashNew(digitsBaseTwo(nodeCount) + 3);
-rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash);
+rAddLeafNames(bigTree->tree, bigTree->condensedNodes, nameHash, addComponents);
 addAliases(nameHash, aliasFile);
 return nameHash;
 }
 
 static char *matchName(struct hash *nameHash, char *name)
 /* Look for a possibly partial name or ID provided by the user in nameHash.  Return the result,
  * possibly NULL.  If the full name doesn't match, try components of the name. */
 {
 name = trimSpaces(name);
 // GISAID fasta headers all have hCoV-19/country/isolate/year|EPI_ISL_#|date; strip the hCoV-19
 // because Nextstrain strips it in nextmeta/nextfasta download files, and so do I when building
 // UCSC's tree.
 if (startsWithNoCase("hCoV-19/", name))
     name += strlen("hCoV-19/");
 char *match = hashFindVal(nameHash, name);
@@ -2512,31 +2551,31 @@
     safecpy(copy, sizeof copy, name);
     stripChar(copy, ' ');
     match = hashFindVal(nameHash, copy);
     }
 return match;
 }
 
 static struct slName *readSampleIds(struct lineFile *lf, struct mutationAnnotatedTree *bigTree,
                                     char *aliasFile)
 /* Read a file of sample names/IDs from the user; typically these will not be exactly the same
  * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
  * to find component matches if an exact match isn't found. */
 {
 struct slName *sampleIds = NULL;
 struct slName *unmatched = NULL;
-struct hash *nameHash = getTreeNames(bigTree, aliasFile);
+struct hash *nameHash = getTreeNames(bigTree, aliasFile, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     // If tab-sep or comma-sep, just try first word in line
     char *tab = strchr(line, '\t');
     if (tab)
         *tab = '\0';
     else
         {
         char *comma = strchr(line, ',');
         if (comma)
             *comma = '\0';
         }
     char *match = matchName(nameHash, line);
     if (match)
@@ -2594,31 +2633,32 @@
 if (! bigTree)
     {
     warn("Problem parsing %s; can't make subtree subtracks.", protobufPath);
     }
 lineFileCarefulNewlines(lf);
 struct slName **maskSites = getProblematicSites(db);
 struct dnaSeq *refGenome = hChromSeq(db, chrom, 0, chromSize);
 boolean isFasta = FALSE;
 boolean subtreesOnly = FALSE;
 struct seqInfo *seqInfoList = NULL;
 if (lfLooksLikeFasta(lf))
     {
     boolean *informativeBases = informativeBasesFromTree(bigTree->tree, maskSites);
     struct slPair *failedSeqs;
     struct slPair *failedPsls;
-    vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites,
+    struct hash *treeNames = getTreeNames(bigTree, NULL, FALSE);
+    vcfTn = vcfFromFasta(lf, db, refGenome, informativeBases, maskSites, treeNames,
                          &sampleIds, &seqInfoList, &failedSeqs, &failedPsls, &startTime);
     if (failedSeqs)
         {
         puts("<p>");
         struct slPair *fail;
         for (fail = failedSeqs;  fail != NULL;  fail = fail->next)
             printf("%s<br>\n", fail->name);
         puts("</p>");
         }
     if (failedPsls)
         {
         puts("<p>");
         struct slPair *fail;
         for (fail = failedPsls;  fail != NULL;  fail = fail->next)
             printf("%s<br>\n", fail->name);
@@ -2808,21 +2848,17 @@
         puts(" (JSON file)</a>");
         }
     puts("</ul>");
 
     if (!subtreesOnly)
         {
         // Notify in opposite order of custom track creation.
         puts("<h3>Custom tracks for viewing in the Genome Browser</h3>");
         printf("<p>Added custom track of uploaded samples.</p>\n");
         if (subtreeCount > 0 && subtreeCount <= MAX_SUBTREE_CTS)
             printf("<p>Added %d subtree custom track%s.</p>\n",
                    subtreeCount, (subtreeCount > 1 ? "s" : ""));
         ctFile = urlFromTn(ctTn);
         }
     }
-else
-    {
-    warn("No subtree output from usher.\n");
-    }
 return ctFile;
 }