src/hg/hgPhyloPlace/phyloPlace.c a384cc55ed66357a2e58c5d7a20f2125909b30b8

a384cc55ed66357a2e58c5d7a20f2125909b30b8
angie
  Wed Oct 12 14:32:51 2022 -0700
Parse ranges of IDs such as EPI_SET notation (EPI_ISL_1234-5678), requested by @corneliusroemer in https://github.com/yatisht/usher/issues/276

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index c54637d..7cf8d44 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -2665,58 +2665,141 @@
         }
     }
 else if (match == NULL && strchr(name, ' '))
     {
     // GISAID sequence names may include spaces, in both country names ("South Korea") and
     // isolate names.  That messes up FASTA headers, so Nextstrain strips out spaces when
     // making the nextmeta and nextfasta download files for GISAID.  Try stripping out spaces:
     char copy[strlen(name)+1];
     safecpy(copy, sizeof copy, name);
     stripChar(copy, ' ');
     match = hashFindVal(nameHash, copy);
     }
 return match;
 }
 
+static boolean tallyMatch(char *match, char *term,
+                          struct slName **retMatches, struct slName **retUnmatched)
+/* Record the result of looking up term: when match is NULL the lookup failed, so add term to
+ * retUnmatched and return FALSE; otherwise add match to retMatches and return TRUE. */
+{
+if (match == NULL)
+    {
+    // No hit: keep the term that was searched for.
+    slNameAddHead(retUnmatched, term);
+    return FALSE;
+    }
+// Hit: keep the name that was found rather than the search term.
+slNameAddHead(retMatches, match);
+return TRUE;
+}
+
+static boolean matchIdRange(struct hash *nameHash, char *line,
+                            struct slName **retMatches, struct slName **retUnmatched)
+/* If line looks like it might contain a range of IDs, for example EPI_ISL_123-129 from an EPI_SET,
+ * then expand the range(s) into individual IDs, look up the IDs in nameHash, and set retMatches
+ * and retUnmatched to per-ID results.  Return TRUE if at least one lookup succeeded, otherwise
+ * FALSE -- in which case *retUnmatched may still have been filled in with the IDs that were
+ * tried.  Both lists are reset to NULL on entry. */
+{
+boolean foundAny = FALSE;
+*retMatches = *retUnmatched = NULL;
+regmatch_t substrArr[7];
+// Line may contain a list of distinct IDs and/or ID ranges
+#define oneIdExp "([A-Z_]+)([0-9]+)"
+#define rangeEndExp "- *([A-Z_]*)([0-9]+)"
+#define rangeListExp "^("oneIdExp" *("rangeEndExp")?),? *"
+// A decimal number with at most this many digits always fits in a long long, so parsing it
+// can't saturate and counting up to it can't overflow.
+#define maxIdDigits 18
+while (regexMatchSubstr(line, rangeListExp, substrArr, ArraySize(substrArr)))
+    {
+    // substrArr: [0] everything consumed, [1] the ID or range, [2],[3] prefix and number of the
+    // first ID, [4] range end (only if present), [5],[6] range end's prefix (may be empty) and number.
+    char *prefixA = regexSubstringClone(line, substrArr[2]);
+    char *numberA = regexSubstringClone(line, substrArr[3]);
+    // Large enough: each ID built below is no longer than the matched text it was parsed from.
+    char oneId[strlen(line)+1];
+    if (regexSubstrMatched(substrArr[4]))
+        {
+        // Looks like a well-formed ID range
+        char *prefixB = regexSubstringClone(line, substrArr[5]);
+        char *numberB = regexSubstringClone(line, substrArr[6]);
+        // Parse into long long rather than narrowing atol() to int, which made large numbers
+        // wrap around and look up unrelated IDs.
+        long long start = strtoll(numberA, NULL, 10);
+        long long end = strtoll(numberB, NULL, 10);
+        if ((isEmpty(prefixB) || sameString(prefixA, prefixB)) && end >= start &&
+            strlen(numberA) <= maxIdDigits && strlen(numberB) <= maxIdDigits)
+            {
+            // NOTE(review): the size of the range comes straight from user input and is not
+            // limited here -- consider capping it if huge ranges are a concern.
+            long long num;
+            for (num = start;  num <= end;  num++)
+                {
+                safef(oneId, sizeof oneId, "%s%lld", prefixA, num);
+                char *match = hashFindVal(nameHash, oneId);
+                foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
+                }
+            }
+        else
+            {
+            // It matched the regex but the prefixes don't match, the numbers are out of order
+            // or they are too long to count through, so we don't know what to do with it --
+            // try matchName just in case.
+            char *regMatch = regexSubstringClone(line, substrArr[1]);
+            char *match = matchName(nameHash, regMatch);
+            foundAny |= tallyMatch(match, regMatch, retMatches, retUnmatched);
+            // The lists hold their own copies of names (they are also handed the stack buffer
+            // oneId), so the clone can be released now.
+            freeMem(regMatch);
+            }
+        freeMem(prefixB);
+        freeMem(numberB);
+        }
+    else
+        {
+        // Just one ID
+        safef(oneId, sizeof oneId, "%s%s", prefixA, numberA);
+        char *match = hashFindVal(nameHash, oneId);
+        foundAny |= tallyMatch(match, oneId, retMatches, retUnmatched);
+        }
+    // Release this iteration's clones; they used to leak once per ID or range in the line.
+    freeMem(prefixA);
+    freeMem(numberA);
+    // Skip past this match to see if the line has another range next.
+    line += (substrArr[0].rm_eo - substrArr[0].rm_so);
+    }
+return foundAny;
+}
+
 static struct slName *readSampleIds(struct lineFile *lf, struct mutationAnnotatedTree *bigTree,
                                     char *aliasFile)
 /* Read a file of sample names/IDs from the user; typically these will not be exactly the same
  * as the protobuf's (UCSC protobuf names are typically country/isolate/year|ID|date), so attempt
  * to find component matches if an exact match isn't found. */
 {
 struct slName *sampleIds = NULL;
 struct slName *unmatched = NULL;
 struct hash *nameHash = getTreeNames(bigTree, aliasFile, TRUE);
 char *line;
 while (lineFileNext(lf, &line, NULL))
     {
     // If tab-sep or comma-sep, just try first word in line
     char *tab = strchr(line, '\t');
     if (tab)
         *tab = '\0';
     else
         {
         char *comma = strchr(line, ',');
         if (comma)
             *comma = '\0';
         }
     char *match = matchName(nameHash, line);
     if (match)
         slNameAddHead(&sampleIds, match);
     else
+        {
+        struct slName *rangeMatches = NULL, *rangeUnmatched = NULL;
+        if (matchIdRange(nameHash, line, &rangeMatches, &rangeUnmatched))
+            {
+            sampleIds = slCat(rangeMatches, sampleIds);
+            unmatched = slCat(rangeUnmatched, unmatched);
+            }
+        else
             slNameAddHead(&unmatched, line);
         }
+    }
 if (unmatched)
     {
     struct dyString *firstFew = dyStringNew(0);
     int maxExamples = 5;
     struct slName *example;
     int i;
     for (i = 0, example = unmatched;  example != NULL && i < maxExamples;
          i++, example = example->next)
         {
         dyStringAppendSep(firstFew, ", ");
         dyStringPrintf(firstFew, "'%s'", example->name);
         }
     warn("Unable to find %d of your sequences in the tree, e.g. %s",
          slCount(unmatched), firstFew->string);
     dyStringFree(&firstFew);