17743793f41d56eb703ba368534cbdad0c1b735e
angie
  Sat Dec 31 13:09:33 2022 -0800
When parsing GB accession out of full name, look for | so we skip over embedded reference accession in isolate name.  Reported in https://github.com/cov-lineages/pango-designation/issues/1494

diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c
index 8ae9ea4..a8b9492 100644
--- src/hg/hgPhyloPlace/phyloPlace.c
+++ src/hg/hgPhyloPlace/phyloPlace.c
@@ -864,35 +864,36 @@
 if (epiId)
     {
     char *p = epiId + strlen("EPI_ISL_");
     while (isdigit(*p))
         p++;
     *p = '\0';
     }
 return epiId;
 }
 
 char *gbIdFromSampleName(char *sampleId)
-/* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */
+/* If sampleId starts with a GenBank accession, or has one right after a |, return it, otherwise NULL. */
 {
 char *gbId = NULL;
-regmatch_t substrs[2];
-if (regexMatchSubstr(sampleId, "([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs)))
+regmatch_t substrs[3];  // [0] = whole match, [1] = optional prefix through |, [2] = accession
+// If it's preceded by anything, make sure it's a | so we ignore isolate names that glom on the
+// reference's GenBank accession in them (e.g. ..._MN908947.3/2022|OQ070230.1).
+if (regexMatchSubstr(sampleId, "^(.*\\|)?([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs)))
     {
-    // Make sure there are word boundaries around the match
-    if ((substrs[1].rm_so == 0 || !isalnum(sampleId[substrs[1].rm_so-1])) &&
-        !isalnum(sampleId[substrs[1].rm_eo]))
-        gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so);
+    // Word boundary at the end too; the accession is group 2 (substrs[1] is the prefix, if any).
+    if (!isalnum(sampleId[substrs[2].rm_eo]))
+        gbId = cloneStringZ(sampleId+substrs[2].rm_so, substrs[2].rm_eo - substrs[2].rm_so);
     }
 return gbId;
 }
 
 struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId)
 /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */
 {
 struct sampleMetadata *met = NULL;
 if (sampleMetadata == NULL)
     return NULL;
 char *epiId = epiIdFromSampleName(sampleId);
 if (epiId)
     met = hashFindVal(sampleMetadata, epiId);
 if (!met)