17743793f41d56eb703ba368534cbdad0c1b735e angie Sat Dec 31 13:09:33 2022 -0800 When parsing GB accession out of full name, look for | so we skip over embedded reference accession in isolate name. Reported in https://github.com/cov-lineages/pango-designation/issues/1494 diff --git src/hg/hgPhyloPlace/phyloPlace.c src/hg/hgPhyloPlace/phyloPlace.c index 8ae9ea4..a8b9492 100644 --- src/hg/hgPhyloPlace/phyloPlace.c +++ src/hg/hgPhyloPlace/phyloPlace.c @@ -864,35 +864,36 @@ if (epiId) { char *p = epiId + strlen("EPI_ISL_"); while (isdigit(*p)) p++; *p = '\0'; } return epiId; } char *gbIdFromSampleName(char *sampleId) /* If a GenBank accession is present somewhere in sampleId, extract and return it, otherwise NULL. */ { char *gbId = NULL; regmatch_t substrs[2]; -if (regexMatchSubstr(sampleId, "([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs))) +// If it's preceded by anything, make sure it's a | so we ignore isolate names that glom on the +// reference's GenBank accession in them (e.g. ..._MN908947.3/2022|OQ070230.1). +if (regexMatchSubstr(sampleId, "^(.*\\|)?([A-Z][A-Z][0-9]{6})", substrs, ArraySize(substrs))) { - // Make sure there are word boundaries around the match - if ((substrs[1].rm_so == 0 || !isalnum(sampleId[substrs[1].rm_so-1])) && - !isalnum(sampleId[substrs[1].rm_eo])) + // Make sure there is a word boundary at the end of the match too + if (!isalnum(sampleId[substrs[1].rm_eo])) gbId = cloneStringZ(sampleId+substrs[1].rm_so, substrs[1].rm_eo - substrs[1].rm_so); } return gbId; } struct sampleMetadata *metadataForSample(struct hash *sampleMetadata, char *sampleId) /* Look up sampleId in sampleMetadata, by accession if sampleId seems to include an accession. */ { struct sampleMetadata *met = NULL; if (sampleMetadata == NULL) return NULL; char *epiId = epiIdFromSampleName(sampleId); if (epiId) met = hashFindVal(sampleMetadata, epiId); if (!met)