029e75e2b3558b8c136e8ea4761bf181747627ac angie Wed Jun 7 11:43:56 2017 -0700 Doh, HGVS n. search was not supported for NR_ terms with which it is usually used. refs #19423 note-3 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 4ba4bfa..ec5efb2 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -33,31 +33,31 @@ // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("NC") #define versionedRefSeqNGExp versionedAccPrefixExp("NG") #define versionedRefSeqNMExp versionedAccPrefixExp("NM") #define versionedRefSeqNPExp versionedAccPrefixExp("NP") -#define versionedRefSeqNRExp versionedAccPrefixExp("NR") +#define versionedRefSeqNMRExp versionedAccPrefixExp("N[MR]") // Nucleotide position regexes // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA) #define posIntExp "([0-9]+)" #define hgvsNtPosExp posIntExp "(_" posIntExp ")?" // ...... 1-based start position // ............. optional range separator and end position // ...... 1-based end position // Now for a regex that can handle positions like "26" or "*80" or "-24+75_-24+92"... #define anchorExp "([-*])?" #define offsetExp "([-+])" #define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?" #define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?" // ... optional anchor "-" or "*" // ... mandatory first number @@ -110,31 +110,31 @@ // 4.... (n/a) optional gene symbol // 5. g or m // 6.. 1-based start position // 7... optional range separator and end position // 8.. 1-based end position // 9... change description #define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)" // 0..................................... whole matching string // 1................... LRG transcript // 2...... 1-based start position // 3.......... optional range separator and end position // 4..... 1-based end position // 5... change description -#define hgvsRefSeqNMNDotExp hgvsFullRegex(versionedRefSeqNMExp, hgvsNDotPosExp) "(.*)" +#define hgvsRefSeqNMRNDotExp hgvsFullRegex(versionedRefSeqNMRExp, hgvsNDotPosExp) "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version // 3...... (n/a) optional gene symbol in ()s // 4.... (n/a) optional gene symbol // 5.. 1-based start position // 6... optional range separator and end position // 7.. 1-based end position // 8... change description #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" // substring numbering: // 0..................................................... whole matching string @@ -288,31 +288,31 @@ struct hgvsVariant *hgvs = NULL; boolean matches = FALSE; int accIx = 1; int startPosIx = 2; int endPosIx = 4; int changeIx = 5; // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that // affects all substring offsets after the accession. int refSeqExtra = 3; int geneSymbolIx = -1; regmatch_t substrs[10]; if (regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs))) { matches = TRUE; } -else if (regexMatchSubstr(term, hgvsRefSeqNMNDotExp, substrs, ArraySize(substrs))) +else if (regexMatchSubstr(term, hgvsRefSeqNMRNDotExp, substrs, ArraySize(substrs))) { matches = TRUE; geneSymbolIx = 4; startPosIx += refSeqExtra; endPosIx += refSeqExtra; changeIx += refSeqExtra; } if (matches) { AllocVar(hgvs); hgvs->type = hgvstNoncoding; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); @@ -1300,31 +1300,31 @@ pslFree(&variantPsl); pslFree(&txAli); } sqlFreeResult(&sr); hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where * acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli"; -else if (startsWith("NM_", acc)) +else if (startsWith("NM_", acc) || startsWith("NR_", acc)) { // Use NCBI's alignments if they are available if (dbHasNcbiRefSeq(db)) pslTable = "ncbiRefSeqPsl"; else pslTable = "refSeqAli"; } return pslTable; } #define limitToRange(val, min, max) { if (val < min) { val = min; } \ if (val > max) { val = max; } } static struct bed *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs, int upstream, int downstream)