b9af5d88785ba21a316027a5e22e5fbd5fb9a1af angie Mon Jul 31 10:34:03 2017 -0700 Added support for "chr" sequence names in HGVS genomic (g.) terms. refs #19832 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 7bc0ae6..9e490a7 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -206,30 +206,41 @@ #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7................. optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10..... change description // Pseudo-HGVS in common usage +// g. with "chr" ID: +#define pseudoHgvsChrGDotExp hgvsFullRegex("(chr[0-9A-Za-z_]+)", hgvsGMDotPosExp) "(.*)" +// substring numbering: +// 0..................................... whole matching string +// 1........... chr... +// 2. g or m +// 3...... 1-based start position +// 4....... optional range separator and end position +// 5..... 1-based end position +// 6.... change description + // Sometimes users give an NM_ accession, but a protein change. #define pseudoHgvsNMPDotSubstExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAminoAcidSubstExp // substring numbering: // 0..................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence #define pseudoHgvsNMPDotRangeExp "^" versionedRefSeqNMExp "[ :]+p?\\.?" hgvsAaRangeExp // substring numbering: // 0..................................................... whole matching string @@ -699,30 +710,46 @@ char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *nmAcc = nmForGeneSymbol(db, geneSymbol); if (isNotEmpty(nmAcc)) { // Make it a real HGVS term with the NM and pass that on to the usual parser. int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3; char *description = term + substrs[descStartIx].rm_so; struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s", nmAcc, geneSymbol, description); hgvs = hgvsParseTerm(nmTerm->string); dyStringFree(&nmTerm); freeMem(nmAcc); } } +else if (regexMatchSubstr(term, pseudoHgvsChrGDotExp, substrs, ArraySize(substrs))) + { + int chrIx = 1; + int startPosIx = 3; + int endPosIx = 5; + int changeIx = 6; + AllocVar(hgvs); + hgvs->type = hgvstGenomic; + hgvs->seqAcc = regexSubstringClone(term, substrs[chrIx]); + hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); + if (regexSubstrMatched(substrs[endPosIx])) + hgvs->end = regexSubstringInt(term, substrs[endPosIx]); + else + hgvs->end = hgvs->start1; + hgvs->changes = regexSubstringClone(term, substrs[changeIx]); + } return hgvs; } static char refBaseFromNucSubst(char *change) /* If sequence change is a nucleotide substitution, return the reference base, else null char. */ { if (regexMatchNoCase(change, "^([ACGTU])>")) return toupper(change[0]); return '\0'; } static void hgvsStartEndToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int *retStart, int *retEnd) /* Convert HGVS's fully closed, 1-based-unless-negative start and end to UCSC start and end. */ { // If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive.