904b4e4ad971a5be51a26dee4809efbe9c7b6430 angie Thu Aug 10 13:22:26 2017 -0700 Support XM_/XR_/XP_ accs in HGVS position search. Generate correct HGVS p.*ext terms for stop codon loss. refs #19968 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 77fc0eb..e00c269 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -30,35 +30,35 @@ #define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])" #define lrgRegionExp "(LRG_[0-9+])" // NC = RefSeq reference assembly chromosome // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol -#define versionedRefSeqNCExp versionedAccPrefixExp("NC") -#define versionedRefSeqNGExp versionedAccPrefixExp("NG") -#define versionedRefSeqNMExp versionedAccPrefixExp("NM") -#define versionedRefSeqNPExp versionedAccPrefixExp("NP") -#define versionedRefSeqNMRExp versionedAccPrefixExp("N[MR]") +#define versionedRefSeqNCExp versionedAccPrefixExp("[NX]C") +#define versionedRefSeqNGExp versionedAccPrefixExp("[NX]G") +#define versionedRefSeqNMExp versionedAccPrefixExp("[NX]M") +#define versionedRefSeqNPExp versionedAccPrefixExp("[NX]P") +#define versionedRefSeqNMRExp versionedAccPrefixExp("[NX][MR]") // Nucleotide position regexes // (c. = CDS, g. = genomic, m. = mitochondrial, n.= non-coding RNA, r. = RNA) #define posIntExp "([0-9]+)" #define hgvsGenoPosExp posIntExp "(_" posIntExp ")?" // ...... 1-based start position // ............. optional range separator and end position // ...... 1-based end position // n. terms can have exonic anchor base and intron offset for both start and end: #define offsetExp "([-+])" // c. terms may also have a UTR indicator before the anchor base (- for UTR5, * for UTR3) #define anchorExp "([-*])?" #define complexNumExp anchorExp posIntExp "(" offsetExp posIntExp ")?" #define hgvsCdsPosExp complexNumExp "(_" complexNumExp ")?" // ... optional UTR anchor "-" or "*" @@ -1324,31 +1324,32 @@ pslFree(&variantPsl); pslFree(&txAli); } sqlFreeResult(&sr); hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where * acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli"; -else if (startsWith("NM_", acc) || startsWith("NR_", acc)) +else if (startsWith("NM_", acc) || startsWith("NR_", acc) || + startsWith("XM_", acc) || startsWith("XR_", acc)) { // Use NCBI's alignments if they are available if (hDbHasNcbiRefSeq(db)) pslTable = "ncbiRefSeqPsl"; else pslTable = "refSeqAli"; } return pslTable; } #define limitToRange(val, min, max) { if (val < min) { val = min; } \ if (val > max) { val = max; } } static struct bed *pslAndFriendsToRegion(struct psl *psl, struct hgvsVariant *hgvs, int upstream, int downstream) @@ -1429,31 +1430,31 @@ if (region) { adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); if (retPslTable) *retPslTable = cloneString(pslTable); } return region; } static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { struct bed *region = NULL; char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); -if (acc && startsWith("NP_", acc)) +if (acc && (startsWith("NP_", acc) || startsWith("XP_", acc))) { // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (hDbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); else return NULL; char *nmAcc = sqlQuickString(conn, query); hFreeConn(&conn); if (nmAcc) @@ -2538,73 +2539,100 @@ return dyStringCannibalize(&dy); } char *hgvsPFromVpPep(struct vpPep *vpPep, struct dnaSeq *protSeq, boolean addParens) /* Return an HGVS p. (protein) term for a variant projected into protein space. * Strict HGVS compliance requires parentheses around predicted protein changes, but * nobody seems to do that in practice. * Return NULL if an input is NULL. */ { if (vpPep == NULL || protSeq == NULL) return NULL; struct dyString *dy = dyStringCreate("%s:p.", vpPep->name); if (addParens) dyStringAppendC(dy, '('); int refLen = vpPep->end - vpPep->start; +// When predicting frameshift/extension, the length of ref may be different from refLen +int refExtLen = vpPep->ref ? strlen(vpPep->ref) : refLen; int altLen = vpPep->alt ? strlen(vpPep->alt) : 0; -char *pSeq = protSeq->dna; char refStartAbbr[4]; -aaToAbbr(pSeq[vpPep->start], refStartAbbr, sizeof(refStartAbbr)); +if (vpPep->ref) + aaToAbbr(vpPep->ref[0], refStartAbbr, sizeof(refStartAbbr)); +else + // If ref is null then we should be writing just '=' or '?' but prevent garbage just in case: + safecpy(refStartAbbr, sizeof(refStartAbbr), "?"); +// protSeq may or may not end with X, so treat protSeq->size accordingly +boolean hitsStopCodon = (vpPep->end > protSeq->size || + ((protSeq->dna[protSeq->size-1] == 'X') && vpPep->end == protSeq->size)); if (vpPep->likelyNoChange) dyStringAppend(dy, "="); else if (vpPep->cantPredict || vpPep->spansUtrCds) dyStringAppend(dy, "?"); else if (vpPep->frameshift) { + dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1); if (altLen == 1) - dyStringPrintf(dy, "%s%dTer", refStartAbbr, vpPep->start+1); + dyStringAppend(dy, "Ter"); else { char altStartAbbr[4]; aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr)); - dyStringPrintf(dy, "%s%d%sfsTer%d", refStartAbbr, vpPep->start+1, altStartAbbr, altLen); + // For stop-loss extension, make it "ext*" + if (hitsStopCodon && altLen > refExtLen) + dyStringPrintf(dy, "%sext*%d", altStartAbbr, altLen - refExtLen); + else + dyStringPrintf(dy, "%sfsTer%d", altStartAbbr, altLen); } } +else if (hitsStopCodon && altLen > refExtLen) + { + // Stop loss extension from something that doesn't disrupt frame + char altStartAbbr[4]; + aaToAbbr(vpPep->alt[0], altStartAbbr, sizeof(altStartAbbr)); + dyStringPrintf(dy, "%s%d%sext*%d", refStartAbbr, vpPep->start+1, + altStartAbbr, altLen - refExtLen); + } else { int dupLen = 0; if (refLen == 0 && altLen > 0) { // It's an insertion; is it a duplication? struct seqWindow *pSeqWin = memSeqWindowNew(protSeq->name, protSeq->dna); dupLen = findDup(vpPep->alt, pSeqWin, vpPep->start, FALSE); memSeqWindowFree(&pSeqWin); } if (refLen == 1) dyStringPrintf(dy, "%s%d", refStartAbbr, vpPep->start+1); else { int rangeStart = vpPep->start, rangeEnd = vpPep->end; + char *pSeq = protSeq->dna; if (dupLen > 0) { // Duplication; position range changes to preceding bases. rangeEnd = rangeStart; rangeStart -= dupLen; aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr)); } else if (refLen == 0) { // Insertion; expand to two-AA range around insertion point rangeStart--; aaToAbbr(pSeq[rangeStart], refStartAbbr, sizeof(refStartAbbr)); rangeEnd++; } + hitsStopCodon = (rangeEnd > protSeq->size || + ((protSeq->dna[protSeq->size-1] == 'X') && rangeEnd == protSeq->size)); char refLastAbbr[4]; + if (hitsStopCodon) + aaToAbbr('X', refLastAbbr, sizeof(refLastAbbr)); + else aaToAbbr(pSeq[rangeEnd-1], refLastAbbr, sizeof(refLastAbbr)); dyStringPrintf(dy, "%s%d_%s%d", refStartAbbr, rangeStart+1, refLastAbbr, rangeEnd); } hgvsAppendChangesFromPepRefAlt(dy, vpPep->ref, vpPep->alt, dupLen); } if (addParens) dyStringAppendC(dy, ')'); return dyStringCannibalize(&dy); }