5f9ba1691bd3051146f2f9464c9d1375e42f6ff4 angie Thu Mar 8 11:36:57 2018 -0800 Added support for ENS*T* transcript IDs in HGVS position search, using the latest Gencode. Added support for parsing ENS*P* protein IDs, but can't yet map those to the genome because our Gencode tables don't yet include a mapping between transcript and protein IDs. refs #21076 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index c295038..44ab4f3 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -16,35 +16,44 @@ #include "trackHub.h" void hgvsVariantFree(struct hgvsVariant **pHgvs) // Free *pHgvs and its contents, and set *pHgvs to NULL. { if (pHgvs && *pHgvs) { struct hgvsVariant *hgvs = *pHgvs; freez(&hgvs->seqAcc); freez(&hgvs->seqGeneSymbol); freez(&hgvs->changes); freez(pHgvs); } } -// Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq: +// Regular expressions for HGVS-recognized sequence accessions: LRG, ENS* or versioned RefSeq: #define lrgTranscriptExp "(LRG_[0-9]+t[0-9]+)" #define lrgProteinExp "(LRG_[0-9]+p[0-9]+)" #define lrgRegionExp "(LRG_[0-9]+)" +#define ensTranscriptExp "(ENS([A-Z]{3})?T[0-9]+\\.[0-9]+)" +// 0..................................... whole matching string +// 1..................................... ENS transcript ID including optional lift suffix +// 2... optional non-human species code e.g. MUS for mouse +#define ensProteinExp "(ENS([A-Z]{3})?P[0-9]+\\.[0-9]+)" +// 0..................................... whole matching string +// 1..................................... ENS protein ID +// 2... optional non-human species code e.g. MUS for mouse + // NC = RefSeq reference assembly chromosome (NT = contig (e.g. alt), NW = patch) // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("[NX][CTW]") #define versionedRefSeqNGExp versionedAccPrefixExp("[NX]G") #define versionedRefSeqNMExp versionedAccPrefixExp("[NX]M") @@ -112,144 +121,202 @@ // 0..................................... whole matching string // 1....................... LRG region // 2. g or m // 3.. 1-based start position // 4... optional range separator and end position // 5.. 1-based end position // 6... change description #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version -// 3...... (n/a) optional gene symbol in ()s -// 4.... (n/a) optional gene symbol +// 3...... optional gene symbol in ()s +// 4.... optional gene symbol // 5. g or m // 6.. 1-based start position // 7... optional range separator and end position // 8.. 1-based end position // 9... change description #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) #define hgvsLrgCDotExp hgvsLrgCDotPosExp "(.*)" // substring numbering: // 0..................................................... whole matching string -// 1................... LRG transcript +// 1............. LRG transcript // 2... optional UTR anchor "-" or "*" // 3... mandatory first number // 4....... optional offset separator and offset // 5... offset separator // 6... offset number // 7............... optional range sep and complex num // 8... optional UTR anchor "-" or "*" // 9... first number // 10....... optional offset separator and offset // 11... offset separator // 12... offset number // 13........ sequence change +#define hgvsEnsCDotPosExp hgvsFullRegex(ensTranscriptExp, hgvsCDotPosExp) +#define hgvsEnsCDotExp hgvsEnsCDotPosExp "(.*)" +// substring numbering: +// 0..................................................... whole matching string +// 1............ ENS transcript ID +// 2... optional non-human species code +// 3... optional UTR anchor "-" or "*" +// 4... mandatory first number +// 5....... optional offset separator and offset +// 6... offset separator +// 7... offset number +// 8............... optional range sep and complex num +// 9... optional UTR anchor "-" or "*" +// 10... first number +// 11....... optional offset separator and offset +// 12... offset separator +// 13... offset number +// 14........ sequence change + #define hgvsRefSeqNMCDotPosExp hgvsFullRegex(versionedRefSeqNMExp, hgvsCDotPosExp) #define hgvsRefSeqNMCDotExp hgvsRefSeqNMCDotPosExp "(.*)" // substring numbering: // 0............................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... optional UTR anchor // 6... mandatory first number // 7....... optional offset // 8... offset separator // 9... offset number // 10............... optional range // 11... optional UTR anchor // 12... first number // 13....... optional offset // 14... offset separator // 15... offset number // 16........ sequence change #define hgvsLrgNDotExp hgvsFullRegex(lrgTranscriptExp, hgvsNDotPosExp) "(.*)" // substring numbering: // 0..................................................... whole matching string -// 1................... LRG transcript -// 2... n/a 4 n.: UTR anchor "-" or "*" +// 1............. LRG transcript +// 2... UTR anchor "-" or "*" // 3... mandatory first number // 4....... optional offset separator and offset // 5... offset separator // 6... offset number // 7............... optional range sep and complex num -// 8... n/a 4 n.: UTR anchor "-" or "*" +// 8... UTR anchor "-" or "*" // 9... first number // 10....... optional offset separator and offset // 11... offset separator // 12... offset number // 13........ sequence change +#define hgvsEnsNDotExp hgvsFullRegex(ensTranscriptExp, hgvsNDotPosExp) "(.*)" +// substring numbering: +// 0..................................................... whole matching string +// 1............ ENS transcript ID +// 2... optional non-human species code +// 3... UTR anchor "-" or "*" +// 4... mandatory first number +// 5....... optional offset separator and offset +// 6... offset separator +// 7... offset number +// 8............... optional range sep and complex num +// 9... UTR anchor "-" or "*" +// 10... first number +// 11....... optional offset separator and offset +// 12... offset separator +// 13... offset number +// 14........ sequence change + #define hgvsRefSeqNMRNDotExp hgvsFullRegex(versionedRefSeqNMRExp, hgvsNDotPosExp) "(.*)" // substring numbering: // 0............................................................... whole matching string // 1............... acc & optional dot version // 2........ optional dot version // 3..... optional gene sym in ()s // 4... optional gene symbol -// 5... n/a 4 n.: UTR anchor +// 5... UTR anchor // 6... mandatory first number // 7....... optional offset // 8... offset separator // 9... offset number // 10............... optional range -// 11... n/a 4 n.: UTR anchor +// 11... UTR anchor // 12... first number // 13....... optional offset // 14... offset separator // 15... offset number // 16........ sequence change #define hgvsLrgPDotSubstExp hgvsFullRegex(lrgProteinExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1............................ LRG protein // 2..... original sequence // 3...... 1-based position // 4...... replacement sequence +#define hgvsEnsPDotSubstExp hgvsFullRegex(ensProteinExp, hgvsPDotSubstExp) +// substring numbering: +// 0..................................................... whole matching string +// 1........................... ENS protein ID +// 2... optional non-human species code +// 3..... original sequence +// 4...... 1-based position +// 5...... replacement sequence + #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence #define hgvsLrgPDotRangeExp hgvsFullRegex(lrgProteinExp, hgvsPDotRangeExp) // substring numbering: // 0..................................................... whole matching string // 1.......................... accession and optional dot version // 2... original start AA // 3... 1-based start position // 4................. optional range sep and AA+pos // 5... original end AA // 6... 1-based end position // 7...... change description +#define hgvsEnsPDotRangeExp hgvsFullRegex(ensProteinExp, hgvsPDotRangeExp) +// substring numbering: +// 0..................................................... whole matching string +// 1........................... ENS protein ID +// 2... optional non-human species code +// 3... original start AA +// 4... 1-based start position +// 5................. optional range sep and AA+pos +// 6... original end AA +// 7... 1-based end position +// 8...... change description + #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7................. optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10..... change description // Pseudo-HGVS in common usage @@ -390,49 +457,59 @@ *retOffset = -*retOffset; } } static struct hgvsVariant *hgvsParseCNDotPos(char *term) /* If term is parseable as an HGVS CDS term, return the parsed representation, otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; boolean isNoncoding = FALSE; boolean matches = FALSE; int accIx = 1; int startAnchorIx = 2; int endAnchorIx = 8; int endPosIx = 9; int changeIx = 13; -// The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that -// affects all substring offsets after the accession. -int refSeqExtra = 3; +// The LRG accession regex has only one substring but the RefSeq acc regex has 4 and ENS has 2, +// so that affects all substring offsets after the accession. +int ixExtra = 0; int geneSymbolIx = -1; regmatch_t substrs[17]; if (regexMatchSubstr(term, hgvsLrgCDotExp, substrs, ArraySize(substrs)) || (isNoncoding = regexMatchSubstr(term, hgvsLrgNDotExp, substrs, ArraySize(substrs)))) { matches = TRUE; } +else if (regexMatchSubstr(term, hgvsEnsCDotExp, substrs, ArraySize(substrs)) || + (isNoncoding = regexMatchSubstr(term, hgvsEnsNDotExp, substrs, ArraySize(substrs)))) + { + matches = TRUE; + ixExtra = 1; + } else if (regexMatchSubstr(term, hgvsRefSeqNMCDotExp, substrs, ArraySize(substrs)) || (isNoncoding = regexMatchSubstr(term, hgvsRefSeqNMRNDotExp, substrs, ArraySize(substrs)))) { matches = TRUE; + ixExtra = 3; geneSymbolIx = 4; - startAnchorIx += refSeqExtra; - endAnchorIx += refSeqExtra; - endPosIx += refSeqExtra; - changeIx += refSeqExtra; + } +if (ixExtra > 0) + { + startAnchorIx += ixExtra; + endAnchorIx += ixExtra; + endPosIx += ixExtra; + changeIx += ixExtra; } if (matches) { AllocVar(hgvs); hgvs->type = isNoncoding ? hgvstNoncoding : hgvstCoding; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); extractComplexNum(term, substrs, startAnchorIx, &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset); if (isNoncoding && hgvs->startIsUtr3) { warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR3 (*), " "not applicable for noncoding", term); hgvs->startIsUtr3 = FALSE; } if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) @@ -458,44 +535,53 @@ } return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; boolean matches = FALSE; int accIx = 1; int refIx = 2; int posIx = 3; int altIx = 4; int geneSymbolIx = -1; +// The LRG accession regex has only one substring but the RefSeq acc regex has 4 and ENS has 2, +// so that affects all substring offsets after the accession. +int ixExtra = 0; regmatch_t substrs[8]; if (regexMatchSubstr(term, hgvsLrgPDotSubstExp, substrs, ArraySize(substrs))) matches = TRUE; +else if (regexMatchSubstr(term, hgvsEnsPDotSubstExp, substrs, ArraySize(substrs))) + { + ixExtra = 1; + matches = TRUE; + } else if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) { matches = TRUE; - // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that - // affects all substring offsets after the accession. - int refSeqExtra = 3; - refIx += refSeqExtra; - posIx += refSeqExtra; - altIx += refSeqExtra; + ixExtra = 3; geneSymbolIx = 4; } +if (ixExtra > 0) + { + refIx += ixExtra; + posIx += ixExtra; + altIx += ixExtra; + } if (matches) { AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[refIx].rm_so; int changeEnd = substrs[altIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->changes = cloneString(change); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[posIx]); hgvs->end = hgvs->start1; @@ -503,45 +589,54 @@ return hgvs; } static struct hgvsVariant *hgvsParsePDotRange(char *term) /* If term is parseable as an HGVS protein range change, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; boolean matches = FALSE; int accIx = 1; int startRefIx = 2; int startPosIx = 3; int endPosIx = 6; int changeIx = 7; int geneSymbolIx = -1; +// The LRG accession regex has only one substring but the RefSeq acc regex has 4 and ENS has 2, +// so that affects all substring offsets after the accession. +int ixExtra = 0; regmatch_t substrs[11]; if (regexMatchSubstr(term, hgvsLrgPDotRangeExp, substrs, ArraySize(substrs))) matches = TRUE; +else if (regexMatchSubstr(term, hgvsEnsPDotRangeExp, substrs, ArraySize(substrs))) + { + matches = TRUE; + ixExtra = 1; + } else if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs))) { matches = TRUE; - // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that - // affects all substring offsets after the accession. - int refSeqExtra = 3; - startRefIx += refSeqExtra; - startPosIx += refSeqExtra; - endPosIx += refSeqExtra; - changeIx += refSeqExtra; + ixExtra = 3; geneSymbolIx = 4; } +if (ixExtra > 0) + { + startRefIx += ixExtra; + startPosIx += ixExtra; + endPosIx += ixExtra; + changeIx += ixExtra; + } if (matches) { AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[startRefIx].rm_so; int changeEnd = substrs[changeIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->changes = cloneString(change); if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) @@ -958,83 +1053,156 @@ // Since we found hgvs->seqAcc, split it into versionless and version parts. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); } // Don't free chromInfo, it's cached! } } return coordsOK; } +static char *txSeqFromGp(char *db, struct genePred *gp) +/* Return transcribed-from-genome sequence for gp */ +{ +int seqLen = 0; +int i; +for (i = 0; i < gp->exonCount; i++) + seqLen += (gp->exonEnds[i] - gp->exonStarts[i]); +char *seq = needMem(seqLen+1); +int offset = 0; +for (i = 0; i < gp->exonCount; i++) + { + int exonStart = gp->exonStarts[i]; + int exonEnd = gp->exonEnds[i]; + struct dnaSeq *exonSeq = hChromSeq(db, gp->chrom, exonStart, exonEnd); + safencpy(seq+offset, seqLen+1-offset, exonSeq->dna, exonSeq->size); + offset += exonSeq->size; + dnaSeqFree(&exonSeq); + } +if (gp->strand[0] == '-') + reverseComplement(seq, seqLen); +return seq; +} + +static struct genePred *getGencodeGp(char *db, char *acc) +/* Return the genePred for acc in the latest wgEncodeGencodeBasicV* table, or NULL if not found. */ +{ +struct genePred *gp = NULL; +struct sqlConnection *conn = hAllocConn(db); +char *gencodeTable = hFindLatestGencodeTableConn(conn, "Basic"); +if (gencodeTable) + { + int hasBin = 1; + char query[2048]; + sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", gencodeTable, acc); + struct sqlResult *sr = sqlGetResult(conn, query); + char **row; + if ((row = sqlNextRow(sr)) != NULL) + gp = genePredExtLoad(row+hasBin, GENEPREDX_NUM_COLS); + sqlFreeResult(&sr); + } +hFreeConn(&conn); +return gp; +} + +static char *getGencodeSeq(char *db, char *acc) +/* Return transcribed-from-genome sequence for acc, or NULL if not found. */ +{ +char *seq = NULL; +struct genePred *gp = getGencodeGp(db, acc); +if (gp) + seq = txSeqFromGp(db, gp); +return seq; +} + static char *getCdnaSeq(char *db, char *acc) /* Return cdna sequence for acc, or NULL if not found. */ { if (trackHubDatabase(db)) return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgCdna")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } } +else if (startsWith("ENS", acc )) + { + // Construct it from the genome I guess? + seq = getGencodeSeq(db, acc); + } else { struct dnaSeq *cdnaSeq = NULL; if (hDbHasNcbiRefSeq(db)) cdnaSeq = hDnaSeqGet(db, acc, "seqNcbiRefSeq", "extNcbiRefSeq"); else cdnaSeq = hGenBankGetMrna(db, acc, NULL); if (cdnaSeq) seq = dnaSeqCannibalize(&cdnaSeq); } return seq; } static boolean getCds(char *db, char *acc, struct genbankCds *retCds) -/* Get the CDS info for genbank or LRG acc; return FALSE if not found or not applicable. */ +/* Get the CDS info for genbank/LRG/ENS acc; return FALSE if not found or not applicable. */ { if (trackHubDatabase(db)) return FALSE; boolean gotCds = FALSE; +char *cdsStr = NULL; char query[1024]; +if (startsWith("ENS", acc)) + { + // Infer CDS from genePred coords + struct genePred *gp = getGencodeGp(db, acc); + if (gp) + { + genePredToCds(gp, retCds); + gotCds = (retCds->end > retCds->start); + } + } +else + { if (startsWith("LRG_", acc)) sqlSafef(query, sizeof(query), "select cds from lrgCds where id = '%s'", acc); else if (hDbHasNcbiRefSeq(db) && // This is a hack to allow us to fall back on refSeqAli if ncbiRefSeqPsl is incomplete: strchr(acc, '.')) sqlSafef(query, sizeof(query), "select cds from ncbiRefSeqCds where id = '%s'", acc); else sqlSafef(query, sizeof(query), "SELECT c.name FROM %s as c, %s as g WHERE (g.acc = '%s') AND " "(g.cds != 0) AND (g.cds = c.id)", cdsTable, gbCdnaInfoTable, acc); struct sqlConnection *conn = hAllocConn(db); char cdsBuf[2048]; -char *cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); + cdsStr = sqlQuickQuery(conn, query, cdsBuf, sizeof(cdsBuf)); hFreeConn(&conn); + } if (isNotEmpty(cdsStr)) gotCds = (genbankCdsParse(cdsStr, retCds) && retCds->startComplete && retCds->start != retCds->end); return gotCds; } static char refBaseFromProt(char *change) /* If change starts with an amino acid 3-letter or 1-letter code then return the 1-letter code, * else null char. */ { regmatch_t substrs[1]; if (regexMatchSubstr(change, "^" hgvsAminoAcidExp, substrs, ArraySize(substrs))) { char match[256]; // 1- or 3-letter code regexSubstringCopy(change, substrs[0], match, sizeof(match)); @@ -1089,44 +1257,52 @@ static int getGbVersion(char *db, char *acc) /* Return the local version that we have for acc. */ { if (trackHubDatabase(db)) return 0; char query[2048]; sqlSafef(query, sizeof(query), "select version from %s where acc = '%s'", gbSeqTable, acc); struct sqlConnection *conn = hAllocConn(db); int version = sqlQuickNum(conn, query); hFreeConn(&conn); return version; } static char *normalizeVersion(char *db, char *acc, int *retFoundVersion) /* LRG accessions don't have versions, so just return the same acc and set *retFoundVersion to 0. + * ENS accessions must have versions. * The user may give us a RefSeq accession with or without a .version. * If ncbiRefSeq tables are present, return acc with version, looking up version if necessary. * If we look it up and can't find it, this returns NULL. * If instead we're using genbank tables, return acc with no version. * That ensures that acc will be found in our local tables. */ { if (trackHubDatabase(db)) return NULL; char *normalizedAcc = NULL; int foundVersion = 0; if (startsWith("LRG_", acc)) { normalizedAcc = cloneString(acc); } +else if (startsWith("ENS", acc)) + { + normalizedAcc = cloneString(acc); + char *p = strchr(acc, '.'); + if (p) + foundVersion = atoi(p+1); + } else if (hDbHasNcbiRefSeq(db)) { // ncbiRefSeq tables need versioned accessions. if (strchr(acc, '.')) normalizedAcc = cloneString(acc); else { char query[2048]; sqlSafef(query, sizeof(query), "select name from ncbiRefSeq where name like '%s.%%'", acc); struct sqlConnection *conn = hAllocConn(db); normalizedAcc = sqlQuickString(conn, query); hFreeConn(&conn); } if (isNotEmpty(normalizedAcc)) { @@ -1501,65 +1677,51 @@ int tBlockEnd = txAli->tStarts[i] + txAli->blockSizes[i]; int tNextBlockStart = txAli->tStarts[i+1]; if (vStart >= qBlockEnd && vEnd <= qNextBlockStart) { if (tBlockEnd == tNextBlockStart) return pslDelFromCoord(txAli, tBlockEnd, variantPsl); else return pslFromGap(txAli, i, variantPsl); } } // Not contained in a deletion from reference genome (txAli target) -- return NULL. return NULL; } -static struct psl *mapPsl(char *db, struct hgvsVariant *hgvs, char *pslTable, char *acc, +static struct psl *mapPsl(struct hgvsVariant *hgvs, char *acc, struct psl *txAli, struct genbankCds *cds, int *retUpstream, int *retDownstream) -/* If acc is found in pslTable, use pslTransMap to map hgvs onto the genome. */ +/* Use pslTransMap to map hgvs onto the genome. */ { -struct psl *mappedToGenome = NULL; -char query[2048]; -sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc); -struct sqlConnection *conn = hAllocConn(db); -struct sqlResult *sr = sqlGetResult(conn, query); -char **row; -if (sr && (row = sqlNextRow(sr))) - { - int bin = 1; // All PSL tables used here, and any made in the future, use the bin column. - struct psl *txAli = pslLoad(row+bin); // variantPsl contains the anchor if a non-cdsStart anchor is used because // the actual position might be outside the bounds of the transcript sequence (intron/UTR) struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds, retUpstream, retDownstream); - mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); +struct psl *mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); // If there is a deletion in the genome / insertion in the transcript then pslTransMap cannot // map those bases to the genome. In that case take a harder look and return the deletion // coords. if (mappedToGenome == NULL) mappedToGenome = mapToDeletion(variantPsl, txAli); pslFree(&variantPsl); - pslFree(&txAli); - } -sqlFreeResult(&sr); -hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) -/* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where - * acc should be found (table may or may not exist). Don't free the returned string. */ +/* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL (or genePred for GENCODE) + * table where acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli"; else if (startsWith("NM_", acc) || startsWith("NR_", acc) || startsWith("XM_", acc) || startsWith("XR_", acc)) { // Use NCBI's alignments if they are available if (hDbHasNcbiRefSeq(db)) pslTable = "ncbiRefSeqPsl"; else pslTable = "refSeqAli"; } return pslTable; } @@ -1593,88 +1755,156 @@ if (revStrand) { region->chromStart -= downstream; region->chromEnd += upstream; } else { region->chromStart -= upstream; region->chromEnd += downstream; } limitToRange(region->chromStart, 0, psl->tSize); limitToRange(region->chromEnd, 0, psl->tSize); return region; } +static struct psl *pslForQName(char *db, char *pslTable, char *acc) +/* Look up acc in pslTable's qName column; return psl or NULL if not found. */ +{ +struct psl *psl = NULL; +char query[2048]; +struct sqlConnection *conn = hAllocConn(db); +char **row; +sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc); +struct sqlResult *sr = sqlGetResult(conn, query); +if (sr && (row = sqlNextRow(sr))) + { + // All PSL tables used here, and any made in the future, use the bin column. + int bin = 1; + psl = pslLoad(row+bin); + } +sqlFreeResult(&sr); +hFreeConn(&conn); +return psl; +} + +static struct psl *getPslAndCds(char *db, struct hgvsVariant *hgvs, char **pAcc, + struct genbankCds *cds, char **retPslTable) +/* Get PSL and cds (if applicable) for acc. */ +{ +struct psl *txAli = NULL; +cds->start = cds->end = -1; +cds->startComplete = cds->endComplete = cds->complement = FALSE; +char *acc = *pAcc; +if (startsWith("ENS", acc)) + { + struct genePred *gp = getGencodeGp(db, acc); + if (gp) + { + int qSize = 0; // infer -- no polyAs in Gencode because it's genome-based + txAli = genePredToPsl(gp, hChromSize(db, gp->chrom), qSize); + genePredToCds(gp, cds); + if (retPslTable) + { + // Well, it's not PSL but it's the track table: + struct sqlConnection *conn = hAllocConn(db); + *retPslTable = hFindLatestGencodeTableConn(conn, "Basic"); + hFreeConn(&conn); + } + } + } +else + { + char *pslTable = pslTableForAcc(db, acc); + if (pslTable && hTableExists(db, pslTable)) + { + if (hgvs->type == hgvstCoding) + getCds(db, acc, cds); + txAli = pslForQName(db, pslTable, acc); + } + // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back + // on UCSC alignments. + if (!txAli && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli")) + { + char *accNoVersion = cloneFirstWordByDelimiter(acc, '.'); + if (hgvs->type == hgvstCoding) + getCds(db, accNoVersion, cds); + txAli = pslForQName(db, "refSeqAli", accNoVersion); + if (txAli) + { + pslTable = "refSeqAli"; + *pAcc = accNoVersion; + } + } + if (txAli && retPslTable != NULL) + *retPslTable = pslTable; + } +return txAli; +} + static struct bed *hgvsMapNucToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { if (hgvs->type == hgvstGenomic) return hgvsMapGDotToGenome(db, hgvs, retPslTable); char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (isEmpty(acc)) return NULL; struct bed *region = NULL; -char *pslTable = pslTableForAcc(db, acc); +char *pslTable = NULL; struct genbankCds cds; -boolean gotCds = (hgvs->type == hgvstCoding) ? getCds(db, acc, &cds) : FALSE; -if (pslTable && (hgvs->type != hgvstCoding || gotCds) && hTableExists(db, pslTable)) +struct psl *txAli = getPslAndCds(db, hgvs, &acc, &cds, &pslTable); +boolean gotCds = (cds.end > cds.start); +if (txAli && (hgvs->type != hgvstCoding || gotCds)) { int upstream = 0, downstream = 0; - struct psl *mappedToGenome = mapPsl(db, hgvs, pslTable, acc, &cds, &upstream, &downstream); - // As of 9/26/16, ncbiRefSeqPsl is missing some items (#13673#note-443) -- so fall back - // on UCSC alignments. - if (!mappedToGenome && sameString(pslTable, "ncbiRefSeqPsl") && hTableExists(db, "refSeqAli")) - { - char *accNoVersion = cloneFirstWordByDelimiter(acc, '.'); - gotCds = (hgvs->type == hgvstCoding) ? getCds(db, accNoVersion, &cds) : FALSE; - if (hgvs->type != hgvstCoding || gotCds) - mappedToGenome = mapPsl(db, hgvs, "refSeqAli", accNoVersion, &cds, - &upstream, &downstream); - if (mappedToGenome) - { - pslTable = "refSeqAli"; - acc = accNoVersion; - } - } + struct psl *mappedToGenome = mapPsl(hgvs, acc, txAli, &cds, &upstream, &downstream); if (mappedToGenome) { region = pslAndFriendsToRegion(mappedToGenome, hgvs, upstream, downstream); pslFree(&mappedToGenome); } + pslFree(&txAli); } if (region) { adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); if (retPslTable) *retPslTable = cloneString(pslTable); } return region; } static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (isEmpty(acc)) return NULL; struct bed *region = NULL; char *txAcc = NULL; if (startsWith("LRG_", acc)) txAcc = lrgProteinToTx(db, acc); +else if (startsWith("ENS", acc)) + { + // Look up ENS*T* txAcc for ENS*P* acc + + //#*** TODO when we have a Gencode table analogous to ensGtp + + } else if (startsWith("NP_", acc) || startsWith("XP_", acc)) { struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (hDbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); else return NULL; txAcc = sqlQuickString(conn, query); hFreeConn(&conn); }