31744b6e479fbe5e49b3188e651a328cf3374ed0 angie Thu Mar 8 13:40:51 2018 -0800 Added support for ENS*P* accessions in HGVS position search. Thanks Mark for the new Attrs proteinId column! refs #21076 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 44ab4f3..ef8889b 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -741,37 +741,116 @@ return npAcc; } static char *lrgProteinToTx(char *db, char *protAcc) /* Return the LRG_ transcript accession for protAcc. Each LRG_NpM has a corresponding LRG_NtM. */ { int accLen = strlen(protAcc); char txAcc[accLen+1]; safecpy(txAcc, sizeof(txAcc), protAcc); char *p = strrchr(txAcc, 'p'); if (p) *p = 't'; return cloneString(txAcc); } +static char *gencodeProteinToTx(char *db, char *protAcc) +/* Return the ENS*T transcript accession for protAcc, or NULL if not found. */ +{ +char *txAcc = NULL; +struct sqlConnection *conn = hAllocConn(db); +char *attrsTable = hFindLatestGencodeTableConn(conn, "Attrs"); +if (attrsTable && hHasField(db, attrsTable, "proteinId")) + { + char query[2048]; + sqlSafef(query, sizeof(query), "select transcriptId from %s where proteinId = '%s'", + attrsTable, protAcc); + txAcc = sqlQuickString(conn, query); + } +hFreeConn(&conn); +return txAcc; +} + +static struct genePred *getGencodeGp(char *db, char *acc) +/* Return the genePred for acc in the latest wgEncodeGencodeBasicV* table, or NULL if not found. */ +{ +struct genePred *gp = NULL; +struct sqlConnection *conn = hAllocConn(db); +char *gencodeTable = hFindLatestGencodeTableConn(conn, "Basic"); +if (gencodeTable) + { + int hasBin = 1; + char query[2048]; + sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", gencodeTable, acc); + struct sqlResult *sr = sqlGetResult(conn, query); + char **row; + if ((row = sqlNextRow(sr)) != NULL) + gp = genePredExtLoad(row+hasBin, GENEPREDX_NUM_COLS); + sqlFreeResult(&sr); + } +hFreeConn(&conn); +return gp; +} + +static char *txProtFromGp(char *db, struct genePred *gp) +/* Return a string containing the translated CDS portion of gp using genomic sequence. */ +{ +int cdsLen = 0; +int i, eCdsStart, eCdsEnd; +for (i = 0; i < gp->exonCount; i++) + { + if (genePredCdsExon(gp, i, &eCdsStart, &eCdsEnd)) + cdsLen += (eCdsEnd - eCdsStart); + } +char cdsSeq[cdsLen+1]; +int offset = 0; +for (i = 0; i < gp->exonCount; i++) + { + if (genePredCdsExon(gp, i, &eCdsStart, &eCdsEnd)) + { + struct dnaSeq *exonSeq = hChromSeq(db, gp->chrom, eCdsStart, eCdsEnd); + safencpy(cdsSeq+offset, cdsLen+1-offset, exonSeq->dna, exonSeq->size); + offset += exonSeq->size; + dnaSeqFree(&exonSeq); + } + } +if (gp->strand[0] == '-') + reverseComplement(cdsSeq, cdsLen); +char *seq = needMem(cdsLen+1); +dnaTranslateSome(cdsSeq, seq, cdsLen+1); +return seq; +} + static char *getProteinSeq(char *db, char *acc) /* Return amino acid sequence for acc, or NULL if not found. */ { if (trackHubDatabase(db)) return NULL; char *seq = NULL; -if (startsWith("LRG_", acc)) +if (startsWith("ENS", acc)) + { + char *txAcc = gencodeProteinToTx(db, acc); + if (txAcc) + { + struct genePred *gp = getGencodeGp(db, txAcc); + if (gp) + seq = txProtFromGp(db, gp); + genePredFree(&gp); + freeMem(txAcc); + } + } +else if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgPep")) { char query[2048]; // lrgPep is indexed by transcript ID not protein ID char *txAcc = lrgProteinToTx(db, acc); sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", txAcc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); freeMem(txAcc); } } else { @@ -779,31 +858,30 @@ { char query[2048]; sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable " "where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } else { aaSeq *aaSeq = hGenBankGetPep(db, acc, NULL); if (aaSeq) seq = aaSeq->dna; } } - return seq; } static char refBaseForNp(char *db, char *npAcc, int pos) // Get the amino acid base in NP_'s sequence at 1-based offset pos. { char *seq = getProteinSeq(db, npAcc); char base = seq ? seq[pos-1] : '\0'; freeMem(seq); return base; } struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) /* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) @@ -1076,51 +1154,30 @@ int offset = 0; for (i = 0; i < gp->exonCount; i++) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; struct dnaSeq *exonSeq = hChromSeq(db, gp->chrom, exonStart, exonEnd); safencpy(seq+offset, seqLen+1-offset, exonSeq->dna, exonSeq->size); offset += exonSeq->size; dnaSeqFree(&exonSeq); } if (gp->strand[0] == '-') reverseComplement(seq, seqLen); return seq; } -static struct genePred *getGencodeGp(char *db, char *acc) -/* Return the genePred for acc in the latest wgEncodeGencodeBasicV* table, or NULL if not found. */ -{ -struct genePred *gp = NULL; -struct sqlConnection *conn = hAllocConn(db); -char *gencodeTable = hFindLatestGencodeTableConn(conn, "Basic"); -if (gencodeTable) - { - int hasBin = 1; - char query[2048]; - sqlSafef(query, sizeof(query), "select * from %s where name = '%s'", gencodeTable, acc); - struct sqlResult *sr = sqlGetResult(conn, query); - char **row; - if ((row = sqlNextRow(sr)) != NULL) - gp = genePredExtLoad(row+hasBin, GENEPREDX_NUM_COLS); - sqlFreeResult(&sr); - } -hFreeConn(&conn); -return gp; -} - static char *getGencodeSeq(char *db, char *acc) /* Return transcribed-from-genome sequence for acc, or NULL if not found. */ { char *seq = NULL; struct genePred *gp = getGencodeGp(db, acc); if (gp) seq = txSeqFromGp(db, gp); return seq; } static char *getCdnaSeq(char *db, char *acc) /* Return cdna sequence for acc, or NULL if not found. */ { if (trackHubDatabase(db)) return NULL; @@ -1875,36 +1932,31 @@ return region; } static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); if (isEmpty(acc)) return NULL; struct bed *region = NULL; char *txAcc = NULL; if (startsWith("LRG_", acc)) txAcc = lrgProteinToTx(db, acc); else if (startsWith("ENS", acc)) - { - // Look up ENS*T* txAcc for ENS*P* acc - - //#*** TODO when we have a Gencode table analogous to ensGtp - - } + txAcc = gencodeProteinToTx(db, acc); else if (startsWith("NP_", acc) || startsWith("XP_", acc)) { struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (hDbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); else return NULL; txAcc = sqlQuickString(conn, query); hFreeConn(&conn); }