3e70e7601a08e28667531d5abe3fe8b04ee95ded chmalee Fri Jul 16 11:23:58 2021 -0700 Return multiple transcripts for pseudo hgvs search, refs #15554 diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index a3af8af..53b8086 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -650,53 +650,53 @@ struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { struct hgvsVariant *hgvs = hgvsParseCNDotPos(term); if (hgvs == NULL) hgvs = hgvsParsePDotSubst(term); if (hgvs == NULL) hgvs = hgvsParsePDotRange(term); if (hgvs == NULL) hgvs = hgvsParseGDotPos(term); return hgvs; } -static char *npForGeneSymbol(char *db, char *geneSymbol) -/* Given a gene symbol, look up and return its NP_ accession; if not found return NULL. */ +static struct slName *npForGeneSymbol(char *db, char *geneSymbol) +/* Given a gene symbol, look up and return a list of NP_ accessions; if not found return NULL. */ { char query[2048]; if (hDbHasNcbiRefSeq(db)) { sqlSafef(query, sizeof(query), "select protAcc from ncbiRefSeqLink where name = '%s' " "and protAcc != 'n/a' and protAcc != '' " "order by length(protAcc), protAcc", geneSymbol); } else if (hTableExists(db, "refGene")) { // Join with refGene to make sure it's an NP for this species. sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where l.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc" , refLinkTable, geneSymbol); } else return NULL; struct sqlConnection *conn = hAllocConn(db); -char *npAcc = sqlQuickString(conn, query); +struct slName *npAcc = sqlQuickList(conn, query); hFreeConn(&conn); return npAcc; } static char *nmForGeneSymbol(char *db, char *geneSymbol) /* Given a gene symbol, look up and return its NM_ accession; if not found return NULL. */ { if (trackHubDatabase(db)) return NULL; char query[2048]; char *geneTable = NULL; if (hDbHasNcbiRefSeq(db)) geneTable = "ncbiRefSeq"; else if (hTableExists(db, "refGene")) geneTable = "refGene"; @@ -872,31 +872,32 @@ } } return seq; } static char refBaseForNp(char *db, char *npAcc, int pos) // Get the amino acid base in NP_'s sequence at 1-based offset pos. { char *seq = getProteinSeq(db, npAcc); char base = seq ? seq[pos-1] : '\0'; freeMem(seq); return base; } struct hgvsVariant *hgvsParsePseudoHgvs(char *db, char *term) -/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: */ +/* Attempt to parse things that are not strict HGVS, but that people might intend as HGVS: + * Return a list of struct hgvsVariant that may be what was intended */ // Note: this doesn't support non-coding gene symbol terms (which should have nt alleles) { struct hgvsVariant *hgvs = NULL; regmatch_t substrs[11]; int geneSymbolIx = 1; boolean isSubst; if ((isSubst = regexMatchSubstr(term, pseudoHgvsNMPDotSubstExp, substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsNMPDotRangeExp, substrs, ArraySize(substrs))) { // User gave an NM_ accession but a protein change -- swap in the right NP_. int nmAccIx = 1; int geneSymbolIx = 4; int len = substrs[nmAccIx].rm_eo - substrs[nmAccIx].rm_so; char nmAcc[len+1]; @@ -917,61 +918,71 @@ } else npTerm = dyStringCreate("%s:p.%s", npAcc, description); hgvs = hgvsParseTerm(npTerm->string); dyStringFree(&npTerm); freeMem(npAcc); } } else if ((isSubst = regexMatchSubstr(term, pseudoHgvsGeneSymbolProtSubstExp, substrs, ArraySize(substrs))) || regexMatchSubstr(term, pseudoHgvsGeneSymbolProtRangeExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); - char *npAcc = npForGeneSymbol(db, geneSymbol); - if (isNotEmpty(npAcc)) + struct slName *npAccList = npForGeneSymbol(db, geneSymbol); + if (npAccList != NULL) + { + struct slName *npItem = NULL; + for (npItem = npAccList; npItem != NULL; npItem = npItem->next) { + char *npAcc = npItem->name; // Make it a real HGVS term with the NP and pass that on to the usual parser. int descStartIx = 2; char *description = term + substrs[descStartIx].rm_so; struct dyString *npTerm = dyStringCreate("%s(%s):p.%s", npAcc, geneSymbol, description); - hgvs = hgvsParseTerm(npTerm->string); + slAddHead(&hgvs, hgvsParseTerm(npTerm->string)); dyStringFree(&npTerm); - freeMem(npAcc); + //freeMem(npAcc); + } } } else if (regexMatchSubstr(term, pseudoHgvsGeneSymbolProtPosExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); - char *npAcc = npForGeneSymbol(db, geneSymbol); - if (isNotEmpty(npAcc)) + struct slName *npAccList = npForGeneSymbol(db, geneSymbol); + if (npAccList != NULL) + { + struct slName *npItem = NULL; + for (npItem = npAccList; npItem != NULL; npItem = npItem->next) { + char *npAcc = npItem->name; // Only position was provided, no change. Look up ref base and make a synonymous subst // so it's parseable HGVS. int posIx = 2; int pos = regexSubstringInt(term, substrs[posIx]); char refBase = refBaseForNp(db, npAcc, pos); struct dyString *npTerm = dyStringCreate("%s(%s):p.%c%d=", npAcc, geneSymbol, refBase, pos); - hgvs = hgvsParseTerm(npTerm->string); + slAddHead(&hgvs, hgvsParseTerm(npTerm->string)); dyStringFree(&npTerm); - freeMem(npAcc); + //freeMem(npAcc); + } } } else if (regexMatchSubstr(term, pseudoHgvsGeneSympolCDotPosExp, substrs, ArraySize(substrs))) { int len = substrs[geneSymbolIx].rm_eo - substrs[geneSymbolIx].rm_so; char geneSymbol[len+1]; safencpy(geneSymbol, sizeof(geneSymbol), term, len); char *nmAcc = nmForGeneSymbol(db, geneSymbol); if (isNotEmpty(nmAcc)) { // Make it a real HGVS term with the NM and pass that on to the usual parser. int descStartIx = regexSubstrMatched(substrs[2]) ? 2 : 3; char *description = term + substrs[descStartIx].rm_so; struct dyString *nmTerm = dyStringCreate("%s(%s):c.%s", nmAcc, geneSymbol, description);