aaf72102b545c05c42f66b7a3fc22d65b1ecf4fe angie Mon Aug 8 14:12:39 2016 -0700 Added recognition of a small subset of HGVS terms: coding (c.) SNVs relative to RefSeq NM_ or LRG transcript IDs, and protein (p.) simple substitutions relative to NP_. Also accepted (not HGVS but similar and popular): geneSymbol and abbreviated protein subst like "ALK G1494E". hgFind will map terms to the current genome if possible, and will display warnings about unrecognized accessions, out-of-bounds coordinates and mismatching reference alleles. refs #15071, #15554 diff --git src/hg/lib/hgFind.c src/hg/lib/hgFind.c index 0d6273d..78da647 100644 --- src/hg/lib/hgFind.c +++ src/hg/lib/hgFind.c @@ -11,30 +11,31 @@ #include "hash.h" #include "cheapcgi.h" #include "htmshell.h" #include "web.h" #include "jksql.h" #include "hdb.h" #include "hui.h" #include "psl.h" #include "genePred.h" #include "genePredReader.h" #include "bed.h" #include "cytoBand.h" #include "cart.h" #include "hgFind.h" #include "hgFindSpec.h" +#include "hgHgvs.h" #include "snp.h" #include "refLink.h" #include "kgAlias.h" #include "kgProtAlias.h" #include "findKGAlias.h" #include "findKGProtAlias.h" #include "tigrCmrGene.h" #include "minGeneInfo.h" #include "pipeline.h" #include "hgConfig.h" #include "trix.h" #include "trackHub.h" #include "udc.h" #include "hubConnect.h" #include "bigBedFind.h" @@ -3040,30 +3041,88 @@ hfs = hfsFind(longList, search); if (hfs != NULL) foundIt = hgFindUsingSpec(db, hfs, term, limitResults, hgp, FALSE, 0,0, FALSE); else warn("Unrecognized singleSearch=%s in URL", search); } if (foundIt) { fixSinglePos(hgp); if (cart != NULL) cartSetString(cart, "hgFind.matches", hgp->tableList->posList->browserName); } return foundIt; } +static int getDotVersion(char *acc) +/* If acc ends with a .version, return the version, else 0. */ +{ +char *p = strchr(acc, '.'); +if (p) + return atoi(p+1); +return 0; +} + +static boolean matchesHgvs(char *db, char *term, struct hgPositions *hgp) +/* Return TRUE if the search term looks like a variant encoded using the HGVS nomenclature */ +/* See http://varnomen.hgvs.org/ */ +{ +boolean foundIt = FALSE; +struct hgvsVariant *hgvs = hgvsParseTerm(term); +if (hgvs == NULL) + hgvs = hgvsParsePseudoHgvs(db, term); +if (hgvs) + { + char *foundAcc = NULL, *diffRefAllele = NULL; + int foundVersion = 0; + boolean coordsOk = hgvsValidate(db, hgvs, &foundAcc, &foundVersion, &diffRefAllele); + if (foundAcc == NULL) + warn("Can't find accession for HGVS term '%s'", term); + else + { + int hgvsVersion = getDotVersion(hgvs->seqAcc); + char foundAccWithV[strlen(foundAcc)+20]; + if (foundVersion) + safef(foundAccWithV, sizeof(foundAccWithV), "%s.%d", foundAcc, foundVersion); + else + safecpy(foundAccWithV, sizeof(foundAccWithV), foundAcc); + if (hgvsVersion && hgvsVersion != foundVersion) + warn("HGVS term '%s' is based on %s but UCSC has version %s", + term, hgvs->seqAcc, foundAccWithV); + if (! coordsOk) + warn("HGVS term '%s' has coordinates outside the bounds of %s", term, foundAccWithV); + else if (diffRefAllele != NULL) + warn ("HGVS term '%s' reference value '%s' does not match %s value '%s'", + term, hgvs->refAllele, foundAccWithV, diffRefAllele); + if (coordsOk) + { + char *pslTable = NULL; + struct psl *mapping = hgvsMapToGenome(db, hgvs, &pslTable); + if (mapping) + { + int padding = 5; + char *trackTable = startsWith("lrg", pslTable) ? "lrgTranscriptAli" : "refGene"; + singlePos(hgp, "HGVS", NULL, trackTable, term, "", + mapping->tName, mapping->tStart-padding, mapping->tEnd+padding); + foundIt = TRUE; + } + } + } + } +return foundIt; +} + struct hgPositions *hgPositionsFind(char *db, char *term, char *extraCgi, char *hgAppNameIn, struct cart *cart, boolean multiTerm) /* Return table of positions that match term or NULL if none such. */ { struct hgPositions *hgp = NULL, *hgpItem = NULL; regmatch_t substrs[4]; boolean canonicalSpec = FALSE; boolean gbrowserSpec = FALSE; boolean lengthSpec = FALSE; boolean singleBaseSpec = FALSE; boolean relativeFlag = FALSE; int relStart = 0, relEnd = 0; hgAppName = hgAppNameIn; @@ -3141,31 +3200,31 @@ hgParseChromRange(db, term, &chrom, &start, &end); if (relativeFlag) { int chromSize = end; end = start + relEnd; start = start + relStart; if (end > chromSize) end = chromSize; if (start < 0) start = 0; } singlePos(hgp, "Chromosome Range", NULL, "chromInfo", originalTerm, "", chrom, start, end); } -else +else if (!matchesHgvs(db, term, hgp)) { struct hgFindSpec *shortList = NULL, *longList = NULL; struct hgFindSpec *hfs; boolean done = FALSE; // Disable singleBaseSpec for any term that is not hgOfficialChromName // because that mangles legitimate IDs that are [A-Z]:[0-9]+. if (singleBaseSpec) { singleBaseSpec = relativeFlag = FALSE; term = cloneString(originalTerm); // restore original term relStart = relEnd = 0; } if (!trackHubDatabase(db))