37adcffad6e119b07187ac7659654277a7347450 angie Fri Oct 27 13:39:28 2017 -0700 Add support for LRG genomic and protein HGVS search terms. diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index ed54650..e3f24a6 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1,46 +1,49 @@ /* hgHgvs.c - Parse the Human Genome Variation Society (HGVS) nomenclature for variants. */ /* See http://varnomen.hgvs.org/ and https://github.com/mutalyzer/mutalyzer/ */ /* Copyright (C) 2016 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "hgHgvs.h" +#include "bPlusTree.h" #include "chromInfo.h" #include "genbank.h" #include "hdb.h" #include "indelShift.h" +#include "lrg.h" #include "pslTransMap.h" #include "regexHelper.h" #include "trackHub.h" void hgvsVariantFree(struct hgvsVariant **pHgvs) // Free *pHgvs and its contents, and set *pHgvs to NULL. { if (pHgvs && *pHgvs) { struct hgvsVariant *hgvs = *pHgvs; freez(&hgvs->seqAcc); freez(&hgvs->seqGeneSymbol); freez(&hgvs->changes); freez(pHgvs); } } // Regular expressions for HGVS-recognized sequence accessions: LRG or versioned RefSeq: -#define lrgTranscriptExp "(LRG_[0-9]+t[0-9+])" -#define lrgRegionExp "(LRG_[0-9+])" +#define lrgTranscriptExp "(LRG_[0-9]+t[0-9]+)" +#define lrgProteinExp "(LRG_[0-9]+p[0-9]+)" +#define lrgRegionExp "(LRG_[0-9]+)" // NC = RefSeq reference assembly chromosome // NG = RefSeq incomplete genomic region (e.g. gene locus) // NM = RefSeq curated mRNA // NP = RefSeq curated protein // NR = RefSeq curated (non-coding) RNA #define geneSymbolExp "([A-Za-z0-9./_-]+)" #define optionalGeneSymbolExp "(\\(" geneSymbolExp "\\))?" #define versionedAccPrefixExp(p) "(" p "_[0-9]+(\\.[0-9]+)?)" optionalGeneSymbolExp // ........................ accession and optional dot version // ........... optional dot version // ...... optional gene symbol in ()s // .... optional gene symbol #define versionedRefSeqNCExp versionedAccPrefixExp("[NX]C") #define versionedRefSeqNGExp versionedAccPrefixExp("[NX]G") @@ -91,30 +94,42 @@ // ... // replacement sequence // Protein range (or just single pos) regex #define hgvsAaRangeExp "(" hgvsAminoAcidExp ")" posIntExp "(_(" hgvsAminoAcidExp ")" posIntExp ")?(.*)" #define hgvsPDotRangeExp "p\\.\\(?" hgvsAaRangeExp "\\)?" // original start AA ... // 1-based start position ... // optional range sep and AA+pos ..................................... // original end AA ... // 1-based end position ... // change description ... // Complete HGVS term regexes combining sequence identifiers and change operations #define hgvsFullRegex(seq, op) "^" seq "[ :]+" op +#define hgvsLrgGDotPosExp hgvsFullRegex(lrgRegionExp, hgvsGMDotPosExp) +#define hgvsLrgGDotExp hgvsLrgGDotPosExp "(.*)" +// substring numbering: +// 0..................................... whole matching string +// 1....................... LRG region +// 2. g or m +// 3.. 1-based start position +// 4... optional range separator and end position +// 5.. 1-based end position +// 6... change description + + #define hgvsRefSeqNCGDotPosExp hgvsFullRegex(versionedRefSeqNCExp, hgvsGMDotPosExp) #define hgvsRefSeqNCGDotExp hgvsRefSeqNCGDotPosExp "(.*)" // substring numbering: // 0..................................... whole matching string // 1................. accession and optional dot version // 2........ optional dot version // 3...... (n/a) optional gene symbol in ()s // 4.... (n/a) optional gene symbol // 5. g or m // 6.. 1-based start position // 7... optional range separator and end position // 8.. 1-based end position // 9... change description #define hgvsLrgCDotPosExp hgvsFullRegex(lrgTranscriptExp, hgvsCDotPosExp) @@ -181,41 +196,60 @@ // 3..... optional gene sym in ()s // 4... optional gene symbol // 5... n/a 4 n.: UTR anchor // 6... mandatory first number // 7....... optional offset // 8... offset separator // 9... offset number // 10............... optional range // 11... n/a 4 n.: UTR anchor // 12... first number // 13....... optional offset // 14... offset separator // 15... offset number // 16........ sequence change +#define hgvsLrgPDotSubstExp hgvsFullRegex(lrgProteinExp, hgvsPDotSubstExp) +// substring numbering: +// 0..................................................... whole matching string +// 1............................ LRG protein +// 2..... original sequence +// 3...... 1-based position +// 4...... replacement sequence + #define hgvsRefSeqNPPDotSubstExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotSubstExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5..... original sequence // 6...... 1-based position // 7...... replacement sequence +#define hgvsLrgPDotRangeExp hgvsFullRegex(lrgProteinExp, hgvsPDotRangeExp) +// substring numbering: +// 0..................................................... whole matching string +// 1.......................... accession and optional dot version +// 2... original start AA +// 3... 1-based start position +// 4................. optional range sep and AA+pos +// 5... original end AA +// 6... 1-based end position +// 7...... change description + #define hgvsRefSeqNPPDotRangeExp hgvsFullRegex(versionedRefSeqNPExp, hgvsPDotRangeExp) // substring numbering: // 0..................................................... whole matching string // 1................ accession and optional dot version // 2..... optional dot version // 3..... optional gene symbol in ()s // 4... optional gene symbol // 5... original start AA // 6... 1-based start position // 7................. optional range sep and AA+pos // 8... original end AA // 9... 1-based end position // 10..... change description // Pseudo-HGVS in common usage @@ -283,37 +317,50 @@ // 1................... gene symbol // 2..... 1-based position // Gene symbol, maybe punctuation, and a clear "c." position (and possibly change) #define pseudoHgvsGeneSympolCDotPosExp "^" geneSymbolExp "[: ]+" hgvsCDotPosExp // 0..................................................... whole matching string // 1................... gene symbol // 2..... optional beginning of position exp // 3..... beginning of position exp static struct hgvsVariant *hgvsParseGDotPos(char *term) /* If term is parseable as an HGVS NC_...g. term, return the parsed representation, else NULL. */ { struct hgvsVariant *hgvs = NULL; +int accIx = 1; +int startPosIx = 3; +int endPosIx = 5; +int changeIx = 6; +boolean matches = FALSE; regmatch_t substrs[17]; -if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs))) +if (regexMatchSubstr(term, hgvsLrgGDotExp, substrs, ArraySize(substrs))) + matches = TRUE; +else if (regexMatchSubstr(term, hgvsRefSeqNCGDotExp, substrs, ArraySize(substrs))) + { + matches = TRUE; + // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that + // affects all substring offsets after the accession. + int refSeqExtra = 3; + startPosIx += refSeqExtra; + endPosIx += refSeqExtra; + changeIx += refSeqExtra; + } +if (matches) { - int accIx = 1; - int startPosIx = 6; - int endPosIx = 8; - int changeIx = 9; AllocVar(hgvs); // HGVS recommendation May 2017: replace m. with g. since mitochondrion is genomic too hgvs->type = hgvstGenomic; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static void extractComplexNum(char *term, regmatch_t *substrs, int substrOffset, @@ -405,79 +452,108 @@ { hgvs->end = hgvs->start1; hgvs->endIsUtr3 = hgvs->startIsUtr3; hgvs->endOffset = hgvs->startOffset; } hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; +boolean matches = FALSE; +int accIx = 1; +int refIx = 2; +int posIx = 3; +int altIx = 4; +int geneSymbolIx = -1; regmatch_t substrs[8]; -if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) +if (regexMatchSubstr(term, hgvsLrgPDotSubstExp, substrs, ArraySize(substrs))) + matches = TRUE; +else if (regexMatchSubstr(term, hgvsRefSeqNPPDotSubstExp, substrs, ArraySize(substrs))) + { + matches = TRUE; + // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that + // affects all substring offsets after the accession. + int refSeqExtra = 3; + refIx += refSeqExtra; + posIx += refSeqExtra; + altIx += refSeqExtra; + geneSymbolIx = 4; + } +if (matches) { - int accIx = 1; - int geneSymbolIx = 4; - int refIx = 5; - int posIx = 6; - int altIx = 7; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[refIx].rm_so; int changeEnd = substrs[altIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->changes = cloneString(change); - if (regexSubstrMatched(substrs[geneSymbolIx])) + if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[posIx]); hgvs->end = hgvs->start1; } return hgvs; } static struct hgvsVariant *hgvsParsePDotRange(char *term) /* If term is parseable as an HGVS protein range change, return the parsed representation, * otherwise NULL. */ { struct hgvsVariant *hgvs = NULL; +boolean matches = FALSE; +int accIx = 1; +int startRefIx = 2; +int startPosIx = 3; +int endPosIx = 6; +int changeIx = 7; +int geneSymbolIx = -1; regmatch_t substrs[11]; -if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs))) +if (regexMatchSubstr(term, hgvsLrgPDotRangeExp, substrs, ArraySize(substrs))) + matches = TRUE; +else if (regexMatchSubstr(term, hgvsRefSeqNPPDotRangeExp, substrs, ArraySize(substrs))) + { + matches = TRUE; + // The LRG accession regex has only one substring but the RefSeq acc regex has 4, so that + // affects all substring offsets after the accession. + int refSeqExtra = 3; + startRefIx += refSeqExtra; + startPosIx += refSeqExtra; + endPosIx += refSeqExtra; + changeIx += refSeqExtra; + geneSymbolIx = 4; + } +if (matches) { - int accIx = 1; - int geneSymbolIx = 4; - int startRefIx = 5; - int startPosIx = 6; - int endPosIx = 9; - int changeIx = 10; AllocVar(hgvs); hgvs->type = hgvstProtein; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); int changeStart = substrs[startRefIx].rm_so; int changeEnd = substrs[changeIx].rm_eo; int len = changeEnd - changeStart; char change[len+1]; safencpy(change, sizeof(change), term+changeStart, len); hgvs->changes = cloneString(change); - if (regexSubstrMatched(substrs[geneSymbolIx])) + if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; } return hgvs; } struct hgvsVariant *hgvsParseTerm(char *term) /* If term is a parseable form of HGVS, return the parsed representation, otherwise NULL. * This does not check validity of accessions, coordinates or alleles. */ { struct hgvsVariant *hgvs = hgvsParseCNDotPos(term); @@ -558,45 +634,60 @@ { // Trim .version if present since our genbank tables don't use versioned names. char *trimmed = cloneFirstWordByDelimiter(nmAcc, '.'); sqlSafef(query, sizeof(query), "select l.protAcc from %s l, refGene r " "where r.name = '%s' and l.mrnaAcc = r.name " "and l.protAcc != '' order by length(l.protAcc), l.protAcc", refLinkTable, trimmed); } else return NULL; struct sqlConnection *conn = hAllocConn(db); char *npAcc = sqlQuickString(conn, query); hFreeConn(&conn); return npAcc; } +static char *lrgProteinToTx(char *db, char *protAcc) +/* Return the LRG_ transcript accession for protAcc. Each LRG_NpM has a corresponding LRG_NtM. */ +{ +int accLen = strlen(protAcc); +char txAcc[accLen+1]; +safecpy(txAcc, sizeof(txAcc), protAcc); +char *p = strrchr(txAcc, 'p'); +if (p) + *p = 't'; +return cloneString(txAcc); +} + static char *getProteinSeq(char *db, char *acc) /* Return amino acid sequence for acc, or NULL if not found. */ { if (trackHubDatabase(db)) return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgPep")) { char query[2048]; - sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", acc); + // lrgPep is indexed by transcript ID not protein ID + char *txAcc = lrgProteinToTx(db, acc); + sqlSafef(query, sizeof(query), "select seq from lrgPep where name = '%s'", txAcc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); + freeMem(txAcc); } } else { if (hDbHasNcbiRefSeq(db)) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from ncbiRefSeqPepTable " "where name = '%s'", acc); struct sqlConnection *conn = hAllocConn(db); seq = sqlQuickString(conn, query); hFreeConn(&conn); } else { @@ -722,105 +813,172 @@ int endPosIx = 5; int changeIx = 6; AllocVar(hgvs); hgvs->type = hgvstGenomic; hgvs->seqAcc = regexSubstringClone(term, substrs[chrIx]); hgvs->start1 = regexSubstringInt(term, substrs[startPosIx]); if (regexSubstrMatched(substrs[endPosIx])) hgvs->end = regexSubstringInt(term, substrs[endPosIx]); else hgvs->end = hgvs->start1; hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } +static struct bbiFile *getLrgBbi(char *db) +/* Return bbiFile for LRG regions or NULL if not found. */ +{ +struct bbiFile *bbi = NULL; +// I don't think this will be called often enough to warrant caching open bbi file (and index?). +// I expect it to be called a couple times when the user enters a LRG genomic HGVS pos/search term. +// It would be cleaner to get fileName from tdb or db -- but this is much quicker & easier: +char fileName[1024]; +safef(fileName, sizeof(fileName), "/gbdb/%s/bbi/lrg.bb", db); +char *fileNameRep = hReplaceGbdb(fileName); +if (fileExists(fileNameRep)) + bbi = bigBedFileOpen(fileNameRep); +freeMem(fileNameRep); +return bbi; +} + +static struct lrg *loadLrgByName(char *db, char *lrgId) +/* Retrieve lrg data from bigBed. */ +{ +struct lrg *lrg = NULL; +struct bbiFile *bbi = getLrgBbi(db); +if (bbi) + { + int fieldIx = 0; + struct bptFile *index = bigBedOpenExtraIndex(bbi, "name", &fieldIx); + if (index) + { + struct lm *lm = lmInit(0); + struct bigBedInterval *bb = bigBedNameQuery(bbi, index, fieldIx, lrgId, lm); + if (bb) + { + char *fields[bbi->fieldCount]; + char chrom[512], startBuf[16], endBuf[16]; + bigBedIntervalToRowLookupChrom(bb, NULL, bbi, chrom, sizeof(chrom), + startBuf, endBuf, fields, bbi->fieldCount); + lrg = lrgLoad(fields); + } + lmCleanup(&lm); + } + // Don't bptFileClose(&index) -- index shares pointers with bbi! + } +bigBedFileClose(&bbi); +return lrg; +} + static char refBaseFromNucSubst(char *change) /* If sequence change is a nucleotide substitution, return the reference base, else null char. */ { if (regexMatchNoCase(change, "^([ACGTU])>")) return toupper(change[0]); return '\0'; } static void hgvsStartEndToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int *retStart, int *retEnd) /* Convert HGVS's fully closed, 1-based-unless-negative start and end to UCSC start and end. */ { // If hgvs->start1 is negative, it is effectively 0-based, so subtract 1 only if positive. int start = hgvs->start1; if (start > 0) start--; // If hgvs->end is negative, it is effectively 0-based, so add 1 to get half-open end. int end = hgvs->end; if (end < 0) end++; if (retStart) *retStart = start; if (retEnd) *retEnd = end; } static boolean hgvsValidateGenomic(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) -/* Return TRUE if hgvs->seqAcc can be mapped to a chrom in db and coords are legal for the chrom. +/* Return TRUE if hgvs->seqAcc can be mapped to a chrom/LRG in db and coords are in range. * If retFoundAcc is non-NULL, set it to the versionless seqAcc if chrom is found, else NULL. - * If retFoundVersion is non-NULL, set it to seqAcc's version if chrom is found, else NULL. + * If retFoundVersion is non-NULL, set to seqAcc's version if chrom is found, -1 for LRG, else NULL. * If retDiffRefAllele is non-NULL and hgvs specifies a reference allele then compare it to * the genomic sequence at that location; set *retDiffRefAllele to NULL if they match, or to * genomic sequence if they differ. */ { boolean coordsOK = FALSE; +char hgvsBase = '\0'; if (retDiffRefAllele) + { + hgvsBase = refBaseFromNucSubst(hgvs->changes); *retDiffRefAllele = NULL; + } if (retFoundAcc) *retFoundAcc = NULL; if (retFoundVersion) *retFoundVersion = 0; +int start, end; +hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); +if (startsWith("LRG_", hgvs->seqAcc)) + { + struct lrg *lrg = loadLrgByName(db, hgvs->seqAcc); + if (lrg && start >= 0 && start < lrg->lrgSize && end > 0 && end <= lrg->lrgSize) + { + coordsOK = TRUE; + if (hgvsBase != '\0') + { + struct dnaSeq *lrgSeq = lrgReconstructSequence(lrg, db); + lrgSeq->dna[start] = toupper(lrgSeq->dna[start]); + if (lrgSeq->dna[start] != hgvsBase) + *retDiffRefAllele = cloneStringZ(lrgSeq->dna+start, 1); + dnaSeqFree(&lrgSeq); + } + if (retFoundAcc) + *retFoundAcc = cloneString(hgvs->seqAcc); + } + lrgFree(&lrg); + } +else + { char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom)) { struct chromInfo *ci = hGetChromInfo(db, chrom); - int start, end; - hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); - if (start >= 0 && start < ci->size && end > 0 && end < ci->size) + if (start >= 0 && start < ci->size && end > 0 && end <= ci->size) { coordsOK = TRUE; - if (retDiffRefAllele) - { - char hgvsBase = refBaseFromNucSubst(hgvs->changes); if (hgvsBase != '\0') { struct dnaSeq *refBase = hFetchSeq(ci->fileName, chrom, start, end); touppers(refBase->dna); if (refBase->dna[0] != hgvsBase) *retDiffRefAllele = cloneString(refBase->dna); dnaSeqFree(&refBase); } } - } // Since we found hgvs->seqAcc, split it into versionless and version parts. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(hgvs->seqAcc, '.'); if (retFoundVersion) { char *p = strchr(hgvs->seqAcc, '.'); if (p) *retFoundVersion = atoi(p+1); } // Don't free chromInfo, it's cached! } + } return coordsOK; } static char *getCdnaSeq(char *db, char *acc) /* Return cdna sequence for acc, or NULL if not found. */ { if (trackHubDatabase(db)) return NULL; char *seq = NULL; if (startsWith("LRG_", acc)) { if (hTableExists(db, "lrgCdna")) { char query[2048]; sqlSafef(query, sizeof(query), "select seq from lrgCdna where name = '%s'", acc); @@ -1063,57 +1221,106 @@ static struct bed *bed6New(char *chrom, unsigned chromStart, unsigned chromEnd, char *name, int score, char strand) /* Alloc & return a new bed with only the first 6 fields populated. */ { struct bed *bed6; AllocVar(bed6); bed6->chrom = cloneString(chrom); bed6->chromStart = chromStart; bed6->chromEnd = chromEnd; bed6->name = cloneString(name); bed6->score = score; bed6->strand[0] = strand; return bed6; } +static int pslMapQStartToT(struct psl *psl, int qStartIn) +/* Map a query start coord to a target start coord; return -1 if qStart is not in the alignment. */ +{ +int t = -1, qStart = qStartIn, ix; +boolean isRc = pslQStrand(psl) == '-'; +if (isRc) + { + int qEnd = qStart + 1; + qStart = psl->qSize - qEnd; + } +for (ix = 0; ix < psl->blockCount; ix++) + { + int qOffset = qStart - pslQStart(psl, ix); + if (qOffset >= 0 && qStart < pslQEnd(psl, ix)) + { + t = pslTStart(psl, ix) + qOffset; + break; + } + } +return t; +} + +static int pslMapQEndToT(struct psl *psl, int qEnd) +/* Map a query end coord to a target end coord; return -1 if qEnd is not in the alignment. */ +{ +int tEnd = -1; +int qStart = qEnd - 1; +int tStart = pslMapQStartToT(psl, qStart); +if (tStart >= 0) + tEnd = tStart + 1; +return tEnd; +} + boolean hgvsIsInsertion(struct hgvsVariant *hgvs) /* Return TRUE if hgvs is an insertion (end == start1+1, change starts with "ins"). */ { return (hgvs->end == hgvs->start1+1 && startsWith("ins", hgvs->changes)); } static void adjustInsStartEnd(struct hgvsVariant *hgvs, uint *retStart0, uint *retEnd) /* HGVS insertion coordinates include the base before and the base after the insertion point, * while we treat it as a zero-length point. So if this is insertion, adjust the start and end. */ { if (hgvsIsInsertion(hgvs)) { *retStart0 = *retStart0 + 1; *retEnd = *retEnd - 1; } } static struct bed *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) -/* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */ +/* Given an NC_*g. or LRG_*g. term, if we can map to chrom then return the region, else NULL. */ { struct bed *region = NULL; +if (startsWith("LRG_", hgvs->seqAcc)) + { + struct lrg *lrg = loadLrgByName(db, hgvs->seqAcc); + if (lrg) + { + uint chromSize = hChromSize(db, lrg->chrom); + struct psl *psl = lrgToPsl(lrg, chromSize); + int start = pslMapQStartToT(psl, hgvs->start1 - 1); + int end = pslMapQEndToT(psl, hgvs->end); + if (start >= 0 && end >= 0) + region = bed6New(lrg->chrom, start, end, "", 0, '+'); + pslFree(&psl); + } + } +else + { char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom) && hgvs->start1 > 0) - { region = bed6New(chrom, hgvs->start1 - 1, hgvs->end, "", 0, '+'); - adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); } +if (region) + adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); if (retPslTable) *retPslTable = NULL; return region; } static void hgvsTranscriptToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int maxCoord, struct genbankCds *cds, int *retStart, int *retEnd, int *retUpstreamBases, int *retDownstreamBases) /* Convert a transcript HGVS's start1 and end into UCSC coords plus upstream and downstream lengths * for when the transcript HGVS has coordinates that extend beyond its sequence boundaries. * ret* args must be non-NULL. */ { hgvsStartEndToZeroBasedHalfOpen(hgvs, retStart, retEnd); if (hgvs->type == hgvstCoding) @@ -1437,59 +1644,64 @@ } } if (region) { adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); if (retPslTable) *retPslTable = cloneString(pslTable); } return region; } static struct bed *hgvsMapPDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { -struct bed *region = NULL; char *acc = normalizeVersion(db, hgvs->seqAcc, NULL); -if (acc && (startsWith("NP_", acc) || startsWith("XP_", acc))) +if (isEmpty(acc)) + return NULL; +struct bed *region = NULL; +char *txAcc = NULL; +if (startsWith("LRG_", acc)) + txAcc = lrgProteinToTx(db, acc); +else if (startsWith("NP_", acc) || startsWith("XP_", acc)) { - // Translate the NP_*:p. to NM_*:c. and map NM_*:c. to the genome. struct sqlConnection *conn = hAllocConn(db); char query[2048]; if (hDbHasNcbiRefSeq(db)) sqlSafef(query, sizeof(query), "select mrnaAcc from ncbiRefSeqLink where protAcc = '%s'", acc); else if (hTableExists(db, "refGene")) sqlSafef(query, sizeof(query), "select mrnaAcc from %s l, refGene r " "where l.protAcc = '%s' and r.name = l.mrnaAcc", refLinkTable, acc); else return NULL; - char *nmAcc = sqlQuickString(conn, query); + txAcc = sqlQuickString(conn, query); hFreeConn(&conn); - if (nmAcc) + } +if (txAcc) { + // Translate the p. to c. and map c. to the genome. struct hgvsVariant cDot; zeroBytes(&cDot, sizeof(cDot)); cDot.type = hgvstCoding; - cDot.seqAcc = nmAcc; + cDot.seqAcc = txAcc; cDot.start1 = ((hgvs->start1 - 1) * 3) + 1; cDot.end = ((hgvs->end - 1) * 3) + 3; cDot.changes = hgvs->changes; region = hgvsMapNucToGenome(db, &cDot, retPslTable); - freeMem(nmAcc); - } + freeMem(txAcc); } return region; } struct bed *hgvsMapToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Return a bed6 with the variant's span on the genome and strand, or NULL if unable to map. * If successful and retPslTable is not NULL, set it to the name of the PSL table used. */ { if (hgvs == NULL) return NULL; struct bed *region = NULL; if (hgvs->type == hgvstProtein) region = hgvsMapPDotToGenome(db, hgvs, retPslTable); else region = hgvsMapNucToGenome(db, hgvs, retPslTable);