b1291b0928de2ce364c5335642acfc457110641e angie Mon Apr 24 14:08:18 2017 -0700 When an HGVS term maps to a base that has been deleted from the reference genome, e.g. NM_020469.2(ABO):c.261= maps to a deleted base in hg19 & hg38, pslTransMap returns NULL, so we were failing to map it to any genomic location. Added mapToDeletion for this special case, so we can map to the zero-length region where the reference genome has a deleted base. diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index 0d06599..ab6b363 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -1125,49 +1125,113 @@ int altLen = refLen; psl->qName = cloneString(hgvs->changes); psl->qSize = altLen; psl->qStart = 0; psl->qEnd = altLen; psl->blockCount = 1; AllocArray(psl->blockSizes, psl->blockCount); AllocArray(psl->qStarts, psl->blockCount); AllocArray(psl->tStarts, psl->blockCount); psl->blockSizes[0] = refLen <= altLen ? refLen : altLen; psl->qStarts[0] = psl->qStart; psl->tStarts[0] = psl->tStart; return psl; } +static struct psl *pslDelFromCoord(struct psl *txAli, int tStart, struct psl *variantPsl) +/* Return a PSL with same target and query as txAli, but as a deletion at offset tStart: + * two zero-length blocks surrounding no target and query = variantPsl's target coords. */ +{ +struct psl *del; +AllocVar(del); +del->tName = cloneString(txAli->tName); +del->tSize = txAli->tSize; +safecpy(del->strand, sizeof(del->strand), txAli->strand); +del->tStart = del->tEnd = tStart; +del->qName = cloneString(txAli->qName); +del->qStart = variantPsl->tStart; +del->qEnd = variantPsl->tEnd; +del->blockCount = 2; +AllocArray(del->blockSizes, del->blockCount); +AllocArray(del->qStarts, del->blockCount); +AllocArray(del->tStarts, del->blockCount); +// I wonder if zero-length blockSizes would trigger crashes somewhere... +del->blockSizes[0] = del->blockSizes[1] = 0; +del->tStarts[0] = del->tStarts[1] = del->tStart; +del->qStarts[0] = del->qStart; +del->qStarts[1] = del->qEnd; +return del; +} + + +static struct psl *mapToDeletion(struct psl *variantPsl, struct psl *txAli) +/* If the variant falls on a transcript base that is deleted in the reference genome, + * return the deletion coords (pslTransMap returns NULL), otherwise return NULL. */ +{ +// variant start and end coords, in transcript coords: +int vStart = variantPsl->tStart; +int vEnd = variantPsl->tEnd; +// If txAli->strand is '-', reverse coords +if (txAli->strand[0] == '-') + { + vStart = variantPsl->tSize - variantPsl->tEnd; + vEnd = variantPsl->tSize - variantPsl->tStart; + } +if (vEnd < txAli->qStart) + return pslDelFromCoord(txAli, txAli->qStart, variantPsl); +else if (vStart > txAli->qEnd) + return pslDelFromCoord(txAli, txAli->qEnd, variantPsl); +int i; +for (i = 0; i < txAli->blockCount - 1; i++) + { + int qBlockEnd = txAli->qStarts[i] + txAli->blockSizes[i]; + int qNextBlockStart = txAli->qStarts[i+1]; + int tBlockEnd = txAli->tStarts[i] + txAli->blockSizes[i]; + int tNextBlockStart = txAli->tStarts[i+1]; + if (vStart >= qBlockEnd && vEnd <= qNextBlockStart && + tBlockEnd == tNextBlockStart) + return pslDelFromCoord(txAli, tBlockEnd, variantPsl); + } +// Not contained in a deletion from reference genome (txAli target) -- return NULL. +return NULL; +} + + static struct psl *mapPsl(char *db, struct hgvsVariant *hgvs, char *pslTable, char *acc, struct genbankCds *cds, int *retUpstream, int *retDownstream) /* If acc is found in pslTable, use pslTransMap to map hgvs onto the genome. */ { struct psl *mappedToGenome = NULL; char query[2048]; sqlSafef(query, sizeof(query), "select * from %s where qName = '%s'", pslTable, acc); struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr = sqlGetResult(conn, query); char **row; if (sr && (row = sqlNextRow(sr))) { int bin = 1; // All PSL tables used here, and any made in the future, use the bin column. struct psl *txAli = pslLoad(row+bin); // variantPsl contains the anchor if a non-cdsStart anchor is used because // the actual position might be outside the bounds of the transcript sequence (intron/UTR) struct psl *variantPsl = pslFromHgvsNuc(hgvs, acc, txAli->qSize, txAli->qEnd, cds, retUpstream, retDownstream); mappedToGenome = pslTransMap(pslTransMapNoOpts, variantPsl, txAli); + // If there is a deletion in the genome / insertion in the transcript then pslTransMap cannot + // map those bases to the genome. In that case take a harder look and return the deletion + // coords. + if (mappedToGenome == NULL) + mappedToGenome = mapToDeletion(variantPsl, txAli); pslFree(&variantPsl); pslFree(&txAli); } sqlFreeResult(&sr); hFreeConn(&conn); return mappedToGenome; } static char *pslTableForAcc(char *db, char *acc) /* Based on acc (and whether db has NCBI RefSeq alignments), pick a PSL table where * acc should be found (table may or may not exist). Don't free the returned string. */ { char *pslTable = NULL; if (startsWith("LRG_", acc)) pslTable = "lrgTranscriptAli";