e6e8263c1a4a85000be7cc4df0797d75dceba589 angie Fri Jun 30 09:51:18 2017 -0700 Noncoding introns were not being mapped to the genome correctly, hgvsTranscriptToZeroBasedHalfOpen had a fencepost error for upstream/downstream and mapToDeletion was using q coords instead of t coords for upstream/downstream, doh! helpful testcase: NR_111987:n.-1 . fixes #19702 note-1. diff --git src/hg/lib/hgHgvs.c src/hg/lib/hgHgvs.c index e10ef5a..47f37dd 100644 --- src/hg/lib/hgHgvs.c +++ src/hg/lib/hgHgvs.c @@ -355,42 +355,48 @@ { matches = TRUE; geneSymbolIx = 4; startAnchorIx += refSeqExtra; endAnchorIx += refSeqExtra; endPosIx += refSeqExtra; changeIx += refSeqExtra; } if (matches) { AllocVar(hgvs); hgvs->type = isNoncoding ? hgvstNoncoding : hgvstCoding; hgvs->seqAcc = regexSubstringClone(term, substrs[accIx]); extractComplexNum(term, substrs, startAnchorIx, &hgvs->startIsUtr3, &hgvs->start1, &hgvs->startOffset); - if (isNoncoding && (hgvs->startIsUtr3 || hgvs->start1 < 0)) - warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR, " + if (isNoncoding && hgvs->startIsUtr3) + { + warn("hgvsParseCNDotPos: noncoding term '%s' appears to start in UTR3 (*), " "not applicable for noncoding", term); + hgvs->startIsUtr3 = FALSE; + } if (geneSymbolIx >= 0 && regexSubstrMatched(substrs[geneSymbolIx])) hgvs->seqGeneSymbol = regexSubstringClone(term, substrs[geneSymbolIx]); if (regexSubstrMatched(substrs[endPosIx])) { extractComplexNum(term, substrs, endAnchorIx, &hgvs->endIsUtr3, &hgvs->end, &hgvs->endOffset); - if (isNoncoding && (hgvs->endIsUtr3 || hgvs->end < 0)) - warn("hgvsParseCNDotPos: noncoding term '%s' appears to end in UTR, " + if (isNoncoding && hgvs->endIsUtr3) + { + warn("hgvsParseCNDotPos: noncoding term '%s' appears to end in UTR3 (*), " "not applicable for noncoding", term); + hgvs->endIsUtr3 = FALSE; + } } else { hgvs->end = hgvs->start1; hgvs->endIsUtr3 = hgvs->startIsUtr3; hgvs->endOffset = hgvs->startOffset; } hgvs->changes = regexSubstringClone(term, substrs[changeIx]); } return hgvs; } static struct hgvsVariant *hgvsParsePDotSubst(char *term) /* If term is parseable as an HGVS protein substitution, return the parsed representation, * otherwise NULL. */ @@ -967,85 +973,79 @@ } else { // genbank tables -- no version normalizedAcc = cloneFirstWordByDelimiter(acc, '.'); foundVersion = getGbVersion(db, normalizedAcc); } if (retFoundVersion) *retFoundVersion = foundVersion; return normalizedAcc; } static boolean hgvsValidateGene(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. - * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so + * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { char *acc = normalizeVersion(db, hgvs->seqAcc, retFoundVersion); if (isEmpty(acc)) return FALSE; boolean coordsOK = FALSE; char *accSeq = (hgvs->type == hgvstProtein ? getProteinSeq(db, acc) : getCdnaSeq(db, acc)); if (accSeq) { // By convention, foundAcc is always versionless because it's accompanied by foundVersion. if (retFoundAcc) *retFoundAcc = cloneFirstWordByDelimiter(acc, '.'); int seqLen = strlen(accSeq); int start, end; hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); if (hgvs->type == hgvstCoding) { - // Coding term coords can extend beyond the bounds of the transcript so - // we can't check them without mapping to the genome. However, if the coords - // are in bounds and a reference allele is provided, we can check that. struct genbankCds cds; coordsOK = getCds(db, acc, &cds); if (coordsOK && retDiffRefAllele) { start += (hgvs->startIsUtr3 ? cds.end : cds.start); if (hgvs->startOffset == 0 && start >= 0 && start < seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } else { - if (start >= 0 && start < seqLen && end > 0 && end <= seqLen) - { coordsOK = TRUE; - if (retDiffRefAllele) + if (retDiffRefAllele && hgvs->startOffset == 0 && start >= 0 && start < seqLen) checkRefAllele(hgvs, start, accSeq, retDiffRefAllele); } } - } freeMem(accSeq); freeMem(acc); return coordsOK; } boolean hgvsValidate(char *db, struct hgvsVariant *hgvs, char **retFoundAcc, int *retFoundVersion, char **retDiffRefAllele) /* Return TRUE if hgvs coords are within the bounds of the sequence for hgvs->seqAcc. - * Note: Coding terms may contain coords outside the bounds (upstream, intron, downstream) so + * Note: Transcript terms may contain coords outside the bounds (upstream, intron, downstream) so * those can't be checked without mapping the term to the genome; this returns TRUE if seq is found. * If retFoundAcc is not NULL, set it to our local accession (which may be missing the .version * of hgvs->seqAcc) or NULL if we can't find any match. * If retFoundVersion is not NULL and hgvs->seqAcc has a version number (e.g. NM_005957.4), * set retFoundVersion to our version from latest GenBank, otherwise 0 (no version for LRG). * If coords are OK and retDiffRefAllele is not NULL: if our sequence at the coords * matches hgvs->refAllele then set it to NULL; if mismatch then set it to our sequence. */ { if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito) return hgvsValidateGenomic(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); else return hgvsValidateGene(db, hgvs, retFoundAcc, retFoundVersion, retDiffRefAllele); } static struct bed *bed6New(char *chrom, unsigned chromStart, unsigned chromEnd, char *name, @@ -1083,112 +1083,108 @@ static struct bed *hgvsMapGDotToGenome(char *db, struct hgvsVariant *hgvs, char **retPslTable) /* Given an NC_*g. term, if we can map NC_ to chrom then return the region, else NULL. */ { struct bed *region = NULL; char *chrom = hgOfficialChromName(db, hgvs->seqAcc); if (isNotEmpty(chrom) && hgvs->start1 > 0) { region = bed6New(chrom, hgvs->start1 - 1, hgvs->end, "", 0, '+'); adjustInsStartEnd(hgvs, ®ion->chromStart, ®ion->chromEnd); } if (retPslTable) *retPslTable = NULL; return region; } -static void hgvsCodingToZeroBasedHalfOpen(struct hgvsVariant *hgvs, +static void hgvsTranscriptToZeroBasedHalfOpen(struct hgvsVariant *hgvs, int maxCoord, struct genbankCds *cds, int *retStart, int *retEnd, int *retUpstreamBases, int *retDownstreamBases) -/* Convert a coding HGVS's start1 and end into UCSC coords plus upstream and downstream lengths - * for when the coding HGVS has coordinates that extend beyond its sequence boundaries. +/* Convert a transcript HGVS's start1 and end into UCSC coords plus upstream and downstream lengths + * for when the transcript HGVS has coordinates that extend beyond its sequence boundaries. * ret* args must be non-NULL. */ { -int start, end; -hgvsStartEndToZeroBasedHalfOpen(hgvs, &start, &end); +hgvsStartEndToZeroBasedHalfOpen(hgvs, retStart, retEnd); +if (hgvs->type == hgvstCoding) + { // If the position follows '*' that means it's relative to cdsEnd; otherwise rel to cdsStart -if (hgvs->startIsUtr3) - *retStart = cds->end + start; -else - *retStart = cds->start + start; -if (hgvs->endIsUtr3) - *retEnd = cds->end + end; -else - *retEnd = cds->start + end; + *retStart += (hgvs->startIsUtr3 ? cds->end : cds->start); + *retEnd += (hgvs->endIsUtr3 ? cds->end : cds->start); + } // Now check for coords that extend beyond the transcript('s alignment to the genome) if (*retStart < 0) { - // hgvs->start1 is upstream of coding transcript. + // hgvs->start1 is upstream of transcript. *retUpstreamBases = -*retStart; *retStart = 0; } -else if (*retStart > maxCoord) +else if (*retStart >= maxCoord) { - // Even the start coord is downstream of coding transcript -- make a negative "upstream" + // Even the start coord is downstream of transcript -- make a negative "upstream" // for adjusting start. *retUpstreamBases = -(*retStart - maxCoord + 1); *retStart = maxCoord - 1; } else *retUpstreamBases = 0; if (*retEnd > maxCoord) { - // hgvs->end is downstream of coding transcript. + // hgvs->end is downstream of transcript. *retDownstreamBases = *retEnd - maxCoord; *retEnd = maxCoord; } -else if (*retEnd < 0) +else if (*retEnd <= 0) { - // Even the end coord is upstream of coding transcript -- make a negative "downstream" + // Even the end coord is upstream of transcript -- make a negative "downstream" // for adjusting end. *retEnd += *retUpstreamBases; *retDownstreamBases = -*retUpstreamBases; } else *retDownstreamBases = 0; } static struct psl *pslFromHgvsNuc(struct hgvsVariant *hgvs, char *acc, int accSize, int accEnd, struct genbankCds *cds, int *retUpstreamBases, int *retDownstreamBases) /* Allocate and return a PSL modeling the variant in nucleotide sequence acc. * The PSL target is the sequence and the query is the changed part of the sequence. * accEnd is given in case accSize includes an unaligned poly-A tail. * If hgvs is coding ("c.") then the caller must pass in a valid cds. * In case the start or end position is outside the bounds of the sequence, set retUpstreamBases * or retDownstreamBases to the number of bases beyond the beginning or end of sequence. - * NOTE: coding intron offsets are ignored; the PSL contains the exon anchor point + * NOTE: intron offsets are ignored; the PSL contains the exon anchor point * and the caller will have to map that to the genome and then apply the intron offset. */ { if (hgvs == NULL) return NULL; if (hgvs->type == hgvstProtein) errAbort("pslFromHgvsNuc must be called only on nucleotide HGVSs, not protein."); struct psl *psl; AllocVar(psl); psl->tName = cloneString(acc); safecpy(psl->strand, sizeof(psl->strand), "+"); psl->tSize = accSize; -if (hgvs->type != hgvstCoding) +if (hgvs->type == hgvstGenomic || hgvs->type == hgvstMito) { // Sane 1-based fully closed coords. hgvsStartEndToZeroBasedHalfOpen(hgvs, &psl->tStart, &psl->tEnd); } else { // Simple or insanely complex CDS-relative coords. - hgvsCodingToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd, + hgvsTranscriptToZeroBasedHalfOpen(hgvs, accEnd, cds, &psl->tStart, &psl->tEnd, retUpstreamBases, retDownstreamBases); } int refLen = psl->tEnd - psl->tStart; // Just use refLen for alt until we parse the sequence change portion of the term: int altLen = refLen; psl->qName = cloneString(hgvs->changes); psl->qSize = altLen; psl->qStart = 0; psl->qEnd = altLen; psl->blockCount = 1; AllocArray(psl->blockSizes, psl->blockCount); AllocArray(psl->qStarts, psl->blockCount); AllocArray(psl->tStarts, psl->blockCount); psl->blockSizes[0] = refLen <= altLen ? refLen : altLen; psl->qStarts[0] = psl->qStart; @@ -1212,45 +1208,46 @@ del->blockCount = 2; AllocArray(del->blockSizes, del->blockCount); AllocArray(del->qStarts, del->blockCount); AllocArray(del->tStarts, del->blockCount); // I wonder if zero-length blockSizes would trigger crashes somewhere... del->blockSizes[0] = del->blockSizes[1] = 0; del->tStarts[0] = del->tStarts[1] = del->tStart; del->qStarts[0] = del->qStart; del->qStarts[1] = del->qEnd; return del; } static struct psl *mapToDeletion(struct psl *variantPsl, struct psl *txAli) /* If the variant falls on a transcript base that is deleted in the reference genome, + * (or upstream/downstream mapped to a zero-length point), * return the deletion coords (pslTransMap returns NULL), otherwise return NULL. */ { // variant start and end coords, in transcript coords: int vStart = variantPsl->tStart; int vEnd = variantPsl->tEnd; -// If txAli->strand is '-', reverse coords -if (txAli->strand[0] == '-') +boolean isRc = (pslQStrand(txAli) == '-'); +if (isRc) { vStart = variantPsl->tSize - variantPsl->tEnd; vEnd = variantPsl->tSize - variantPsl->tStart; } if (vEnd < txAli->qStart) - return pslDelFromCoord(txAli, txAli->qStart, variantPsl); + return pslDelFromCoord(txAli, isRc ? txAli->tEnd : txAli->tStart, variantPsl); else if (vStart > txAli->qEnd) - return pslDelFromCoord(txAli, txAli->qEnd, variantPsl); + return pslDelFromCoord(txAli, isRc ? txAli->tStart : txAli->tEnd, variantPsl); int i; for (i = 0; i < txAli->blockCount - 1; i++) { int qBlockEnd = txAli->qStarts[i] + txAli->blockSizes[i]; int qNextBlockStart = txAli->qStarts[i+1]; int tBlockEnd = txAli->tStarts[i] + txAli->blockSizes[i]; int tNextBlockStart = txAli->tStarts[i+1]; if (vStart >= qBlockEnd && vEnd <= qNextBlockStart && tBlockEnd == tNextBlockStart) return pslDelFromCoord(txAli, tBlockEnd, variantPsl); } // Not contained in a deletion from reference genome (txAli target) -- return NULL. return NULL; }