ee6b3270d5a37e7e28c7af65aead341fa682295f markd Thu Sep 27 17:09:26 2018 -0700 Fixed GTF conversion problem where final CDS exon had incorrect frame. occurred when The CDS started in the exon that contains the last codon before the stop codon. The stop code was either spliced or the only codon in the exon. This changed 44 gencode V28 transcripts, many of which were manually validated in the browser. diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index 07ecbef..dc2584b 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -577,57 +577,69 @@ int iStop = (gp->strand[0] == '+') ? gp->exonCount : -1; int dir = (gp->strand[0] == '+') ? +1 : -1; int iPrevExon = -1, iExon; // find beginning of frames for (iExon = iStart; (iExon != iStop) && (gp->exonFrames[iExon] == -1); iExon += dir) continue; // find last with frame for (; (iExon != iStop) && (gp->exonFrames[iExon] != -1); iExon += dir) iPrevExon = iExon; if (iPrevExon < 0) errAbort("bug: findLastFramedExon should have found an exon with frame"); return iPrevExon; } +static int exonCdsStart(struct genePred *gp, int iExon) +/* get the CDS starting position in specified exon */ +{ +return (gp->cdsStart < gp->exonStarts[iExon]) ? gp->exonStarts[iExon] : gp->cdsStart; +} + +static int exonCdsEnd(struct genePred *gp, int iExon) +/* get the CDS ending position in specified exon */ +{ +return (gp->cdsEnd > gp->exonEnds[iExon]) ? gp->exonEnds[iExon] : gp->cdsEnd; +} + static void extendFramePos(struct genePred *gp) /* extend frame missing from last exon(s) for positive strand genes, normally * caused by GTF stop_codon */ { int iExon = findLastFramedExon(gp); -int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); +int frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); for (iExon++; (iExon < gp->exonCount) && (gp->exonStarts[iExon] < gp->cdsEnd); iExon++) { if (!((gp->exonFrames[iExon] < 0) || (gp->exonFrames[iExon] == frame))) errAbort("conflicting frame for %s exon index %d, was %d, trying to assign %d", gp->name, iExon, gp->exonFrames[iExon], frame); gp->exonFrames[iExon] = frame; - frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); + frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); } } static void extendFrameNeg(struct genePred *gp) /* extend frame missing from last exon(s) for negative strand genes, normally * caused by GTF stop_codon */ { int iExon = findLastFramedExon(gp); -int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); +int frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); for (iExon--; (iExon >= 0) && (gp->exonEnds[iExon] > gp->cdsStart); iExon--) { if (!((gp->exonFrames[iExon] < 0) || (gp->exonFrames[iExon] == frame))) errAbort("conflicting frame for %s exon index %d, was %d, trying to assign %d", gp->name, iExon, gp->exonFrames[iExon], frame); gp->exonFrames[iExon] = frame; - frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); + frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); } } static void fixStopFrame(struct genePred *gp) /* Handle nasty corner case: GTF with the all or part of the stop codon as the * only codon in an exon, we must set the frame on these exons. Some ENSEMBL * GTFs have single base `exons' with just a base of the stop codon, so must * check every base. While less than ideal, we just extend the frame past the * last exon with frame, so a conflicting frame on the stop codon will be * missed. Just switching to GFF3 fixes this mess. */ { if (gp->strand[0] == '+') extendFramePos(gp); else extendFrameNeg(gp);