b1a9de0fd2ebab6d3caf4197eecc2e75eb74162d markd Mon May 24 13:53:35 2021 -0700 Added better handling of GTFs without correct frames on CDS. This also simplified the handling of GTF stop codons diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index 0e0ee50..0dba3b6 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -558,105 +558,30 @@ } static int findPosExon(struct genePred *gp, int pos) /* find exon containing the specified position */ { int i; for (i = 0; i < gp->exonCount; i++) { if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i])) return i; } errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name); return 0; } -static int findLastFramedExon(struct genePred *gp) -/* locate the last exon with frame */ -{ -int iStart = (gp->strand[0] == '+') ? 0 : gp->exonCount-1; -int iStop = (gp->strand[0] == '+') ? gp->exonCount : -1; -int dir = (gp->strand[0] == '+') ? +1 : -1; -int iPrevExon = -1, iExon; - -// find beginning of frames -for (iExon = iStart; (iExon != iStop) && (gp->exonFrames[iExon] == -1); iExon += dir) - continue; -// find last with frame -for (; (iExon != iStop) && (gp->exonFrames[iExon] != -1); iExon += dir) - iPrevExon = iExon; -if (iPrevExon < 0) - errAbort("bug: findLastFramedExon should have found an exon with frame"); -return iPrevExon; -} - -static int exonCdsStart(struct genePred *gp, int iExon) -/* get the CDS starting position in specified exon */ -{ -return (gp->cdsStart < gp->exonStarts[iExon]) ? gp->exonStarts[iExon] : gp->cdsStart; -} - -static int exonCdsEnd(struct genePred *gp, int iExon) -/* get the CDS ending position in specified exon */ -{ -return (gp->cdsEnd > gp->exonEnds[iExon]) ? gp->exonEnds[iExon] : gp->cdsEnd; -} - -static void extendFramePos(struct genePred *gp) -/* extend frame missing from last exon(s) for positive strand genes, normally - * caused by GTF stop_codon */ -{ -int iExon = findLastFramedExon(gp); -int frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); -for (iExon++; (iExon < gp->exonCount) && (gp->exonStarts[iExon] < gp->cdsEnd); iExon++) - { - if (!((gp->exonFrames[iExon] < 0) || (gp->exonFrames[iExon] == frame))) - errAbort("conflicting frame for %s exon index %d, was %d, trying to assign %d", gp->name, iExon, gp->exonFrames[iExon], frame); - gp->exonFrames[iExon] = frame; - frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); - } -} - -static void extendFrameNeg(struct genePred *gp) -/* extend frame missing from last exon(s) for negative strand genes, normally - * caused by GTF stop_codon */ -{ -int iExon = findLastFramedExon(gp); -int frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); -for (iExon--; (iExon >= 0) && (gp->exonEnds[iExon] > gp->cdsStart); iExon--) - { - if (!((gp->exonFrames[iExon] < 0) || (gp->exonFrames[iExon] == frame))) - errAbort("conflicting frame for %s exon index %d, was %d, trying to assign %d", gp->name, iExon, gp->exonFrames[iExon], frame); - gp->exonFrames[iExon] = frame; - frame = incrFrame(gp->exonFrames[iExon], (exonCdsEnd(gp, iExon) - exonCdsStart(gp, iExon))); - } -} - -static void fixStopFrame(struct genePred *gp) -/* Handle nasty corner case: GTF with the all or part of the stop codon as the - * only codon in an exon, we must set the frame on these exons. Some ENSEMBL - * GTFs have single base `exons' with just a base of the stop codon, so must - * check every base. While less than ideal, we just extend the frame past the - * last exon with frame, so a conflicting frame on the stop codon will be - * missed. Just switching to GFF3 fixes this mess. */ -{ -if (gp->strand[0] == '+') - extendFramePos(gp); -else - extendFrameNeg(gp); -} - static boolean adjImpliedStopPos(struct genePred *gp) /* implied stop codon adjustment for positive strand gene */ { int iExon = findPosExon(gp, gp->cdsEnd-1); unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd); if (space >= 3) { // room in current exon gp->cdsEnd += 3; return TRUE; } else if ((iExon < gp->exonCount) && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3) { // room including next exon @@ -696,59 +621,30 @@ { if (gp->optFields & genePredCdsStatFld) gp->cdsEndStat = cdsComplete; } } else { if (adjImpliedStopNeg(gp)) { if (gp->optFields & genePredCdsStatFld) gp->cdsStartStat = cdsComplete; } } } -static void checkForNoFrames(struct genePred *gp) -/* check for requesting frame from a file without frames. */ -{ -int i; -/* Complain if we have a CDS but exonFrames are all -1: */ -if (gp->cdsStart < gp->cdsEnd) - { - boolean foundReal = FALSE; - for (i = 0; i < gp->exonCount; i++) - { - if (gp->exonFrames[i] >= 0 && gp->exonFrames[i] <= 2) - { - foundReal = TRUE; - break; - } - } - if (! foundReal) - { - errAbort("Error: exonFrames field is being added, but I found a " - "gene (%s) with CDS but no valid frames. " - "This can happen if program is invoked with -genePredExt " - "but no valid frames are given in the file. If the 8th " - "field of GFF/GTF file is always a placeholder, then don't use " - "-genePredExt.", - gp->name); - } - } -} - static boolean isGtfFeature(char *feat) /* check if a feature is one of the recognized features. All otherse * should be ignored. http://mblab.wustl.edu/GTF22.html */ { static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR", "inter", "inter_CNS", "intron_CNS", "exon", NULL}; int i; for (i = 0; gtfFeats[i] != NULL; i++) { if (sameString(feat, gtfFeats[i])) return TRUE; } return FALSE; } @@ -790,50 +686,59 @@ } /* set frame from any overlapping records with frame. */ while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0)) { if (allowedFrameFeature(gl, isGtf)) { int frame = phaseToFrame(gl->frame); if (frame >= 0) *framePtr = frame; } gl = gl->next; } return gl; } -static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp) -/* Assign frame from GFF after genePred has been built.*/ +static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp, unsigned options) +/* Assign frame from GFF/GTF after genePred has been built.*/ { + struct gffLine *gl = group->lineList; int start, end, i; -boolean haveFrame = FALSE; +int numCdsExons = 0; +int numFrameExons = 0; for (i = 0; i < gp->exonCount; i++) { if (genePredCdsExon(gp, i, &start, &end)) { + numCdsExons += 1; gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf); if (gp->exonFrames[i] >= 0) - haveFrame = TRUE; + numFrameExons += 1; } } -/* make sure stop has frame if some exons in the gene had frame */ -if (haveFrame) - fixStopFrame(gp); -checkForNoFrames(gp); +/* Assign frames for exons without frames, either due to it incorrectly not + * being on cases. This also Handle the nasty corner caseof GTF with the all + * or part of the stop codon as the only codon in an exon, we must set the + * frame on these exons. Some ENSEMBL GTFs have single base `exons' (due to + * low quality assemblies) with just a base of the stop codon, so must check + * every base. While less than ideal, we just extend the frame past the last + * exon with frame, so a conflicting frame on the stop codon will be missed. + * Just switching to GFF3 fixes this mess. Return the count of exons that + * actually had frame set. */ +genePredFixExonFrames(gp); } static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name, boolean isGtf, char *exonSelectWord, unsigned optFields, unsigned options) /* common function to create genePreds from GFFs or GTFs. This is a little * ugly with to many check of isGtf, however the was way to much identical * code the other way. Options are from genePredFromGxfOpts */ { struct genePred *gp; long stopCodonStart = -1, stopCodonEnd = -1; long cdsStart = -1, cdsEnd = -1; int exonCount = 0; boolean haveStartCodon = FALSE, haveStopCodon = FALSE; struct gffLine *gl; @@ -1021,31 +926,31 @@ assert(i < exonCount); assert(gl->start >= eStarts[i]); if (gl->end > eEnds[i]) eEnds[i] = gl->end; } } } gp->exonCount = i+1; /* adjust for flybase type of GFFs, with stop codon implied and outside of * CDS. */ if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon) adjImpliedStopAfterCds(gp); if (optFields & genePredExonFramesFld) - assignFrame(isGtf, group, gp); + assignFrame(isGtf, group, gp, options); return gp; } struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name, char *exonSelectWord, unsigned optFields, unsigned options) /* Convert gff->groupList to genePred list. See genePred.h for details. */ { struct gffLine *gl; boolean anyExon = FALSE; /* Look to see if any exons. If not allow CDS to be used instead. */ if (exonSelectWord) { for (gl = group->lineList; gl != NULL; gl = gl->next) @@ -1760,31 +1665,31 @@ blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1]; return endDist < (gpSize - blockSize - 50); } else if(sameString(gp->strand, "-")) { blockSize = gp->exonEnds[0] - gp->exonStarts[0]; return startDist > (blockSize + 50); } else errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand); return FALSE; } void genePredAddExonFrames(struct genePred *gp) /* Add exonFrames array to a genePred that doesn't have it. Frame is assumed - * to be contiguous. */ + * to be contiguous. NOTE: suggest using genePredFixExonFrames for new code. */ { int iExon, start, end, iBase = 0; int iStart, iEnd, iDir; /* initial array to all -1, for no frame */ AllocArray(gp->exonFrames, gp->exonCount); gp->optFields |= genePredExonFramesFld; for (iExon = 0; iExon < gp->exonCount; iExon++) gp->exonFrames[iExon] = -1; if (sameString(gp->strand, "+")) { iStart = 0; iEnd = gp->exonCount; iDir = 1; @@ -1793,30 +1698,80 @@ { iStart = gp->exonCount-1; iEnd = -1; iDir = -1; } for (iExon = iStart; iExon != iEnd; iExon += iDir) { if (genePredCdsExon(gp, iExon, &start, &end)) { gp->exonFrames[iExon] = iBase % 3; iBase += (end - start); } } } +void genePredFixExonFrames(struct genePred *gp) +/* Add exonFrames array to a genePred that has frame on only some or no + * features. Frame is assumed to be contiguous when an existing frame is not + * present. */ +{ +int iStart, iEnd, iDir; + +if (gp->exonFrames == NULL) + { + /* doesn't currently have frames */ + AllocArray(gp->exonFrames, gp->exonCount); + gp->optFields |= genePredExonFramesFld; + for (int iExon = 0; iExon < gp->exonCount; iExon++) + gp->exonFrames[iExon] = -1; + } + +if (sameString(gp->strand, "+")) + { + iStart = 0; + iEnd = gp->exonCount; + iDir = 1; + } +else + { + iStart = gp->exonCount - 1; + iEnd = -1; + iDir = -1; + } +int nextFrame = -1; +for (int iExon = iStart; iExon != iEnd; iExon += iDir) + { + int start, end; + if (genePredCdsExon(gp, iExon, &start, &end)) + { + if (gp->exonFrames[iExon] < 0) + { + // no existing frame + if (nextFrame < 0) + nextFrame = 0; // first frame + gp->exonFrames[iExon] = nextFrame; + } + nextFrame = incrFrame(gp->exonFrames[iExon], end - start); + } + else + { + gp->exonFrames[iExon] = -1; + } + } +} + void genePredRc(struct genePred *gp, int chromSize) /* Reverse complement a genePred (project it to the opposite strand). Useful * when doing analysis that is simplified by having things on the same strand. */ { int iExon; gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+'; reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize); reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize); for (iExon = 0; iExon < gp->exonCount; iExon++) reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize); reverseUnsigned(gp->exonStarts, gp->exonCount); reverseUnsigned(gp->exonEnds, gp->exonCount); if (gp->optFields & genePredCdsStatFld) {