15f2972707e8b9a2f2f19074a62c9405f052f255 angie Wed Jun 1 13:55:56 2011 -0700 MLQ #4060 bugfix: user reported that Table Browser's GTF start_ and _stopcodons sometimes span introns, and we should be outputting discrete lines for split codons instead of big intron-spanners (http://mblab.wustl.edu/GTF22.html). Tested fix by comparing GTF dumps of hg18.refGene, hg18.knownGene and mm9.knownGene between hgwdev and hgwdev-angie, ensuring that the only changes were to split up too-large start_codon and stop_codon records. diff --git src/hg/hgTables/gffOut.c src/hg/hgTables/gffOut.c index a9d9a8a..f3e4567 100644 --- src/hg/hgTables/gffOut.c +++ src/hg/hgTables/gffOut.c @@ -75,151 +75,143 @@ nextPhase = (3 + ((nextPhase - cdsSize) % 3)) % 3; stopIndx = j; } else { frames[j] = '.'; } } if (retStartIndx) *retStartIndx = startIndx; if (retStopIndx) *retStopIndx = stopIndx; return frames; } -static int offsetToGenomic(struct bed *bed, int exonIndx, int anchor, int offset) -/* Return the genomic coord which is offset transcribed bases away from anchor, - * which must fall in the exonIndx exon of bed. If the offset is not contained - * by any exon, return -1. */ +static void addStartStopCodon(struct bed *bed, int exonIndx, int anchor, int offset, char *codon, + char *source, char *txName) +/* Output a start or stop codon -- if it is split across multiple exons, output multiple lines. + * anchor must fall in the exonIndx exon of bed. + * If offset is positive, we are computing an end coord; if negative, a start coord. */ { int simpleAnswer = anchor + offset; int exonStart = bed->chromStart + bed->chromStarts[exonIndx]; int exonEnd = exonStart + bed->blockSizes[exonIndx]; if ((offset >= 0 && (anchor >= exonEnd || anchor < exonStart)) || (offset < 0 && (anchor > exonEnd || anchor <= exonStart))) - errAbort("offsetToGenomic: anchor %d is not in exon %d [%d,%d]", + errAbort("addStartStopCodon: anchor %d is not in exon %d [%d,%d]", anchor, exonIndx, exonStart, exonEnd); -if (offset < 0 && simpleAnswer < exonStart) +if (offset < 0) + { + if (simpleAnswer < exonStart) + { + if (exonIndx >= 1) { - if (exonIndx < 1) - return -1; int stillNeeded = simpleAnswer - exonStart; - int prevExonEnd = bed->chromStart + bed->chromStarts[exonIndx-1] + bed->blockSizes[exonIndx-1]; - return offsetToGenomic(bed, exonIndx-1, prevExonEnd, stillNeeded); + int prevExonEnd = (bed->chromStart + bed->chromStarts[exonIndx-1] + + bed->blockSizes[exonIndx-1]); + addStartStopCodon(bed, exonIndx-1, prevExonEnd, stillNeeded, codon, source, txName); + } + addGffLineFromBed(bed, source, codon, exonStart, anchor, '.', txName); } -else if (offset > 0 && simpleAnswer > exonEnd) + else + addGffLineFromBed(bed, source, codon, simpleAnswer, anchor, '.', txName); + } +else if (offset > 0) + { + if (simpleAnswer > exonEnd) + { + addGffLineFromBed(bed, source, codon, anchor, exonEnd, '.', txName); + if (exonIndx < bed->blockCount - 1) { - if (exonIndx >= bed->blockCount - 1) - return -1; int stillNeeded = simpleAnswer - exonEnd; int nextExonStart = bed->chromStart + bed->chromStarts[exonIndx+1]; - return offsetToGenomic(bed, exonIndx+1, nextExonStart, stillNeeded); + addStartStopCodon(bed, exonIndx+1, nextExonStart, stillNeeded, codon, source, txName); + } + } + else + addGffLineFromBed(bed, source, codon, anchor, simpleAnswer, '.', txName); } -return simpleAnswer; } static void addCdsStartStop(struct bed *bed, char *source, int exonCdsStart, int exonCdsEnd, char *frames, int exonIndx, int cdsStartIndx, int cdsStopIndx, boolean gtf2StopCodons, char *txName) /* Output a CDS record for the trimmed CDS portion of exonIndx, and a start_codon or * stop_codon if applicable. */ { boolean isRc = (bed->strand[0] == '-'); /* start_codon (goes first for + strand) overlaps with CDS */ if ((exonIndx == cdsStartIndx) && !isRc) - { - int startCodonEnd = offsetToGenomic(bed, exonIndx, exonCdsStart, 3); - if (startCodonEnd >= 0 && startCodonEnd <= bed->thickEnd) - addGffLineFromBed(bed, source, "start_codon", exonCdsStart, startCodonEnd, '.', txName); - } + addStartStopCodon(bed, exonIndx, exonCdsStart, 3, "start_codon", source, txName); /* If gtf2StopCodons is set, then we follow the GTF2 convention of excluding * the stop codon from the CDS region. In other GFF flavors, stop_codon is * part of the CDS, which is the case in our table coords too. * This function would already be complicated enough without gtf2StopCodons, * due to strand and the possibility of a start or stop codon being split * across exons. Excluding the stop codon from CDS complicates it further * because cdsEnd on the current exon may affect the CDS line of the previous * or next exon. The cdsPortion* variables below are the CDS extremities that * may have the stop codon excised. */ if (exonIndx == cdsStopIndx) { if (isRc) { - int stopCodonEnd = offsetToGenomic(bed, exonIndx, exonCdsStart, 3); - int cdsPortionStart = exonCdsStart; - if (stopCodonEnd >= 0 && stopCodonEnd <= bed->thickEnd) - { - if (gtf2StopCodons) - cdsPortionStart = stopCodonEnd; - addGffLineFromBed(bed, source, "stop_codon", exonCdsStart, stopCodonEnd, '.', txName); - } + addStartStopCodon(bed, exonIndx, exonCdsStart, 3, "stop_codon", source, txName); + int cdsPortionStart = gtf2StopCodons ? exonCdsStart + 3 : exonCdsStart; if (cdsPortionStart < exonCdsEnd) addGffLineFromBed(bed, source, "CDS", cdsPortionStart, exonCdsEnd, frames[exonIndx], txName); } else { - int stopCodonStart = offsetToGenomic(bed, exonIndx, exonCdsEnd, -3); - int cdsPortionEnd = exonCdsEnd; - if (stopCodonStart >= 0 && stopCodonStart >= bed->thickStart) - { - if (gtf2StopCodons) - cdsPortionEnd = stopCodonStart; + int cdsPortionEnd = gtf2StopCodons ? exonCdsEnd - 3 : exonCdsEnd; if (cdsPortionEnd > exonCdsStart) addGffLineFromBed(bed, source, "CDS", exonCdsStart, cdsPortionEnd, frames[exonIndx], txName); - addGffLineFromBed(bed, source, "stop_codon", stopCodonStart, exonCdsEnd, '.', txName); - } - else if (cdsPortionEnd > exonCdsStart) - addGffLineFromBed(bed, source, "CDS", exonCdsStart, cdsPortionEnd, - frames[exonIndx], txName); + addStartStopCodon(bed, exonIndx, exonCdsEnd, -3, "stop_codon", source, txName); } } else { int cdsPortionStart = exonCdsStart; int cdsPortionEnd = exonCdsEnd; if (gtf2StopCodons) { if (isRc && exonIndx - 1 == cdsStopIndx) { int lastExonEnd = (bed->chromStart + bed->chromStarts[exonIndx-1] + bed->blockSizes[exonIndx-1]); int basesWereStolen = 3 - (lastExonEnd - bed->thickStart); if (basesWereStolen > 0) cdsPortionStart += basesWereStolen; } if (!isRc && exonIndx + 1 == cdsStopIndx) { int nextExonStart = bed->chromStart + bed->chromStarts[exonIndx+1]; int basesWillBeStolen = 3 - (bed->thickEnd - nextExonStart); if (basesWillBeStolen > 0) cdsPortionEnd -= basesWillBeStolen; } } if (cdsPortionEnd > cdsPortionStart) addGffLineFromBed(bed, source, "CDS", cdsPortionStart, cdsPortionEnd, frames[exonIndx], txName); } /* start_codon (goes last for - strand) overlaps with CDS */ if ((exonIndx == cdsStartIndx) && isRc) - { - int startCodonStart = offsetToGenomic(bed, exonIndx, exonCdsEnd, -3); - if (startCodonStart >= 0 && startCodonStart >= bed->thickStart) - addGffLineFromBed(bed, source, "start_codon", startCodonStart, exonCdsEnd, '.', txName); - } + addStartStopCodon(bed, exonIndx, exonCdsEnd, -3, "start_codon", source, txName); } static int bedToGffLines(struct bed *bedList, struct hTableInfo *hti, int fieldCount, char *source, boolean gtf2StopCodons) /* Translate a (list of) bed into gff and print out. * Note that field count (perhaps reduced by bitwise intersection) * can in effect override hti. */ { struct hash *nameHash = newHash(20); struct bed *bed; int i, exonStart, exonEnd; char txName[256]; int itemCount = 0; static int namelessIx = 0;