15f2972707e8b9a2f2f19074a62c9405f052f255
angie
  Wed Jun 1 13:55:56 2011 -0700
MLQ #4060 bugfix: user reported that Table Browser's GTF start_ and _stopcodons sometimes span introns, and we should be outputting discrete lines
for split codons instead of big intron-spanners (http://mblab.wustl.edu/GTF22.html).
Tested fix by comparing GTF dumps of hg18.refGene, hg18.knownGene and mm9.knownGene
between hgwdev and hgwdev-angie, ensuring that the only changes were to split up
too-large start_codon and stop_codon records.

diff --git src/hg/hgTables/gffOut.c src/hg/hgTables/gffOut.c
index a9d9a8a..f3e4567 100644
--- src/hg/hgTables/gffOut.c
+++ src/hg/hgTables/gffOut.c
@@ -75,151 +75,143 @@
 	nextPhase = (3 + ((nextPhase - cdsSize) % 3)) % 3;
 	stopIndx = j;
 	}
     else
 	{
 	frames[j] = '.';
 	}
     }
 if (retStartIndx)
     *retStartIndx = startIndx;
 if (retStopIndx)
     *retStopIndx = stopIndx;
 return frames;
 }
 
-static int offsetToGenomic(struct bed *bed, int exonIndx, int anchor, int offset)
-/* Return the genomic coord which is offset transcribed bases away from anchor,
- * which must fall in the exonIndx exon of bed.  If the offset is not contained
- * by any exon, return -1. */
+static void addStartStopCodon(struct bed *bed, int exonIndx, int anchor, int offset, char *codon,
+			      char *source, char *txName)
+/* Output a start or stop codon -- if it is split across multiple exons, output multiple lines. 
+ * anchor must fall in the exonIndx exon of bed.
+ * If offset is positive, we are computing an end coord; if negative, a start coord. */
 {
 int simpleAnswer = anchor + offset;
 int exonStart = bed->chromStart + bed->chromStarts[exonIndx];
 int exonEnd = exonStart + bed->blockSizes[exonIndx];
 if ((offset >= 0 && (anchor >= exonEnd || anchor < exonStart)) ||
     (offset < 0 && (anchor > exonEnd || anchor <= exonStart)))
-    errAbort("offsetToGenomic: anchor %d is not in exon %d [%d,%d]",
+    errAbort("addStartStopCodon: anchor %d is not in exon %d [%d,%d]",
 	     anchor, exonIndx, exonStart, exonEnd);
-if (offset < 0 && simpleAnswer < exonStart)
+if (offset < 0)
+    {
+    if (simpleAnswer < exonStart)
+	{
+	if (exonIndx >= 1)
     {
-    if (exonIndx < 1)
-	return -1;
     int stillNeeded = simpleAnswer - exonStart;
-    int prevExonEnd = bed->chromStart + bed->chromStarts[exonIndx-1] + bed->blockSizes[exonIndx-1];
-    return offsetToGenomic(bed, exonIndx-1, prevExonEnd, stillNeeded);
+	    int prevExonEnd = (bed->chromStart + bed->chromStarts[exonIndx-1] +
+			       bed->blockSizes[exonIndx-1]);
+	    addStartStopCodon(bed, exonIndx-1, prevExonEnd, stillNeeded, codon, source, txName);
+	    }
+	addGffLineFromBed(bed, source, codon, exonStart, anchor, '.', txName);
     }
-else if (offset > 0 && simpleAnswer > exonEnd)
+    else
+	addGffLineFromBed(bed, source, codon, simpleAnswer, anchor, '.', txName);
+    }
+else if (offset > 0)
+    {
+    if (simpleAnswer > exonEnd)
+	{
+	addGffLineFromBed(bed, source, codon, anchor, exonEnd, '.', txName);
+	if (exonIndx < bed->blockCount - 1)
     {
-    if (exonIndx >= bed->blockCount - 1)
-	return -1;
     int stillNeeded = simpleAnswer - exonEnd;
     int nextExonStart = bed->chromStart + bed->chromStarts[exonIndx+1];
-    return offsetToGenomic(bed, exonIndx+1, nextExonStart, stillNeeded);
+	    addStartStopCodon(bed, exonIndx+1, nextExonStart, stillNeeded, codon, source, txName);
+	    }
+	}
+    else
+	addGffLineFromBed(bed, source, codon, anchor, simpleAnswer, '.', txName);
     }
-return simpleAnswer;
 }
 
 static void addCdsStartStop(struct bed *bed, char *source, int exonCdsStart, int exonCdsEnd,
 			    char *frames, int exonIndx, int cdsStartIndx, int cdsStopIndx,
 			    boolean gtf2StopCodons, char *txName)
 /* Output a CDS record for the trimmed CDS portion of exonIndx, and a start_codon or
  * stop_codon if applicable. */
 {
 boolean isRc = (bed->strand[0] == '-');
 /* start_codon (goes first for + strand) overlaps with CDS */
 if ((exonIndx == cdsStartIndx) && !isRc)
-    {
-    int startCodonEnd = offsetToGenomic(bed, exonIndx, exonCdsStart, 3);
-    if (startCodonEnd >= 0 && startCodonEnd <= bed->thickEnd)
-	addGffLineFromBed(bed, source, "start_codon", exonCdsStart, startCodonEnd, '.', txName);
-    }
+    addStartStopCodon(bed, exonIndx, exonCdsStart, 3, "start_codon", source, txName);
 /* If gtf2StopCodons is set, then we follow the GTF2 convention of excluding
  * the stop codon from the CDS region.  In other GFF flavors, stop_codon is
  * part of the CDS, which is the case in our table coords too.
  * This function would already be complicated enough without gtf2StopCodons,
  * due to strand and the possibility of a start or stop codon being split
  * across exons.  Excluding the stop codon from CDS complicates it further
  * because cdsEnd on the current exon may affect the CDS line of the previous
  * or next exon.  The cdsPortion* variables below are the CDS extremities that
  * may have the stop codon excised. */
 if (exonIndx == cdsStopIndx)
     {
     if (isRc)
 	{
-	int stopCodonEnd = offsetToGenomic(bed, exonIndx, exonCdsStart, 3);
-	int cdsPortionStart = exonCdsStart;
-	if (stopCodonEnd >= 0 && stopCodonEnd <= bed->thickEnd)
-	    {
-	    if (gtf2StopCodons)
-		cdsPortionStart = stopCodonEnd;
-	    addGffLineFromBed(bed, source, "stop_codon", exonCdsStart, stopCodonEnd, '.', txName);
-	    }
+	addStartStopCodon(bed, exonIndx, exonCdsStart, 3, "stop_codon", source, txName);
+	int cdsPortionStart = gtf2StopCodons ? exonCdsStart + 3 : exonCdsStart;
 	if (cdsPortionStart < exonCdsEnd)
 	    addGffLineFromBed(bed, source, "CDS", cdsPortionStart, exonCdsEnd,
 			      frames[exonIndx], txName);
 	}
     else
 	{
-	int stopCodonStart = offsetToGenomic(bed, exonIndx, exonCdsEnd, -3);
-	int cdsPortionEnd = exonCdsEnd;
-	if (stopCodonStart >= 0 && stopCodonStart >= bed->thickStart)
-	    {
-	    if (gtf2StopCodons)
-		cdsPortionEnd = stopCodonStart;
+	int cdsPortionEnd = gtf2StopCodons ? exonCdsEnd - 3 : exonCdsEnd;
 	    if (cdsPortionEnd > exonCdsStart)
 		addGffLineFromBed(bed, source, "CDS", exonCdsStart, cdsPortionEnd,
 				  frames[exonIndx], txName);
-	    addGffLineFromBed(bed, source, "stop_codon", stopCodonStart, exonCdsEnd, '.', txName);
-	    }
-	else if (cdsPortionEnd > exonCdsStart)
-	    addGffLineFromBed(bed, source, "CDS", exonCdsStart, cdsPortionEnd,
-			      frames[exonIndx], txName);
+	addStartStopCodon(bed, exonIndx, exonCdsEnd, -3, "stop_codon", source, txName);
 	}
     }
 else
     {
     int cdsPortionStart = exonCdsStart;
     int cdsPortionEnd = exonCdsEnd;
     if (gtf2StopCodons)
 	{
 	if (isRc && exonIndx - 1 == cdsStopIndx)
 	    {
 	    int lastExonEnd = (bed->chromStart + bed->chromStarts[exonIndx-1] +
 			       bed->blockSizes[exonIndx-1]);
 	    int basesWereStolen = 3 - (lastExonEnd - bed->thickStart);
 	    if (basesWereStolen > 0)
 		cdsPortionStart += basesWereStolen;
 	    }
 	if (!isRc && exonIndx + 1 == cdsStopIndx)
 	    {
 	    int nextExonStart = bed->chromStart + bed->chromStarts[exonIndx+1];
 	    int basesWillBeStolen = 3 - (bed->thickEnd - nextExonStart);
 	    if (basesWillBeStolen > 0)
 		cdsPortionEnd -= basesWillBeStolen;
 	    }
 	}
     if (cdsPortionEnd > cdsPortionStart)
 	addGffLineFromBed(bed, source, "CDS", cdsPortionStart, cdsPortionEnd,
 			  frames[exonIndx], txName);
     }
 /* start_codon (goes last for - strand) overlaps with CDS */
 if ((exonIndx == cdsStartIndx) && isRc)
-    {
-    int startCodonStart = offsetToGenomic(bed, exonIndx, exonCdsEnd, -3);
-    if (startCodonStart >= 0 && startCodonStart >= bed->thickStart)
-	addGffLineFromBed(bed, source, "start_codon", startCodonStart, exonCdsEnd, '.', txName);
-    }
+    addStartStopCodon(bed, exonIndx, exonCdsEnd, -3, "start_codon", source, txName);
 }
 
 
 static int bedToGffLines(struct bed *bedList, struct hTableInfo *hti,
 			 int fieldCount, char *source, boolean gtf2StopCodons)
 /* Translate a (list of) bed into gff and print out.
  * Note that field count (perhaps reduced by bitwise intersection)
  * can in effect override hti. */
 {
 struct hash *nameHash = newHash(20);
 struct bed *bed;
 int i, exonStart, exonEnd;
 char txName[256];
 int itemCount = 0;
 static int namelessIx = 0;