src/hg/lib/genePred.c 7f7a9a5579d7acec698c51dbf93f5030ce1a0439

7f7a9a5579d7acec698c51dbf93f5030ce1a0439
braney
  Tue Apr 8 15:43:17 2025 -0700
support genePred in quickLift

diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index 400f8a23c71..65dadcf876d 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -1,2461 +1,2467 @@
 /* genePred.c was originally generated by the autoSql program, which also 
  * generated genePred.h and genePred.sql.  This module links the database and the RAM 
  * representation of objects. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 #include "common.h"
 #include "gff.h"
 #include "jksql.h"
 #include "psl.h"
 #include "linefile.h"
 #include "genePred.h"
 #include "genbank.h"
 #include "rangeTree.h"
 #include "hdb.h"
 #include "chromInfo.h"
 
 
 struct genePred *genePredLoad(char **row)
 /* Load a genePred from row fetched with select * from genePred
  * from database.  Dispose of this with genePredFree(). 
  * NOTE: cannabalizes the row argument */
 {
 struct genePred *ret;
 int sizeOne;
 
 AllocVar(ret);
 ret->exonCount = sqlUnsigned(row[7]);
 ret->name = cloneString(row[0]);
 ret->chrom = cloneString(row[1]);
 strcpy(ret->strand, row[2]);
 ret->txStart = sqlUnsigned(row[3]);
 ret->txEnd = sqlUnsigned(row[4]);
 ret->cdsStart = sqlUnsigned(row[5]);
 ret->cdsEnd = sqlUnsigned(row[6]);
 sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
 if (sizeOne != ret->exonCount)
     errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
              ret->name, sizeOne, ret->exonCount);
 sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
 if (sizeOne != ret->exonCount)
     errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
              ret->name, sizeOne, ret->exonCount);
 return ret;
 }
 
 struct genePred *genePredLoadAll(char *fileName) 
 /* Load all genePred from a whitespace-separated file.
  * Dispose of this with genePredFreeList(). */
 {
 struct genePred *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[10];
 
 while (lineFileRow(lf, row))
     {
     el = genePredLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 struct genePred *genePredLoadAllByChar(char *fileName, char chopper) 
 /* Load all genePred from a chopper separated file.
  * Dispose of this with genePredFreeList(). */
 {
 struct genePred *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[10];
 
 while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
     {
     el = genePredLoad(row);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 struct genePred *genePredCommaIn(char **pS, struct genePred *ret)
 /* Create a genePred out of a comma separated string. 
  * This will fill in ret if non-null, otherwise will
  * return a new genePred */
 {
 char *s = *pS;
 int i;
 
 if (ret == NULL)
     AllocVar(ret);
 ret->name = sqlStringComma(&s);
 ret->chrom = sqlStringComma(&s);
 sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
 ret->txStart = sqlUnsignedComma(&s);
 ret->txEnd = sqlUnsignedComma(&s);
 ret->cdsStart = sqlUnsignedComma(&s);
 ret->cdsEnd = sqlUnsignedComma(&s);
 ret->exonCount = sqlUnsignedComma(&s);
 s = sqlEatChar(s, '{');
 AllocArray(ret->exonStarts, ret->exonCount);
 for (i=0; i<ret->exonCount; ++i)
     {
     ret->exonStarts[i] = sqlUnsignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 s = sqlEatChar(s, '{');
 AllocArray(ret->exonEnds, ret->exonCount);
 for (i=0; i<ret->exonCount; ++i)
     {
     ret->exonEnds[i] = sqlUnsignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 *pS = s;
 return ret;
 }
 
 void genePredFree(struct genePred **pEl)
 /* Free a single dynamically allocated genePred such as created
  * with genePredLoad(). */
 {
 struct genePred *el;
 
 if ((el = *pEl) == NULL) return;
 freeMem(el->name);
 freeMem(el->chrom);
 freeMem(el->exonStarts);
 freeMem(el->exonEnds);
 freeMem(el->name2);
 freeMem(el->exonFrames);
 freez(pEl);
 }
 
 void genePredFreeList(struct genePred **pList)
 /* Free a list of dynamically allocated genePred's */
 {
 struct genePred *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     genePredFree(&el);
     }
 *pList = NULL;
 }
 
 void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep) 
 /* Print out genePred.  Separate fields with sep. Follow last field with lastSep. */
 {
 int i;
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->name);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->chrom);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->strand);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%u", el->txStart);
 fputc(sep,f);
 fprintf(f, "%u", el->txEnd);
 fputc(sep,f);
 fprintf(f, "%u", el->cdsStart);
 fputc(sep,f);
 fprintf(f, "%u", el->cdsEnd);
 fputc(sep,f);
 fprintf(f, "%u", el->exonCount);
 fputc(sep,f);
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->exonCount; ++i)
     {
     fprintf(f, "%u", el->exonStarts[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 fputc(sep,f);
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->exonCount; ++i)
     {
     fprintf(f, "%u", el->exonEnds[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 
 /* optional fields, >= test is used so unspecified coumns can be filled in */
 if (el->optFields >= genePredScoreFld)
     {
     fputc(sep,f);
     fprintf(f, "%d", el->score);
     }
 if (el->optFields >= genePredName2Fld)
     {
     fputc(sep,f);
     if (sep == ',') fputc('"',f);
     fprintf(f, "%s", ((el->name2 != NULL) ? el->name2 : ""));
     if (sep == ',') fputc('"',f);
     }
 if (el->optFields >= genePredCdsStatFld)
     {
     fputc(sep,f);
     if (sep == ',') fputc('"',f);
     fprintf(f, "%s", genePredCdsStatStr(el->cdsStartStat));
     if (sep == ',') fputc('"',f);
     fputc(sep,f);
     if (sep == ',') fputc('"',f);
     fprintf(f, "%s", genePredCdsStatStr(el->cdsEndStat));
     if (sep == ',') fputc('"',f);
     }
 if (el->optFields >= genePredExonFramesFld)
     {
     fputc(sep,f);
     if (sep == ',') fputc('{',f);
     for (i=0; i<el->exonCount; ++i)
         {
         fprintf(f, "%d", el->exonFrames[i]);
         fputc(',', f);
         }
     if (sep == ',') fputc('}',f);
     }
 fputc(lastSep,f);
 }
 
 /* ---------  Start of hand generated code. ---------------------------- */
 
 char *genePredCdsStatStr(enum cdsStatus stat)
 /* get string value of a cdsStatus */
 {
 switch (stat) {
 case cdsNone:
     return "none";
 case cdsUnknown:
     return "unk";
 case cdsIncomplete:
     return "incmpl";
 case cdsComplete:
     return "cmpl";
 default:
     errAbort("invalid cdsStatus enum value: %d", stat);
     return NULL;  /* make compiler happy */
 }
 }
 
 enum cdsStatus parseCdsStat(char *statStr)
 /* parse a cdsStatus string */
 {
 if ((statStr == NULL) || sameString(statStr, "none"))
     return cdsNone;
 if (sameString(statStr, "unk"))
     return cdsUnknown;
 if (sameString(statStr, "incmpl"))
     return cdsIncomplete;
 if (sameString(statStr, "cmpl"))
     return cdsComplete;
 
 errAbort("invalid genePred cdsStatus: \"%s\"", statStr);
 return cdsNone;  /* make compiler happy */
 }
 
 struct genePred *genePredKnownLoad(char **row, int numCols)
 /* Load a genePred in knownGene format from row. */
 {
 struct genePred *ret;
 int sizeOne;
 
 AllocVar(ret);
 ret->exonCount = sqlUnsigned(row[7]);
 ret->name = cloneString(row[0]);
 ret->chrom = cloneString(row[1]);
 strcpy(ret->strand, row[2]);
 ret->txStart = sqlUnsigned(row[3]);
 ret->txEnd = sqlUnsigned(row[4]);
 ret->cdsStart = sqlUnsigned(row[5]);
 ret->cdsEnd = sqlUnsigned(row[6]);
 sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
 if (sizeOne != ret->exonCount)
     errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
              ret->name, sizeOne, ret->exonCount);
 sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
 if (sizeOne != ret->exonCount)
     errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
              ret->name, sizeOne, ret->exonCount);
 
 ret->name2 = cloneString(row[11]);
 return ret;
 }
 struct genePred *genePredExtLoad(char **row, int numCols)
 /* Load a genePred with from a row, with optional fields.  The row must
  * contain columns in the order in the struct, and they must be present up to
  * the last specfied optional field.  Missing intermediate fields must have
  * zero or empty columns, they may not be omitted.  Fields at the end can be
  * omitted. Dispose of this with genePredFree(). */
 {
 struct genePred *ret;
 int sizeOne, iCol;
 
 AllocVar(ret);
 ret->exonCount = sqlUnsigned(row[7]);
 ret->name = cloneString(row[0]);
 ret->chrom = cloneString(row[1]);
 strcpy(ret->strand, row[2]);
 ret->txStart = sqlUnsigned(row[3]);
 ret->txEnd = sqlUnsigned(row[4]);
 ret->cdsStart = sqlUnsigned(row[5]);
 ret->cdsEnd = sqlUnsigned(row[6]);
 sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
 if (sizeOne != ret->exonCount)
     errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
              ret->name, sizeOne, ret->exonCount);
 sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
 if (sizeOne != ret->exonCount)
     errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
              ret->name, sizeOne, ret->exonCount);
 
 iCol=GENEPRED_NUM_COLS;
 if (iCol < numCols)
     {
     ret->score = sqlSigned(row[iCol++]);
     ret->optFields |= genePredScoreFld;
     }
 if (iCol < numCols)
     {
     ret->name2 = cloneString(row[iCol++]);
     ret->optFields |= genePredName2Fld;
     }
 if (iCol < numCols)
     {
     if (isEmpty(row[iCol])) // if the cdsStartStat field is empty
 	return ret;         // ignore the rest of the fields
     ret->cdsStartStat = parseCdsStat(row[iCol++]);
     ret->optFields |= genePredCdsStatFld;
     }
 if (iCol < numCols)
     {
     ret->cdsEndStat = parseCdsStat(row[iCol++]);
     ret->optFields |= genePredCdsStatFld;
     }
 if (iCol < numCols)
     {
     sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne);
     if (sizeOne != ret->exonCount)
         errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)",
                  ret->name, sizeOne, ret->exonCount);
     ret->optFields |= genePredExonFramesFld;
     }
 return ret;
 }
 
+struct genePred *genePredExtLoad15(char **row)
+/* Load a genePred record assumed to be 15 fields. */
+{
+return  genePredExtLoad(row, 15);
+}
+
 struct genePred *genePredKnownLoadAll(char *fileName)
 /* Load all genePreds with from tab-separated file in knownGene format */
 {
 struct genePred *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[GENEPREDX_NUM_COLS];
 int numCols;
 
 while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0)
     {
     lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols);
     el = genePredKnownLoad(row, numCols);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 struct genePred *genePredExtLoadAll(char *fileName)
 /* Load all genePreds with from tab-separated file, possibly with optional
  * fields. Dispose of this with genePredFreeList(). */
 {
 struct genePred *list = NULL, *el;
 struct lineFile *lf = lineFileOpen(fileName, TRUE);
 char *row[GENEPREDX_NUM_COLS];
 int numCols;
 
 while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0)
     {
     lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols);
     el = genePredExtLoad(row, numCols);
     slAddHead(&list, el);
     }
 lineFileClose(&lf);
 slReverse(&list);
 return list;
 }
 
 int genePredCmp(const void *va, const void *vb)
 /* Compare to sort based on chromosome, txStart. */
 {
 const struct genePred *a = *((struct genePred **)va);
 const struct genePred *b = *((struct genePred **)vb);
 int dif = strcmp(a->chrom, b->chrom);
 if (dif == 0)
     dif = a->txStart - b->txStart;
 return dif;
 }
 
 int genePredNameCmp(const void *va, const void *vb)
 /* Compare to sort based on name, then chromosome, txStart. */
 {
 const struct genePred *a = *((struct genePred **)va);
 const struct genePred *b = *((struct genePred **)vb);
 int dif = strcmp(a->name, b->name);
 if (dif == 0)
     dif = strcmp(a->chrom, b->chrom);
 if (dif == 0)
     dif = a->txStart - b->txStart;
 return dif;
 }
 
 static boolean haveStartStopCodons(struct gffFile *gff)
 /* For GFFs, determine if any of the annotations use start_codon or                                                                                                                 
  * stop_codon */
 {
 struct gffFeature *feature;
 for(feature = gff->featureList; feature != NULL; feature = feature->next)
     {
     if(sameWord(feature->name, "start_codon") ||
        sameWord(feature->name, "start_codon"))
         return TRUE;
     }
 return FALSE;
 }
 
 static boolean isExon(char *feat, boolean isGtf, char *exonSelectWord)
 /* determine if a feature is an exon; different criteria for GFF ane GTF */
 {
 if (isGtf)
     {
     /* must check for stop_codon here, because GTF put it outside of the
      * CDS */
     return (sameWord(feat, "CDS") || sameWord(feat, "stop_codon")
             || sameWord(feat, "start_codon") || sameWord(feat, "exon")
             || sameWord(feat, "utr") || sameWord(feat, "5utr")
             || sameWord(feat, "3utr"));
     }
 else
     {
     return ((exonSelectWord == NULL) || sameWord(feat, exonSelectWord));
     }
 }
 
 static boolean isStartStopCodon(char *feat)
 /* determine if a feature is GTF style start/stop codon */
 {
 return sameWord(feat, "stop_codon") || sameWord(feat, "start_codon");
 }
 
 static boolean isCds(char *feat)
 /* determine if a feature is CDS */
 {
 return sameWord(feat, "CDS") || isStartStopCodon(feat);
 }
 
 static void chkGroupLine(struct gffGroup *group, struct gffLine *gl, struct genePred *gp)
 /* check that a gffLine is consistent with the genePred being built.  this
  * helps detect some problems that lead to corrupt genePreds */
 {
 if (!(sameString(gl->seq, gp->chrom) && (gl->strand == gp->strand[0])))
     {
     fprintf(stderr, "invalid gffGroup detected on line: ");
     gffTabOut(gl, stderr);
     errAbort("GFF/GTF group %s on %s%c, this line is on %s%c, all group members must be on same seq and strand",
              group->name, gp->chrom, gp->strand[0],
                      gl->seq, gl->strand);
     }
 }
 
 static int phaseToFrame(char phase)
 /* convert GTF/GFF phase to frame */
 {
 switch (phase)
     {
     case '0':
         return 0;
     case '1':
         return 2;
     case '2':
         return 1;
     }
 return -1;
 }
 
 #ifdef NOT_CURRENTLY_USED
 static char frameToPhase(int frame)
 /* convert GTF/GFF frame to phase */
 {
 switch (frame)
     {
     case 0:
         return '0';
     case 1:
         return '2';
     case 2:
         return '1';
     }
 return -1;
 }
 #endif
 
 static int incrFrame(int frame, int amt)
 /* increment frame, no-op if frame is -1 */
 {
 if (frame < 0)
     return frame;
 else if (amt >= 0)
     return (frame+amt) % 3;
 else
     {
     int amt3 = (-amt) % 3;
     return (frame - (amt - amt3)) % 3;
     }
 }
 
 static int findPosExon(struct genePred *gp, int pos)
 /* find exon containing the specified position */
 {
 int i;
 for (i = 0; i < gp->exonCount; i++)
     {
     if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i]))
         return i;
     }
 errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name);
 return 0;
 }
 
 static boolean adjImpliedStopPos(struct genePred *gp)
 /* implied stop codon adjustment for positive strand gene */
 {
 int iExon = findPosExon(gp, gp->cdsEnd-1);
 unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd);
 if (space >= 3)
     {
     // room in current exon
     gp->cdsEnd += 3;
     return TRUE;
     }
 else if ((iExon < gp->exonCount)
          && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3)
     {
     // room including next exon
     gp->cdsEnd = gp->exonStarts[iExon+1] + (3-space);
     return TRUE;
     }
 return FALSE;
 }
 
 static boolean adjImpliedStopNeg(struct genePred *gp)
 /* implied stop codon adjustment for negative strand gene */
 {
 int iExon = findPosExon(gp, gp->cdsStart);
 unsigned space = (gp->cdsStart-gp->exonStarts[iExon]);
 if (space >= 3)
     {
     // room in current exon
     gp->cdsStart -= 3;
     return TRUE;
     }
 else if ((iExon > 0)
          && ((gp->exonEnds[iExon-1]-gp->exonStarts[iExon-1])+space) >= 3)
     {
     // room including previous exon
     gp->cdsStart = gp->exonEnds[iExon-1] - (3-space);
     return TRUE;
     }
 return FALSE;
 }
 
 static void adjImpliedStopAfterCds(struct genePred *gp)
 /* adjust CDS bounds to include implied stop codon outside of CDS. */
 {
 if (gp->strand[0] == '+')
     {
     if (adjImpliedStopPos(gp))
         {
         if (gp->optFields & genePredCdsStatFld)
             gp->cdsEndStat = cdsComplete;
         }
     }
 else
     {
     if (adjImpliedStopNeg(gp))
         {
         if (gp->optFields & genePredCdsStatFld)
             gp->cdsStartStat = cdsComplete;
         }
     }
 }
 
 static boolean isGtfFeature(char *feat)
 /* check if a feature is one of the recognized features.  All otherse
  * should be ignored. http://mblab.wustl.edu/GTF22.html */
 {
 static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
                            "inter", "inter_CNS", "intron_CNS", "exon", NULL};
 int i;
 for (i = 0; gtfFeats[i] != NULL; i++)
     {
     if (sameString(feat, gtfFeats[i]))
         return TRUE;
     }
 return FALSE;
 }
 
 static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf)
 /* should the specified line be skipped? */
 {
 if (isGtf)
     return !isGtfFeature(gl->feature);
 else
     return FALSE;
 }
 
 static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf)
 /* should this feature be used to assign frame? */
 {
 if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf))
     return FALSE;
 
 // for GFF, any features with frame can be used to set frame,
 // with GTF, there are some bogus files that set frame on UTR features,
 // so only use CDS
 if (isGtf)
     return sameString(gl->feature, "CDS");
 else
     return (gl->frame != '.');
 }
 
 static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
                                              struct gffLine *gl, boolean isGtf)
 /* set frame from any overlapping records with frame. Don't include
  * start/stop_codon, as it isn't a full exon. */
 {
 /* skip lines preceeding this exon */
 while (gl != NULL)
     {
     if (allowedFrameFeature(gl, isGtf) && (gl->end >= start))
         break;
     gl = gl->next;
 }
 /* set frame from any overlapping records with frame. */
 while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
     {
     if (allowedFrameFeature(gl, isGtf))
         {
         int frame = phaseToFrame(gl->frame);
         if (frame >= 0)
             *framePtr = frame;
         }
     gl = gl->next;
     }
 return gl;
 }
 
 static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp, unsigned options)
 /* Assign frame from GFF/GTF after genePred has been built.*/
 {
 
 struct gffLine *gl = group->lineList;
 int start, end, i;
 int numCdsExons = 0;
 int numFrameExons = 0;
 for (i = 0; i < gp->exonCount; i++)
     {
     if (genePredCdsExon(gp, i, &start, &end))
         {
         numCdsExons += 1;
         gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
         if (gp->exonFrames[i] >= 0)
             numFrameExons += 1;
         }
     }
 
 /* Assign frames for exons without frames, either due to it incorrectly not
  * being on cases.  This also Handle the nasty corner caseof GTF with the all
  * or part of the stop codon as the only codon in an exon, we must set the
  * frame on these exons.  Some ENSEMBL GTFs have single base `exons' (due to
  * low quality assemblies) with just a base of the stop codon, so must check
  * every base.  While less than ideal, we just extend the frame past the last
  * exon with frame, so a conflicting frame on the stop codon will be missed.
  * Just switching to GFF3 fixes this mess.  Return the count of exons that
  * actually had frame set. */
 genePredFixExonFrames(gp);
 }
 
 static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name,
                                          boolean isGtf, char *exonSelectWord, unsigned optFields,
                                          unsigned options)
 /* common function to create genePreds from GFFs or GTFs.  This is a little
  * ugly with to many check of isGtf, however the was way to much identical
  * code the other way. Options are from genePredFromGxfOpts */
 {
 struct genePred *gp;
 long stopCodonStart = -1, stopCodonEnd = -1;
 long cdsStart = -1, cdsEnd = -1;
 int exonCount = 0;
 boolean haveStartCodon = FALSE, haveStopCodon = FALSE;
 struct gffLine *gl;
 unsigned *eStarts, *eEnds;
 int i;
 
 /* should we count on start/stop codon annotation in GFFs? */
 boolean useStartStops = isGtf || haveStartStopCodons(gff);
 
 int geneStart = 0, geneEnd = 0;
 
 /* Count up exons and figure out cdsStart and cdsEnd. */
 for (gl = group->lineList; gl != NULL; gl = gl->next)
     {
     boolean exonishLine = FALSE;
     if (ignoreGxfLine(gl, isGtf))
         continue;
     if (isExon(gl->feature, isGtf, exonSelectWord))
 	{
 	exonishLine = TRUE;
 	++exonCount;
 	}
     if (isCds(gl->feature))
         {
 	exonishLine = TRUE;
 	if ((cdsStart < 0) || (gl->start < cdsStart))
             cdsStart = gl->start;
 	if ((cdsEnd < 0) || (gl->end > cdsEnd))
             cdsEnd = gl->end;
 	}
     if (exonishLine)
         {
 	if (geneStart == geneEnd)  // Not initialized yet
 	     {
 	     geneStart = gl->start;
 	     geneEnd = gl->end;
 	     }
 	else
 	     {
 	     geneStart = min(gl->start, geneStart);
 	     geneEnd = max(gl->end, geneEnd);
 	     }
 	}
     if (sameWord(gl->feature, "start_codon"))
         haveStartCodon = TRUE;
     if (sameWord(gl->feature, "stop_codon"))
         {
         /* stop_codon can be split, need bounds for adjusting CDS below */
         if ((stopCodonStart < 0) || (gl->start < stopCodonStart))
             stopCodonStart = gl->start;
         if ((stopCodonEnd < 0) || (gl->end > stopCodonEnd))
             stopCodonEnd = gl->end;
         haveStopCodon = TRUE;
         }
     }
 if (exonCount == 0)
     return NULL;
 if (cdsStart > cdsEnd)
     {
     /* no cds annotated */
     cdsStart = 0;
     cdsEnd = 0;
     }
 else if (stopCodonStart >= 0)
     {
     /* adjust CDS to include stop codon as in GTF */
     if (group->strand == '+')
         {
         if (stopCodonEnd > cdsEnd)
             cdsEnd = stopCodonEnd;
         }
     else
         {
         if (stopCodonStart < cdsStart)
             cdsStart = stopCodonStart;
         }
     }
 
 /* add in version numbers if requested and available */
 char geneIdToUse[1024], transcriptIdToUse[1024];
 geneIdToUse[0]= '\0';
 if (options & genePredGxfGeneNameAsName2)
     {
     if (group->lineList->geneName != NULL)
         safecpy(geneIdToUse, sizeof(geneIdToUse), group->lineList->geneName);
     }
 else if (group->lineList->geneId != NULL)
     {
     if ((options & genePredGxfIncludeVersion) && (group->lineList->geneVersion != NULL))
         safef(geneIdToUse, sizeof(geneIdToUse), "%s.%s", group->lineList->geneId, group->lineList->geneVersion);
     else
         safecpy(geneIdToUse, sizeof(geneIdToUse), group->lineList->geneId);
     }
 if ((options & genePredGxfIncludeVersion) && (group->lineList->transcriptVersion != NULL))
     safef(transcriptIdToUse, sizeof(transcriptIdToUse), "%s.%s", name, group->lineList->transcriptVersion);
 else
     safecpy(transcriptIdToUse, sizeof(transcriptIdToUse), name);
 
 
 /* Allocate genePred and fill in values. */
 AllocVar(gp);
 gp->name = cloneString(transcriptIdToUse);
 gp->chrom = cloneString(group->seq);
 gp->strand[0] = group->strand;
 gp->txStart = geneStart;
 gp->txEnd = geneEnd;
 if (cdsStart < cdsEnd)
     {
     gp->cdsStart = cdsStart;
     gp->cdsEnd = cdsEnd;
     }
 else
     {
     // no CDS, set to txEnd
     gp->cdsStart = gp->txEnd;
     gp->cdsEnd = gp->txEnd;
     }
 gp->exonStarts = AllocArray(eStarts, exonCount);
 gp->exonEnds = AllocArray(eEnds, exonCount);
 gp->optFields = optFields;
 
 if (optFields & genePredName2Fld)
     gp->name2 = cloneString(geneIdToUse);
 if (optFields & genePredCdsStatFld)
     {
     if (cdsStart < cdsEnd)
         {
         if (!useStartStops)
             {
             /* GFF doesn't require start or stop, if not used, assume complete */
             gp->cdsStartStat = cdsComplete;
             gp->cdsEndStat = cdsComplete;
             }
         else if (group->strand == '+')
             {
             gp->cdsStartStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
             gp->cdsEndStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
             }
         else
             {
             gp->cdsEndStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
             gp->cdsStartStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
             }
         }
     else
         {
         gp->cdsStartStat = cdsNone;
         gp->cdsEndStat = cdsNone;
         }
     }
 if (optFields & genePredExonFramesFld)
     {
     AllocArray(gp->exonFrames, exonCount);
     for (i = 0; i < exonCount; i++)
         gp->exonFrames[i] = -1;
     }
 
 
 /* adjust tx range to include stop codon */
 if ((group->strand == '+') && (gp->txEnd == stopCodonStart))
      gp->txEnd = stopCodonEnd;
 else if ((group->strand == '-') && (gp->txStart == stopCodonEnd))
     gp->txStart = stopCodonStart;
 
 i = -1; /* before first exon */
 /* fill in exons, merging overlaping and adjacent exons */
 for (gl = group->lineList; gl != NULL; gl = gl->next)
     {
     if (ignoreGxfLine(gl, isGtf))
         continue;
     if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature)))
         {
         chkGroupLine(group, gl, gp);
         if ((i < 0) || (gl->start > eEnds[i]))
             {
             /* start a new exon */
             ++i;
             assert(i < exonCount);
             eStarts[i] = gl->start;
             eEnds[i] = gl->end;
             }
         else
             {
             /* overlap, extend exon, picking the largest of ends */
             assert(i < exonCount);
             assert(gl->start >= eStarts[i]);
             if (gl->end > eEnds[i])
                 eEnds[i] = gl->end;
             }
         }
     }
 gp->exonCount = i+1;
 
 /* adjust for flybase type of GFFs, with stop codon implied and outside of
  * CDS. */
 if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
     adjImpliedStopAfterCds(gp);
 
 if (optFields & genePredExonFramesFld)
     assignFrame(isGtf, group, gp, options);
 
 return gp;
 }
 
 struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name,
                                         char *exonSelectWord, unsigned optFields, unsigned options)
 /* Convert gff->groupList to genePred list. See genePred.h for details. */
 {
 struct gffLine *gl;
 boolean anyExon = FALSE;
 
 /* Look to see if any exons.  If not allow CDS to be used instead. */
 if (exonSelectWord)
     {
     for (gl = group->lineList; gl != NULL; gl = gl->next)
 	{
 	if (sameWord(gl->feature, exonSelectWord))
 	    {
 	    anyExon = TRUE;
 	    break;
 	    }
 	}
     }
 else
     anyExon = TRUE;
 if (!anyExon)
     exonSelectWord = "CDS";
 
 return mkFromGroupedGxf(gff, group, name, FALSE, exonSelectWord, optFields,
                         options);
 }
 
 struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name,
                                         unsigned optFields, unsigned options)
 /* Convert gff->groupList to genePred list, using GTF feature conventions.
  * See genePred.h for details. */
 {
 return mkFromGroupedGxf(gff, group, name, TRUE, NULL, optFields,
                         options);
 }
 
 static void mapCdsToGenome(struct psl *psl, struct genbankCds* cds,
                            struct genePred* gene)
 /* Convert set cdsStart/end from mrna to genomic coordinates. */
 {
 struct genbankCds genomeCds = genbankCdsToGenome(cds, psl);
 if (genomeCds.start < genomeCds.end)
     {
     gene->cdsStart = genomeCds.start;
     gene->cdsEnd = genomeCds.end;
     if (gene->optFields & genePredCdsStatFld)
         {
         if (gene->strand[0] == '+')
             {
             gene->cdsStartStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
             gene->cdsEndStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
             }
         else
             {
             gene->cdsStartStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
             gene->cdsEndStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
             }
         }
     }
 else
     {
     /* CDS not aligned */
     gene->cdsStart = gene->txEnd;
     gene->cdsEnd = gene->txEnd;
     if (gene->optFields & genePredCdsStatFld)
         {
         gene->cdsStartStat = cdsUnknown;
         gene->cdsEndStat = cdsUnknown;
         }
     }
 }
 
 void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, 
 	struct genePred *gene)
 /* Convert cdsStart/End from mrna to genomic coordinates. 
  * Note that the genePred blocks need not be filled in before
  * this call. */
 {
 if (cds == NULL)
     {
     /* no CDS, set to zero-length at end */
     gene->cdsStart = psl->tEnd;
     gene->cdsEnd = psl->tEnd;
     if (gene->optFields & genePredCdsStatFld)
         {
         gene->cdsStartStat = cdsNone;
         gene->cdsEndStat = cdsNone;
         }
     }
 else if ((cds->start < 0) || (cds->end <= 0))
     {
     /* unknown CDS, set to end */
     gene->cdsStart = psl->tEnd;
     gene->cdsEnd = psl->tEnd;
     if (gene->optFields & genePredCdsStatFld)
         {
         gene->cdsStartStat = cdsUnknown;
         gene->cdsEndStat = cdsUnknown;
         }
     }
 else 
     {
     /* have CDS annotation, make sure it's in bounds */
     struct genbankCds adjCds = *cds;
     if (adjCds.end > psl->qSize)
         {
         adjCds.end = psl->qSize;
         adjCds.endComplete = FALSE;
         }
     mapCdsToGenome(psl, &adjCds, gene);
     }
 }
 
 static int getFrame(struct psl *psl, int start, int end,
                     struct genbankCds* cds)
 /* get the starting frame for an exon of a mRNA.  start and end are
  * the bounds of the exon within the mRNA.  This may cover multiple psl
  * blocks due to merging of small gaps. */
 {
 /* use the 3' end is used if it's complete, as it is more often accurate when
  * genes are defined from mRNAs sequenced with reverse-transcriptase. */
 int frame = -1;
 /* map to mRNA coords in CDS since frame for an exon is in direction of
  * transcription. */
 if (psl->strand[0] == '-')
     reverseIntRange(&start, &end, psl->qSize);
 if (start < cds->start)
     start = cds->start;
 if (end > cds->end)
     end = cds->end;
 
 if (start < end)
     {
     /* Compute frame from end of RNA if CDS end is marked complete and start
      * is not complete.  This is done as the end of an RNA is more likely
      * completely due to reverse transcriptase not replicating the entire RNA.
      * However, code that create CDS from genePreds doesn't always create a
      * CDS specification that indicates incompleteness. So don't use CDS end
      * unless we know start is incomplete, mean code tried to set it.  This is
      * not a perfect solution, as handling of CDS specification is naive and
      * doesn't account for truncated start or stop.  Incomplete codons can
      * result in frame shift even is CDS completeness is set correctly. */
     if (cds->endComplete && !cds->startComplete)
         {
         int fr = (cds->end-start) % 3;
         frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0);
         }
     else
         frame = (start-cds->start) % 3;
     }
 return frame;
 }
 
 static boolean shouldMergeBlocks(struct genePred *gene, 
                                  unsigned tStart, unsigned prevTEnd,
                                  unsigned qStart, unsigned prevQEnd,
                                  unsigned options,
                                  int cdsMergeSize, int utrMergeSize)
 /* determine if a new block starting at tStart whould be merged with
  * the preceeding exon.
  */
 {
 boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd);
 int tGapSize = (tStart - prevTEnd);
 int qGapSize = (qStart - prevQEnd);
 if (inCds)
     {
     if ((options & genePredPslCdsMod3)
         && (((tGapSize % 3) != 0) || ((qGapSize % 3) != 0)))
         return FALSE;  /* not a multiple of three */
     if ((cdsMergeSize >= 0) && ((tGapSize <= cdsMergeSize) && (qGapSize <= cdsMergeSize)))
         return TRUE;
     }
 else 
     {
     // qGapSize only matters for CDS where we are trying to keep frame sane
     if ((utrMergeSize >= 0) && (tGapSize <= utrMergeSize))
         return TRUE;
     }
 return FALSE; /* don't merge */
 }
 
 static void pslToExons(struct psl *psl, struct genePred *gene,
                        struct genbankCds* cds, unsigned options,
                        int cdsMergeSize, int utrMergeSize)
 /* Convert psl alignment blocks to genePred exons, merging together blocks by
  * the specified paraemeters.  Optionally add frame information. */
 {
 int iBlk, iExon;
 int startIdx, stopIdx, idxIncr;
 
 /* this is bigger than needed if blocks merge */
 gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned));
 gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned));
 if (gene->optFields & genePredExonFramesFld)
     {
     /* default frames to indicate no frame */
     gene->exonFrames = needMem(psl->blockCount*sizeof(unsigned));
     for (iExon = 0; iExon < psl->blockCount; iExon++)
         gene->exonFrames[iExon] = -1;
     }
 
 /* traverse psl in postive target order */
 if (psl->strand[1] == '-')
     {
     startIdx = psl->blockCount-1;
     stopIdx = -1;
     idxIncr = -1;
     }
 else
     {
     startIdx = 0;
     stopIdx = psl->blockCount;
     idxIncr = 1;
     }
 
 iExon = -1;  /* indicate none have been added */
 unsigned prevQEnd = 0;
 unsigned prevTEnd = 0;
 for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr)
     {
     int qStart = psl->qStarts[iBlk];
     int qEnd = qStart + psl->blockSizes[iBlk];
     if (psl->strand[0] == '-')
         reverseIntRange(&qStart, &qEnd, psl->qSize);
     int tStart = psl->tStarts[iBlk];
     int tEnd = tStart + psl->blockSizes[iBlk];
     if (psl->strand[1] == '-')
         reverseIntRange(&tStart, &tEnd, psl->tSize);
     if ((iExon < 0) || !shouldMergeBlocks(gene, tStart, prevTEnd, qStart, prevQEnd, options,
                                           cdsMergeSize, utrMergeSize))
         {
         /* new exon */
         iExon++;
         gene->exonStarts[iExon] = tStart;
 	}
     gene->exonEnds[iExon] = tEnd;
     /* Set the frame. We have to deal with multiple blocks being merged.  For
      * positive strand, the first block frame is used, for negative strand,
      * the last is used. */
     if ((gene->optFields & genePredExonFramesFld) && (cds != NULL)
         && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0))
             || (psl->strand[0] == '-')))
 	{
         int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds);
         if (fr >= 0)
 	    gene->exonFrames[iExon] = fr;
 	}
     prevTEnd = tEnd;
     prevQEnd = qEnd;
     }
 gene->exonCount = iExon+1;
 assert(gene->exonCount <= psl->blockCount);
 }
 
 struct genePred *genePredFromPsl3(struct psl *psl,  struct genbankCds* cds, 
                                   unsigned optFields, unsigned options,
                                   int cdsMergeSize, int utrMergeSize)
 /* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS
  * specification string to genomic coordinates. Small genomic inserts are
  * merged based on the mergeSize parameters.  Gaps no larger than the
  * specified merge sizes result in the adjacent blocks being merged into a
  * single exon.  Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize.  If
  * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged
  * if a multiple of three.  A negative merge sizes disables merging of blocks.
  * This differs from specifying zero in that adjacent blocks will not be
  * merged. The optfields field is a set from genePredFields, indicated what
  * fields to create.  Zero-length CDS, or null cds, creates without CDS
  * annotation.  If cds is null, it will set status fields to cdsNone.  */
 {
 struct genePred *gene;
 #if 0
 /* FIXME; the browser calls this on a protein psl when rendering the amino
  * acids of a protein alignment track.  Some of this code doesn't really make
  * sense for proteins and I really don't see how this produces the right
  * results, however the check is disabled until this can be investigated
  * because it *seems* to work.  markd 2006-05-22 */
 if (pslIsProtein(psl))
     errAbort("can't convert protein psls to genePreds");
 #endif
 
 AllocVar(gene);
 gene->name = cloneString(psl->qName);
 gene->chrom = cloneString(psl->tName);
 gene->txStart = psl->tStart;
 gene->txEnd = psl->tEnd;
 gene->optFields = optFields;
 
 /* get strand in genome that the positive version mRNA aligns to */
 if (psl->strand[1] == '\0')
     {
     /* assumed pos target strand, so neg query would be pos target */
     gene->strand[0] = psl->strand[0];
     }
 else 
     {
     /* query and target strand are different; will be neg when query pos */
     gene->strand[0] = ((psl->strand[0] != psl->strand[1]) ? '-' : '+');
     }
 
 genePredAddGenbankCds(psl, cds, gene);
 pslToExons(psl, gene, cds, options, cdsMergeSize, utrMergeSize);
 return gene;
 }
 
 struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields,
                                   struct genbankCds* cds, int insertMergeSize)
 /* Compatibility function, genePredFromPsl3 is prefered.  See that function's
  * documentation for details. This calls genePredFromPsl3 with no options
  * and insertMergeSize set for CDS and UTR.
  */
 {
 return genePredFromPsl3(psl, cds, optFields, genePredPslDefaults, insertMergeSize, insertMergeSize);
 }
 
 struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd,
                                  int insertMergeSize)
 /* Compatibility function, genePredFromPsl3 is prefered.  See that function's
  * documentation for details. This calls genePredFromPsl3 with no options.
  */
 {
 struct genbankCds cds;
 ZeroVar(&cds);
 cds.start = cdsStart;
 cds.end = cdsEnd;
 return genePredFromPsl3(psl, &cds, 0, genePredPslDefaults, insertMergeSize, insertMergeSize);
 }
 
 char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options,
                            int chromIndexLen)
 /* Get SQL required to create a genePred table. optFields is a bit set
  * consisting of the genePredFields values. Options are a bit set of
  * genePredCreateOpts. Returned string should be freed.  This will create all
  * optional fields that preceed the highest optFields column.  chromIndexLen
  * is now ignored.. */
 {
 /* the >= is used so that we create preceeding fields. */
 struct dyString *dy = sqlDyStringCreate(
 "CREATE TABLE %s (", table);
 
 /* bin column goes here */
 if (options & genePredWithBin)
     sqlDyStringPrintf(dy,
 "    bin smallint unsigned not null,"
 "    INDEX(chrom,bin),");
 else
     sqlDyStringPrintf(dy,
 "   INDEX(chrom,txStart),");
 
 sqlDyStringPrintf(dy,
 "   name varchar(255) not null,"	/* mrna accession of gene */
 "   chrom varchar(255) not null,"	/* Chromosome name */
 "   strand char(1) not null,"		/* + or - for strand */
 "   txStart int unsigned not null,"	/* Transcription start position */
 "   txEnd int unsigned not null,"	/* Transcription end position */
 "   cdsStart int unsigned not null,"	/* Coding region start */
 "   cdsEnd int unsigned not null,"	/* Coding region end */
 "   exonCount int unsigned not null,"	/* Number of exons */
 "   exonStarts longblob not null,"	/* Exon start positions */
 "   exonEnds longblob not null,"	/* Exon end positions */
 "   INDEX(name)"
 );
 
 if (optFields >= genePredScoreFld)
     sqlDyStringPrintf(dy,
 "   ,score int");
 
 if (optFields >= genePredName2Fld)
     sqlDyStringPrintf(dy,
 "  ,name2 varchar(255) not null,"    /* Secondary name. (e.g. name of gene) or NULL if not available */
 "   INDEX(name2)");
 
 if (optFields >= genePredCdsStatFld)
     sqlDyStringPrintf(dy,
 "  ,cdsStartStat enum('none', 'unk', 'incmpl', 'cmpl') not null,"    /* Status of cdsStart annotation */
 "   cdsEndStat enum('none', 'unk', 'incmpl', 'cmpl') not null");     /* Status of cdsEnd annotation */
 
 if (optFields >= genePredExonFramesFld)
     sqlDyStringPrintf(dy,
 "   ,exonFrames longblob not null");    /* List of frame for each exon, or -1 if no frame or not known. NULL if not available. */
 
 sqlDyStringPrintf(dy, ")");
 
 return cloneString(dyStringCannibalize(&dy));
 }
 
 // FIXME: this really doesn't belong in this module
 struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap)
 {
 /* read all genes from a table find the gene with the biggest overlap. 
    Cache the list of genes to so we only read it once */
 
 char query[256];
 struct genePred *gene;
 struct sqlConnection *conn;
 struct sqlResult *sr;
 char **row;
 struct genePred *el = NULL, *bestMatch = NULL, *gp = NULL;
 int overlap = 0 , bestOverlap = 0, i;
 struct psl *psl;
 int *eFrames;
 
 
 if (*list == NULL)
     {
     printf("Loading Predictions from %s\n",table);
     AllocVar(*list);
     conn = hAllocConn(db);
     AllocVar(gene);
     sqlSafef(query, sizeof(query), "select * from %s", table);
     sr = sqlGetResult(conn, query);
     while ((row = sqlNextRow(sr)) != NULL)
         {
         if (!sameString(table,"all_mrna"))
             {
             el = genePredLoad(row);
             }
         else
             {
             psl = pslLoad(row);
             el = genePredFromPsl2(psl, 0, NULL, genePredStdInsertMergeSize);
             }
         slAddHead(list, el);
         }
     slReverse(list);
     sqlFreeResult(&sr);
     hFreeConn(&conn);
     }
 for (el = *list; el != NULL; el = el->next)
     {
     if (chrom != NULL && el->chrom != NULL)
         {
         overlap = 0;
         if ( sameString(chrom, el->chrom))
             {
             for (i = 0 ; i<(el->exonCount); i++)
                 {
                 overlap += positiveRangeIntersection(cStart,cEnd, el->exonStarts[i], el->exonEnds[i]) ;
                 }
             if (overlap > 20 && sameString(name, el->name))
                 {
                 bestMatch = el;
                 bestOverlap = overlap;
                 *retOverlap = bestOverlap;
                 }
             if (overlap > bestOverlap)
                 {
                 bestMatch = el;
                 bestOverlap = overlap;
                 *retOverlap = bestOverlap;
                 }
             }
         }
     }
 if (bestMatch != NULL)
     {
     /* Allocate genePred and fill in values. */
     AllocVar(gp);
     gp->name = cloneString(bestMatch->name);
     gp->chrom = cloneString(bestMatch->chrom);
     gp->strand[1] = bestMatch->strand[1];
     gp->strand[0] = bestMatch->strand[0];
     gp->txStart = bestMatch->txStart;
     gp->txEnd = bestMatch->txEnd;
     gp->cdsStart = bestMatch->cdsStart;
     gp->cdsEnd = bestMatch->cdsEnd;
     gp->exonCount = bestMatch->exonCount;
     AllocArray(gp->exonStarts, bestMatch->exonCount);
     AllocArray(gp->exonEnds, bestMatch->exonCount);
     for (i=0; i<bestMatch->exonCount; ++i)
         {
         gp->exonStarts[i] = bestMatch->exonStarts[i] ;
         gp->exonEnds[i] = bestMatch->exonEnds[i] ;
         }
     gp->optFields = bestMatch->optFields;
     gp->score = bestMatch->score;
 
     if (bestMatch->optFields & genePredName2Fld)
         gp->name2 = cloneString(bestMatch->name2);
     else
         gp->name2 = NULL;
     if (bestMatch->optFields & genePredCdsStatFld)
         {
         gp->cdsStartStat = bestMatch->cdsStartStat;
         gp->cdsEndStat = bestMatch->cdsEndStat;
         }
     if (bestMatch->optFields & genePredExonFramesFld)
         {
         gp->exonFrames = AllocArray(eFrames, bestMatch->exonCount);
         for (i = 0; i < bestMatch->exonCount; i++)
             gp->exonFrames[i] = bestMatch->exonFrames[i];
         }
     eFrames = gp->exonFrames;
     }
 
 return gp;
 }
 int genePredBases(struct genePred *gp)
 /* count coding and utr bases in a gene prediction */
 {
 int count = 0, i;
 
 if (gp == NULL) return 0;
 
 for (i=0; i<gp->exonCount; i++)
     {
     count += gp->exonEnds[i] - gp->exonStarts[i] ;
     }
 return count;
 }
 
 int genePredCodingBases(struct genePred *gp)
 /* Count up the number of coding bases in gene prediction. */
 {
 int i, exonCount = gp->exonCount;
 int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd;
 int baseCount = 0;
 for (i=0; i<exonCount; ++i)
     {
     baseCount += positiveRangeIntersection(cdsStart,cdsEnd,
     	gp->exonStarts[i], gp->exonEnds[i]);
     }
 return baseCount;
 }
 
 boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr)
 /* Get the CDS range in an exon.  If there is no CDS, return FALSE and then
  * set start == end */
 {
 assert((iExon >= 0) && (iExon < gp->exonCount));
 int start = gp->exonStarts[iExon];
 int end = gp->exonEnds[iExon];
 
 if (start < gp->cdsStart)
     start = gp->cdsStart;
 if (end > gp->cdsEnd)
     end = gp->cdsEnd;
 if (startPtr != NULL)
     *startPtr = start;
 if (endPtr != NULL)
     *endPtr = end;
 return (start < end);
 }
 
 static void gpError(FILE* errFh, int* errorCnt, char *format, ...)
 /* print and count an error */
 {
 va_list args;
 fprintf(errFh, "Error: ");
 va_start(args, format);
 vfprintf(errFh, format, args);
 va_end(args);
 fputc('\n', errFh);
 (*errorCnt)++;
 }
 
 static void checkExon(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp, int iExon)
 /* check one exon of a genePred */
 {
 unsigned exonStart = gp->exonStarts[iExon];
 unsigned exonEnd = gp->exonEnds[iExon];
 if (exonStart >= exonEnd)
     gpError(errFh, errorCnt, "%s: %s exon %u start %u >= end %u", desc, gp->name, iExon, exonStart, exonEnd);
 if (exonStart < gp->txStart)
     gpError(errFh, errorCnt, "%s: %s exon %u start %u < txStart %u", desc, gp->name, iExon, exonStart, gp->txStart);
 if (exonEnd > gp->txEnd)
     gpError(errFh, errorCnt, "%s: %s exon %u end %u > txEnd %u", desc, gp->name, iExon, exonEnd, gp->txEnd);
 if (iExon > 0)
     {
     /* other than first exon */
     if (exonStart < gp->exonEnds[iExon-1])
         gpError(errFh, errorCnt, "%s: %s exon %u (%s:%d-%d) overlaps previous exon (%s:%d-%d)",
                 desc, gp->name, iExon, gp->chrom, exonStart, exonEnd, gp->chrom, gp->exonStarts[iExon-1], gp->exonEnds[iExon-1]);
     }
 
 if (gp->optFields & genePredExonFramesFld)
     {
     int frame = gp->exonFrames[iExon];
     if ((frame < -1) || (frame > 2))
         gpError(errFh, errorCnt, "%s: %s invalid exonFrame: %d", desc, gp->name, frame);
     if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd))
         {
         if (frame == -1)
             gpError(errFh, errorCnt, "%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon);
         }
     else
         {
         if (frame != -1)
             gpError(errFh, errorCnt, "%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon);
         }
     }
 }
 
 static void checkCdsBounds(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp)
 /* check cdsStart/cdsEnd */
 {
 int iExon;
 boolean foundCdsStart = FALSE, foundCdsEnd = FALSE;
 if (gp->cdsStart > gp->cdsEnd)
     gpError(errFh, errorCnt, "%s: %s cdsStart %u > cdsEnd %u", desc, gp->name, gp->cdsStart, gp->cdsEnd);
 if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd))
     gpError(errFh, errorCnt, "%s: %s cdsStart %u not in tx bounds %u-%u", desc, gp->name, gp->cdsStart, gp->txStart, gp->txEnd);
 if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd))
     gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in tx bounds %u-%u", desc, gp->name, gp->cdsEnd, gp->txStart, gp->txEnd);
 /*  is cdsStart/cdsEnd in an exon?  */
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     {
     if ((gp->cdsStart >= gp->exonStarts[iExon])
         && (gp->cdsStart < gp->exonEnds[iExon]))
         foundCdsStart = TRUE;
     if ((gp->cdsEnd > gp->exonStarts[iExon])
         && (gp->cdsEnd <= gp->exonEnds[iExon]))
         foundCdsEnd = TRUE;
     }
 if (!foundCdsStart)
     gpError(errFh, errorCnt, "%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart);
 if (!foundCdsEnd)
     gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd);
 }
 
 int genePredCheck(char *desc, FILE* errFh, int chromSize, struct genePred* gp)
 /* Validate a genePred for consistency.  desc is printed the error messages
  * to file errFh (open /dev/null to discard).  chromSize should contain
  * size of chromosome, or 0 if chrom is not valid, or -1 to not check
  * chromosome bounds. Returns count of errors. */
 {
 int iExon;
 int errorCnt = 0;
 
 if (gp->name == NULL)
     gpError(errFh, &errorCnt, "%s: name is null", desc);
 
 if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-")))
     gpError(errFh, &errorCnt, "%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand);
 
 /* check chrom */
 if (chromSize == 0)
     gpError(errFh, &errorCnt, "%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom);
 else if (chromSize > 0)
     {
     if (gp->txEnd > chromSize)
         gpError(errFh, &errorCnt, "%s: %s txEnd %u >= chromSize %u", desc, gp->name, gp->txEnd, chromSize);
     }
 
 /* check internal consistency */
 if (gp->txStart >= gp->txEnd)
     gpError(errFh, &errorCnt, "%s: %s txStart %u >= txEnd %u", desc, gp->name, gp->txStart, gp->txEnd);
 
 /* no CDS is indicated by cdsStart == cdsEnd */
 if (gp->cdsStart != gp->cdsEnd)
     checkCdsBounds(errFh, &errorCnt, desc, gp);
 
 /* must be at least one exon */
 if (gp->exonCount == 0)
     gpError(errFh, &errorCnt, "%s: %s contains no exons", desc, gp->name);
 else
     {
 /* make sure first/last exons match tx range */
     if (gp->txStart != gp->exonStarts[0])
         gpError(errFh, &errorCnt, "%s: %s first exon start %u doesn't match txStart %u", desc, gp->name, gp->exonStarts[0], gp->txStart);
     if (gp->txEnd != gp->exonEnds[gp->exonCount-1])
         gpError(errFh, &errorCnt, "%s: %s last exon end %u doesn't match txEnd %u", desc, gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd);
     }
 
 /* check each exon */
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     checkExon(errFh, &errorCnt, desc, gp, iExon);
 
 return errorCnt;
 }
 
 static int lookupChromSize(char *desc, FILE* errFh, char* db, struct genePred* gp, int *errorCnt)
 /* lookup the chromosome size in the database, -1 if invalid */
 {
 // hGetChromInfo is case independent
 struct chromInfo *ci = hGetChromInfo(db, gp->chrom);
 if (ci == NULL)
     {
     fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom);
     (*errorCnt)++;
     return -1;  // don't validate
     }
 else if (differentString(gp->chrom, ci->chrom)) // verify case dependent equal
     {
     fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom);
     (*errorCnt)++;
     return -1;  // don't validate
     }
 else
     return ci->size;
 }
 
 int genePredCheckDb(char *desc, FILE* errFh, char* db, struct genePred* gp)
 /* Validate a genePred for consistency.  desc is printed the error messages
  * to file errFh (open /dev/null to discard).  Lookup chromosome size in database if
  * db is not NULL. Returns count of errors. */
 {
 int errorCnt = 0;
 int chromSize = -1;  /* default to not checking */
 if (db != NULL)
     chromSize = lookupChromSize(desc, errFh, db, gp, &errorCnt);
 errorCnt += genePredCheck(desc, errFh, chromSize, gp);
 return errorCnt;
 }
 
 int genePredCheckChromSizes(char *desc, FILE* errFh, struct genePred* gp,
                             struct hash* chromSizes)
 /* Validate a genePred for consistency.  desc is printed the error messages
  * to file errFh (open /dev/null to discard).  Lookup chromosome size in hash.
  */
 {
 int errorCnt = 0;
 int chromSize = hashIntValDefault(chromSizes, gp->chrom, -1);
 if (chromSize < 0)
     {
     fprintf(errFh, "Error: %s: %s has invalid chrom for: %s\n", desc, gp->name, gp->chrom);
     errorCnt++;
     }
 else
     errorCnt += genePredCheck(desc, errFh, chromSize, gp);
 return errorCnt;
 }
 
 boolean genePredNmdTarget(struct genePred *gp) 
 /* Return TRUE if cds end is more than 50bp upstream of
    last intron. */
 {
 int gpSize = 0;
 int i = 0;
 int startDist = 0, endDist = 0;
 int blockSize = 0;
 if(gp->exonCount < 2 || gp->cdsStart == gp->cdsEnd)
     return FALSE;
 for(i = 0; i < gp->exonCount; i++)
     {
     int blockStart = gp->exonStarts[i];
     int blockEnd = gp->exonEnds[i];
     blockSize = blockEnd - blockStart;
     if( blockStart <= gp->cdsStart && gp->cdsStart <= blockEnd)
 	startDist += blockSize - (blockEnd - gp->cdsStart);
     else if(blockEnd <= gp->cdsStart)
 	startDist += blockSize;
     if( blockStart <= gp->cdsEnd && gp->cdsEnd <= blockEnd)
 	endDist += blockSize - (blockEnd - gp->cdsEnd);
     else if(blockEnd <= gp->cdsEnd)
 	endDist += blockSize;
     gpSize += blockSize;
     }
 /*  xxxxxXXX----XXXXXXXXXXXXX--------XXXXXXX----XXXxxxxxxxx-------xxxxxxxxx  */                                           
 if(sameString(gp->strand, "+"))
     {
     blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1];
     return endDist < (gpSize - blockSize - 50); 
     }
 else if(sameString(gp->strand, "-"))
     {
     blockSize = gp->exonEnds[0] - gp->exonStarts[0];
     return startDist > (blockSize + 50);
     }
 else 
     errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand);
 return FALSE;
 }
 
 void genePredAddExonFrames(struct genePred *gp)
 /* Add exonFrames array to a genePred that doesn't have it. Frame is assumed
  * to be contiguous.  NOTE: suggest using genePredFixExonFrames for new code. */
 {
 int iExon, start, end, iBase = 0;
 int iStart, iEnd, iDir;
 
 /* initial array to all -1, for no frame */
 AllocArray(gp->exonFrames, gp->exonCount);
 gp->optFields |= genePredExonFramesFld;
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     gp->exonFrames[iExon] = -1;
 
 if (sameString(gp->strand, "+"))
     {
     iStart = 0;
     iEnd = gp->exonCount;
     iDir = 1;
     }
 else
     {
     iStart = gp->exonCount-1;
     iEnd = -1;
     iDir = -1;
     }
 for (iExon = iStart; iExon != iEnd; iExon += iDir)
     {
     if (genePredCdsExon(gp, iExon, &start, &end))
         {
         gp->exonFrames[iExon] = iBase % 3;
         iBase += (end - start);
         }
     }
 }
 
 void genePredFixExonFrames(struct genePred *gp)
 /* Add exonFrames array to a genePred that has frame on only some or no
  * features. Frame is assumed to be contiguous when an existing frame is not
  * present. */
 {
 int iStart, iEnd, iDir;
 
 if (gp->exonFrames == NULL)
     {
     /* doesn't currently have frames */
     AllocArray(gp->exonFrames, gp->exonCount);
     gp->optFields |= genePredExonFramesFld;
     for (int iExon = 0; iExon < gp->exonCount; iExon++)
         gp->exonFrames[iExon] = -1;
     }
 
 if (sameString(gp->strand, "+"))
     {
     iStart = 0;
     iEnd = gp->exonCount;
     iDir = 1;
     }
 else
     {
     iStart = gp->exonCount - 1;
     iEnd = -1;
     iDir = -1;
     }
 int nextFrame = -1;
 for (int iExon = iStart; iExon != iEnd; iExon += iDir)
     {
     int start, end;
     if (genePredCdsExon(gp, iExon, &start, &end))
         {
         if (gp->exonFrames[iExon] < 0)
             {
             // no existing frame
             if (nextFrame < 0)
                 nextFrame = 0;  // first frame
             gp->exonFrames[iExon] = nextFrame;
             }
         nextFrame = incrFrame(gp->exonFrames[iExon], end - start);
         }
     else
         {
         gp->exonFrames[iExon] = -1;
         }
     }
 }
 
 void genePredRc(struct genePred *gp, int chromSize)
 /* Reverse complement a genePred (project it to the opposite strand).  Useful
  * when doing analysis that is simplified by having things on the same strand.
  */
 {
 int iExon;
 gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+';
 reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize);
 reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize);
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize);
 reverseUnsigned(gp->exonStarts, gp->exonCount);
 reverseUnsigned(gp->exonEnds, gp->exonCount);
 if (gp->optFields & genePredCdsStatFld)
     {
     enum cdsStatus hold =  gp->cdsStartStat;
     gp->cdsStartStat = gp->cdsEndStat;
     gp->cdsEndStat = hold;
     }
 if (gp->optFields & genePredExonFramesFld)
     reverseInts(gp->exonFrames, gp->exonCount);
 }
 
 struct genePred *genePredNew(char *name, char *chrom, char strand,
                              unsigned txStart, unsigned txEnd,
                              unsigned cdsStart, unsigned cdsEnd,
                              unsigned optFields, unsigned exonSpace)
 /* create a new gene with space for the specified number of exons allocated.
  * genePredGrow maybe used to expand this space if needed. */
 {
 struct genePred *gp;
 AllocVar(gp);
 assert(exonSpace > 0);
 
 gp->name = cloneString(name);
 gp->chrom  = cloneString(chrom);
 gp->strand[0] = strand;
 gp->txStart = txStart;
 gp->txEnd = txEnd;
 gp->cdsStart  = cdsStart;
 gp->cdsEnd = cdsEnd;
 gp->optFields = optFields;
 AllocArray(gp->exonStarts, exonSpace);
 AllocArray(gp->exonEnds, exonSpace);
 if (gp->optFields & genePredExonFramesFld)
     AllocArray(gp->exonFrames, exonSpace);
 return gp;
 }
 
 void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr)
 /* Increase memory allocated to a psl to hold more exons.  exonSpacePtr
  * should point the the current maximum number of exons and will be
  * updated to with the new amount of space. */
 {
 int exonSpace = *exonSpacePtr;
 int newSpace = 2 * exonSpace;
 ExpandArray(gp->exonStarts, exonSpace, newSpace);
 ExpandArray(gp->exonEnds, exonSpace, newSpace);
 if (gp->optFields & genePredExonFramesFld)
     ExpandArray(gp->exonFrames, exonSpace, newSpace);
 *exonSpacePtr = newSpace;
 }
 
 struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly)
 /* Convert genePred into a range tree. */
 {
 struct rbTree *rangeTree = rangeTreeNew();
 int i;
 for (i=0; i<gp->exonCount; ++i)
     {
     int start = gp->exonStarts[i];
     int end = gp->exonEnds[i];
     if (cdsOnly)
         {
 	start = max(start, gp->cdsStart);
 	end = min(end, gp->cdsEnd);
 	if (start >= end)
 	    continue;
 	}
     rangeTreeAdd(rangeTree, start, end);
     }
 return rangeTree;
 }
 
 void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, 
 	char *type, int id, int minSize)
 /* Write out part of gp as bed12. */
 {
 /* Figure out # of blocks and min/max of area inside start/end */
 int blockCount = 0;
 int newStart = gp->txEnd, newEnd = gp->txStart;
 int size = 0;
 int i;
 for (i=0; i<gp->exonCount; ++i)
     {
     int exonStart = gp->exonStarts[i];
     int exonEnd = gp->exonEnds[i];
     exonStart = max(start, exonStart);
     exonEnd = min(end, exonEnd);
     if (exonStart < exonEnd)
         {
 	++blockCount;
 	newStart = min(exonStart, newStart);
 	newEnd = max(exonEnd, newEnd);
 	size += exonEnd - exonStart;
 	}
     }
 
 /* Output first 10 fields of bed12. */
 if (size > minSize)
     {
     fprintf(f, "%s\t%d\t%d\t", gp->chrom, newStart, newEnd);
     fprintf(f, "%s_%d_%s\t", type, id, gp->name);
     fprintf(f, "0\t%s\t0\t0\t0\t%d\t", gp->strand, blockCount);
 
     /* Output blockSizes field */
     for (i=0; i<gp->exonCount; ++i)
 	{
 	int exonStart = gp->exonStarts[i];
 	int exonEnd = gp->exonEnds[i];
 	exonStart = max(start, exonStart);
 	exonEnd = min(end, exonEnd);
 	if (exonStart < exonEnd)
 	    fprintf(f, "%d,", exonEnd - exonStart);
 	}
     fprintf(f, "\t");
 
     /* Output chromStarts field */
     for (i=0; i<gp->exonCount; ++i)
 	{
 	int exonStart = gp->exonStarts[i];
 	int exonEnd = gp->exonEnds[i];
 	exonStart = max(start, exonStart);
 	exonEnd = min(end, exonEnd);
 	if (exonStart < exonEnd)
 	    fprintf(f, "%d,", exonStart - newStart);
 	}
     fprintf(f, "\n");
     }
 }
 
 boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd)
 {
 // map 1-based codon to genomic coordinates. If the codon crosses an exon junction, we return just the beginning (LHS) of the codon.
 // Returns true if we find the codon in given gene predition; chromStart and end are set to appropriate three base region.
 
 int pos = -1;
 int i;
 int offset = -1;  // current 1-based offset in bases (not codons)
 if(gp->strand[0] == '+')
     {
     for(i = 0; i < gp->exonCount; i++)
         {
         if(gp->exonEnds[i] > gp->cdsStart && gp->exonStarts[i] < gp->cdsEnd)
             {
             int start, end;
             if(offset == -1 && gp->cdsStart <= gp->exonEnds[i])
                 {
                 // start counting
                 start = gp->cdsStart;
                 offset = 1;
                 }
             else
                 start = gp->exonStarts[i];
             if(gp->cdsEnd < gp->exonEnds[i])
                 end = gp->cdsEnd;
             else
                 end = gp->exonEnds[i];
             int next = offset + end - start;
             if(next > (num * 3 - 2))
                 {
                 pos = start + (((num - 1) * 3 + 1) - offset);
                 break;
                 }
             else
                 offset = next;
             }
         }
     }
 else
     {
     for(i = gp->exonCount - 1; i >= 0; i--)
         {
         if(gp->exonStarts[i] < gp->cdsEnd && gp->exonEnds[i] > gp->cdsStart)
             {
             int start, end; // start here is really the RHS, and end is the LHS
             if(offset == -1 && gp->cdsEnd >= gp->exonStarts[i])
                 {
                 // start counting
                 start = gp->cdsEnd;
                 offset = 1;
                 }
             else
                 start = gp->exonEnds[i];
             if(gp->cdsStart > gp->exonStarts[i])
                 end = gp->cdsStart;
             else
                 end = gp->exonStarts[i];
             int next = offset + start - end;
             if(next > num * 3)
                 {
                 pos = start - (num*3 - offset) - 1;
                 break;
                 }
             else
                 offset = next;
             }
         }
     }
 if(pos == -1)
     return FALSE;
 else
     {
     *chromStart = pos;
     *chromEnd = pos + 3;
     return TRUE;
     }
 }
 
 boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd)
 {
 // map 1-based exon number to genomic coordinates.
 // Returns true if we find the exon in given gene predition; start and end are set to appropriate region.
 
 if(num == 0 || num > gp->exonCount)
     return FALSE;
 else if(gp->strand[0] == '+')
     {
     *chromStart = gp->exonStarts[num - 1];
     *chromEnd = gp->exonEnds[num - 1];
     }
 else
     {
     *chromStart = gp->exonStarts[gp->exonCount - num];
     *chromEnd = gp->exonEnds[gp->exonCount - num];
     }
 return TRUE;
 }
 
 static char *genePredAutoSqlString =
     "table genePred\n"
     "\"A gene prediction.\"\n"
     "    (\n"
     "    string name;	\"Name of gene\"\n"
     "    string chrom;	\"Reference sequence chromosome or scaffold\"\n"
     "    char[1] strand;     \"+ or - for strand\"\n"
     "    uint txStart;	\"Transcription start position\"\n"
     "    uint txEnd;         \"Transcription end position\"\n"
     "    uint cdsStart;	\"Coding region start\"\n"
     "    uint cdsEnd;        \"Coding region end\"\n"
     "    uint exonCount;     \"Number of exons\"\n"
     "    uint[exonCount] exonStarts; \"Exon start positions\"\n"
     "    uint[exonCount] exonEnds;   \"Exon end positions\"\n"
     "    )\n";
 
 struct asObject *genePredAsObj()
 // Return asObject describing fields of genePred
 {
 return asParseText(genePredAutoSqlString);
 }
 
 struct dnaSeq *genePredGetDna(char *database, struct genePred *gp,
                               boolean coding, enum dnaCase dnaCase)
 // Returns the DNA sequence associated with gene prediction.
 // Negative strand genes will return the sequence as read from the negative strand.
 // Optionally restrict to coding sequence only
 {
 struct dnaSeq *dnaSeq = hDnaFromSeq(database, gp->chrom, gp->txStart, gp->txEnd,  dnaCase);
 if (dnaSeq == NULL)
     return NULL;
 DNA *seq = dnaSeq->dna;
 int len = dnaSeq->size;
 if (coding)
     {
     int cdnaOffset = 0, exonIx;
     for (exonIx = 0; exonIx < gp->exonCount; exonIx++)
         {
         int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart);
         int exonEnd   = min(gp->exonEnds[  exonIx],gp->cdsEnd  );
         if (exonStart >= exonEnd) // non-coding exon
             continue;
         exonStart -= gp->txStart;
         exonEnd   -= gp->txStart;
         int exonSize  = exonEnd - exonStart;
 
         assert(cdnaOffset <= exonStart && exonEnd <= len);
         if (cdnaOffset < exonStart)
             memmove(seq + cdnaOffset, seq + exonStart, exonSize);
         cdnaOffset += exonSize;
         }
     seq[cdnaOffset] = '\0';
     dnaSeq->size = cdnaOffset;
     }
 
 if (gp->strand[0] == '-')
     reverseComplement(seq,dnaSeq->size);
 
 return dnaSeq;
 }
 
 int genePredBaseToCodingPos(struct genePred *gp, int basePos,
                             boolean stranded, boolean *isCoding)
 // Given a genePred model and a single (0 based) base position, predict the 0-based
 // DNA (stranded) coding sequence pos.  Dividing this number by 3 should give the AA position!
 // Returns -1 when outside of coding exons unless OPTIONAL isCoding pointer to boolean is
 // provided. In that case, returns last valid position and sets isCoding to FALSE.
 {
 if (isCoding == NULL && (basePos < gp->cdsStart || basePos >= gp->cdsEnd))
     return -1;
 
 boolean reverse = (stranded && gp->strand[0] == '-');
 assert(gp->exonStarts != NULL && gp->exonEnds != NULL);
 
 // Walk through exons, advancing depending upon strand
 int codingBasesSoFar = 0;
 int exonIx = (reverse ? (gp->exonCount - 1) : 0);
 while (0 <= exonIx && exonIx < gp->exonCount)
     {
     int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart);
     int exonEnd   = min(gp->exonEnds[  exonIx],gp->cdsEnd  );
     if (exonStart < exonEnd) // coding exon
         {
         // Within this exon?
         if (exonStart <= basePos && basePos < exonEnd)
             {
             if (isCoding != NULL)
                 *isCoding = TRUE;
             if (reverse)
                 return (codingBasesSoFar + (exonEnd - basePos));
             else
                 return (codingBasesSoFar + (basePos - exonStart));
             }
 
         // Past the target base already?
         if (( reverse && basePos >= exonEnd  )
         ||  (!reverse && basePos <  exonStart))
             break;
 
         // Yet to reach the target base... accumulate exon worth
         codingBasesSoFar += (exonEnd - exonStart);
         }
 
     exonIx += (reverse ? -1 : 1);
     }
 
 if (isCoding != NULL && codingBasesSoFar > 0)
     {
     *isCoding = FALSE;
     return codingBasesSoFar;
     }
 return -1;  // introns not okay
 }
 
 struct genePredExt  *genePredFromBedBigGenePred( char *chrom, struct bed *bed, struct bigBedInterval *bb)
 /* build a genePred from a bigGenePred and a bed file */
 {
 char *extra = cloneString(bb->rest);
 int numCols = 12 + 8 - 3;
 char *row[numCols];
 int wordCount = chopByChar(extra, '\t', row, numCols);
 if (wordCount < numCols)
     errAbort("expected at least %d columns in bigGenePred, got %d; is this actually a bigGenePred?", numCols, wordCount);
 
 struct genePredExt *gp = bedToGenePredExt(bed);
 
 gp->name2 = cloneString(row[ 9]);
 
 int numBlocks;
 sqlSignedDynamicArray(row[ 12],  &gp->exonFrames, &numBlocks);
 gp->optFields |= genePredExonFramesFld;
 //assert (numBlocks == gp->exonCount);
 
 gp->type = cloneString(row[13]);
 gp->geneName = cloneString(row[14]);
 gp->geneName2 = cloneString(row[15]);
 
 return gp;
 }
 
 struct genePredExt  *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb)
 /* build a genePred from a bigGenePred */
 {
 char *extra = cloneString(bb->rest);
 int numCols = 12 + 8 - 3;
 char *row[numCols];
 int wordCount = chopByChar(extra, '\t', row, numCols);
 assert(wordCount == numCols);
 
 struct genePredExt *gp;
 AllocVar(gp);
 
 gp->chrom = chrom;
 gp->txStart = bb->start;
 gp->txEnd = bb->end;
 gp->name =  cloneString(row[ 0]);
 gp->strand[0] =  row[ 2][0];
 gp->strand[1] =  row[ 2][1];
 gp->cdsStart =  atoi(row[ 3]);
 gp->cdsEnd =  atoi(row[ 4]);
 gp->exonCount =  atoi(row[ 6]);
 int numBlocks;
 sqlUnsignedDynamicArray(row[ 8],  &gp->exonStarts, &numBlocks);
 assert (numBlocks == gp->exonCount);
 sqlUnsignedDynamicArray(row[ 7],  &gp->exonEnds, &numBlocks);
 assert (numBlocks == gp->exonCount);
 
 int ii;
 for(ii=0; ii < numBlocks; ii++)
     {
     gp->exonStarts[ii] += bb->start;
     gp->exonEnds[ii] += gp->exonStarts[ii];
     }
 
 gp->name2 = cloneString(row[ 9]);
 
 gp->cdsStartStat = parseCdsStat(row[ 10]);
 gp->cdsEndStat = parseCdsStat(row[ 11]);
 sqlSignedDynamicArray(row[ 12],  &gp->exonFrames, &numBlocks);
 gp->optFields |= genePredExonFramesFld;
 assert (numBlocks == gp->exonCount);
 
 gp->type = cloneString(row[13]);
 gp->geneName = cloneString(row[14]);
 gp->geneName2 = cloneString(row[15]);
 
 return gp;
 }
 
 static void sqlUnsignedDynamicArrayNoClobber(char *s, unsigned **retArray, int *retSize)
 /* Make a copy of s on stack and chop that up so we don't mangle s. */
 {
 char copy[strlen(s)+1];
 safecpy(copy, sizeof(copy), s);
 sqlUnsignedDynamicArray(copy, retArray, retSize);
 }
 
 struct genePredExt  *genePredFromBigGenePredRow(char **row)
 /* build a genePred from a bigGenePred row */
 {
 struct genePredExt *gp;
 AllocVar(gp);
 gp->chrom = cloneString(row[0]);
 gp->txStart = sqlUnsigned(row[1]);
 gp->txEnd = sqlUnsigned(row[2]);
 gp->name = cloneString(row[3]);
 gp->strand[0] = row[5][0];
 gp->strand[1] = row[5][1];
 gp->cdsStart = sqlUnsigned(row[6]);
 gp->cdsEnd = sqlUnsigned(row[7]);
 gp->exonCount = sqlUnsigned(row[9]);
 int numBlocks;
 sqlUnsignedDynamicArrayNoClobber(row[11], &gp->exonStarts, &numBlocks);
 assert (numBlocks == gp->exonCount);
 // First put blockSizes in exonEnds:
 sqlUnsignedDynamicArrayNoClobber(row[10], &gp->exonEnds, &numBlocks);
 assert (numBlocks == gp->exonCount);
 // Then add in txStart to relative starts, and add starts to block sizes to get ends:
 int ii;
 for(ii=0; ii < numBlocks; ii++)
     {
     gp->exonStarts[ii] += gp->txStart;
     gp->exonEnds[ii] += gp->exonStarts[ii];
     }
 gp->name2 = cloneString(row[12]);
 gp->cdsStartStat = parseCdsStat(row[13]);
 gp->cdsEndStat = parseCdsStat(row[14]);
 gp->optFields |= genePredCdsStatFld;
 sqlSignedDynamicArray(row[15],  &gp->exonFrames, &numBlocks);
 assert (numBlocks == gp->exonCount);
 gp->optFields |= genePredExonFramesFld;
 gp->type = cloneString(row[16]);
 gp->geneName = cloneString(row[17]);
 gp->geneName2 = cloneString(row[18]);
 return gp;
 }
 
 struct cds
 /* CDS sequence being assembled */
 {
     char *bases;    // CDS string being built
     int iCds;       // Index into CDS
     int nextFrame;  // next required frame
 };
 
 static struct dnaSeq* getGeneRegion(struct nibTwoCache* genomeSeqs,  struct genePred *gp, int start, int end,
                                     int *chromSize)
 /* Get the genome sequence covering the a region of a gene, in transcription order */
 {
 struct dnaSeq *dna = nibTwoCacheSeqPart(genomeSeqs, gp->chrom, start, end-start, chromSize);
 if (gp->strand[0] == '-')
     reverseComplement(dna->dna, dna->size);
 return dna;
 }
 
 static void removePartialCodon(struct cds* cds)
 /* remove partial codon that has already been copied to CDS */
 {
 int iCdsNew = cds->iCds - cds->nextFrame;
 if (iCdsNew < 0)
     iCdsNew = 0;
 zeroBytes(cds->bases+iCdsNew, (cds->iCds - iCdsNew));
 cds->iCds = iCdsNew;
 cds->nextFrame = 0;
 }
 
 static void addCdsExonBases(struct nibTwoCache* genomeSeqs, struct genePred *genePred,
                             int exonCdsStart, int exonCdsEnd, struct cds* cds, int *chromSize)
 {
 struct dnaSeq *dna = getGeneRegion(genomeSeqs, genePred, exonCdsStart, exonCdsEnd, chromSize);
 int iDna = 0;
 int iCdsStart = cds->iCds;
 while (iDna < dna->size)
     cds->bases[cds->iCds++] = dna->dna[iDna++];
 cds->nextFrame = ((cds->nextFrame) + (cds->iCds - iCdsStart)) % 3;
 dnaSeqFree(&dna);
 }
     
 static void addCdsExon(struct nibTwoCache* genomeSeqs, struct genePred *gp,
                        int exonCdsStart, int exonCdsEnd, int frame,
                        struct cds* cds)
 /* get CDS from one exon */
 {
 // adjust for frame shift, dropping partial codon
 if (frame != cds->nextFrame)
     {
     removePartialCodon(cds);
     if (gp->strand[0] == '+')
         exonCdsStart += frame;
     else
         exonCdsEnd -= frame;
     }
 if (exonCdsStart < exonCdsEnd)
     {
     int chromSize = 0;
     addCdsExonBases(genomeSeqs, gp, exonCdsStart, exonCdsEnd, cds, &chromSize);
     }
 }
 
 static char* getCdsCodons(struct genePred *gp, struct nibTwoCache* genomeSeqs)
 /* get the CDS sequence, dropping partial codons */
 {
 struct cds cds;
 cds.bases = needMem(genePredCdsSize(gp) + 1);
 cds.iCds = 0;
 cds.nextFrame = 0;
     
 // walk in direction of transcription
 int dir = (gp->strand[0] == '+') ? 1 : -1;
 int iExon = (dir > 0) ? 0 : gp->exonCount-1;
 int iStop = (dir > 0) ? gp->exonCount : -1;
 int exonCdsStart, exonCdsEnd;
 for (; iExon != iStop; iExon += dir)
     {
     if (genePredCdsExon(gp, iExon, &exonCdsStart, &exonCdsEnd))
         addCdsExon(genomeSeqs, gp, exonCdsStart, exonCdsEnd, gp->exonFrames[iExon], &cds);
     }
 if (cds.nextFrame != 0)
     removePartialCodon(&cds);
 assert((strlen(cds.bases) % 3) == 0);  // ;-)
 return cds.bases;
 }
 
 static char translateCodon(boolean isChrM, char* codon, bool lastCodon, unsigned options)
 /* translate the first three bases starting at codon, handling weird
  * biology as requested giving */
 {
 char aa = isChrM ? lookupMitoCodon(codon) : lookupCodon(codon);
 if (aa == '\0')
     {
     // stop, contains `N' or selenocysteine
     boolean isStopOrSelno = isStopCodon(codon);
     boolean isRealStop = isReallyStopCodon(codon, !lastCodon); // internal could be selenocysteine
     if (lastCodon)
         {
         if ((options & GENEPRED_TRANSLATE_INCLUDE_STOP) != 0)
             aa = '*';
         else if (!isRealStop)
             aa = 'X';
         // others, \0' will terminate
         }
     else if (((options & GENEPRED_TRANSLATE_SELENO) != 0)
              && isStopOrSelno && !isRealStop)
         aa = 'U';
     else if (isRealStop && ((options & GENEPRED_TRANSLATE_STAR_INFRAME_STOPS) != 0))
         aa = '*';
     else
         aa = 'X';
     }
 return aa;
 }
 
 static char* translateCds(char* chrom, char* cds, unsigned options)
 /* translate the CDS */
 {
 int cdsLen = strlen(cds);
 char *prot = needMem((cdsLen/3)+1);
 boolean isChrM = isMito(chrom);
 int iCds, iProt;
 for (iCds = 0, iProt = 0; iCds < cdsLen; iCds+=3, iProt++)
     prot[iProt] = translateCodon(isChrM, cds+iCds, (iCds == cdsLen-3), options);
 return prot;
 }
 
 void genePredTranslate(struct genePred *gp, struct nibTwoCache* genomeSeqs, unsigned options,
                        char **protRet, char **cdsRet)
 /* Translate a genePred into a protein.  It can also return the CDS part of the
  * mRNA sequence. If the chrom is chrM, the mitochondrial translation tables are
  * used. If protRet or cdsRet is NULL, those sequences are not returned.
  */
 {
 // note: code tests by genePredToProt
 bool haveFrames = (gp->exonFrames != NULL);
 if (!haveFrames)
     genePredAddExonFrames(gp);  // assume correct frame if not included
 
 char* cds = getCdsCodons(gp, genomeSeqs);
 char *prot = translateCds(gp->chrom, cds, options);
 
 if (protRet != NULL)
     *protRet = prot;
 else
     freeMem(prot);
 if (cdsRet != NULL)
     *cdsRet = cds;
 else
     freeMem(cds);
 
 if (!haveFrames)
     freez(&gp->exonFrames);
 }
 
 void genePredToCds(struct genePred *gp, struct genbankCds *cds)
 /* Fill in cds with transcript offsets computed from genePred. */
 {
 /*
  * Warning: Genbank CDS does't have the ability to represent
  * partial codons.  If we have genePreds created from GFF/GTF, they can have
  * partial codons, which is indicated in frame.  This code does not correctly handle
  * this case, or frame shifting indels.
  */
 cds->start = cds->end = -1;
 cds->startComplete = cds->endComplete = cds->complement = FALSE;
 if (gp->cdsEnd > gp->cdsStart)
     {
     int e, off = 0;
     int qCdsStart = -1, qCdsEnd = -1;
     for (e = 0; e < gp->exonCount; ++e)
         {
         int eCdsStart, eCdsEnd;
         if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd))
             {
             if (qCdsStart < 0)
                 qCdsStart = off + (eCdsStart - gp->exonStarts[e]);
             qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]);
             }
         off += gp->exonEnds[e] - gp->exonStarts[e];
         }
     int qSize = off;
     if (gp->strand[0] == '-')
         reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
     cds->start = qCdsStart;
     cds->end = qCdsEnd;
     cds->startComplete = (gp->cdsStartStat != cdsIncomplete);
     cds->endComplete = (gp->cdsEndStat != cdsIncomplete);
     }
 }
 
 struct psl *genePredToPsl(struct genePred *gp, int chromSize, int qSize)
 /* Convert a genePred to psl, assuming perfect concordance between target & query.
  * If qSize is 0 then the number of aligned bases will be used as qSize. */
 {
 int e = 0, aliSize = 0;
 for (e = 0; e < gp->exonCount; ++e)
     aliSize += (gp->exonEnds[e] - gp->exonStarts[e]);
 struct psl *psl = pslNew(gp->name, qSize ? qSize : aliSize, 0, aliSize,
                          gp->chrom, chromSize, gp->txStart, gp->txEnd,
                          gp->strand, gp->exonCount, 0);
 // If qSize is greater than aliSize then we assume the extra bases are at the end of
 // the transcript (poly-A tail).  If the alignment is on the '-' strand, then we need
 // to offset the reversed qStarts by the number of extra bases.
 int sizeAdjust = (gp->strand[0] == '-' && qSize > aliSize) ? (qSize - aliSize) : 0;
 int i = -1;
 for (e = 0; e < gp->exonCount; ++e)
     {
     if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1])
         {
         i++;
         psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]);
         psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1];
         psl->tStarts[i] = gp->exonStarts[e];
         }
     else
         {
         // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure
         psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]);
         }
     }
 psl->blockCount = i+1;
 psl->match = aliSize;
 psl->tNumInsert = psl->blockCount-1;
 psl->tBaseInsert = (gp->txEnd - gp->txStart) - aliSize;
 return psl;
 }