src/hg/lib/genePred.c 1.103
1.103 2010/05/10 08:23:35 kent
Fixing indentation.
Index: src/hg/lib/genePred.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/lib/genePred.c,v
retrieving revision 1.102
retrieving revision 1.103
diff -b -B -U 1000000 -r1.102 -r1.103
--- src/hg/lib/genePred.c 13 Feb 2009 01:02:16 -0000 1.102
+++ src/hg/lib/genePred.c 10 May 2010 08:23:35 -0000 1.103
@@ -1,1722 +1,1723 @@
/* genePred.c was originally generated by the autoSql program, which also
* generated genePred.h and genePred.sql. This module links the database and the RAM
* representation of objects. */
#include "common.h"
#include "gff.h"
#include "jksql.h"
#include "psl.h"
#include "linefile.h"
#include "genePred.h"
#include "genbank.h"
#include "rangeTree.h"
#include "hdb.h"
static char const rcsid[] = "$Id$";
/* SQL to create a genePred table */
static char *createSql =
"CREATE TABLE %s ("
" %s" /* bin column goes here */
" name varchar(255) not null," /* mrna accession of gene */
" chrom varchar(255) not null," /* Chromosome name */
" strand char(1) not null," /* + or - for strand */
" txStart int unsigned not null," /* Transcription start position */
" txEnd int unsigned not null," /* Transcription end position */
" cdsStart int unsigned not null," /* Coding region start */
" cdsEnd int unsigned not null," /* Coding region end */
" exonCount int unsigned not null," /* Number of exons */
" exonStarts longblob not null," /* Exon start positions */
" exonEnds longblob not null," /* Exon end positions */
" INDEX(name)";
static char *binFieldSql =
" bin smallint unsigned not null,"
" INDEX(chrom,bin),";
static char *noBinIndexSql =
" INDEX(chrom,txStart),";
static char *scoreFieldSql =
" ,score int";
static char *name2FieldSql =
" ,name2 varchar(255) not null," /* Secondary name. (e.g. name of gene) or NULL if not available */
" INDEX(name2)";
static char *cdsStatFieldSql =
" ,cdsStartStat enum('none', 'unk', 'incmpl', 'cmpl') not null," /* Status of cdsStart annotation */
" cdsEndStat enum('none', 'unk', 'incmpl', 'cmpl') not null"; /* Status of cdsEnd annotation */
static char *exonFramesFieldSql =
" ,exonFrames longblob not null"; /* List of frame for each exon, or -1 if no frame or not known. NULL if not available. */
struct genePred *genePredLoad(char **row)
/* Load a genePred from row fetched with select * from genePred
* from database. Dispose of this with genePredFree(). */
{
struct genePred *ret;
int sizeOne;
AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
ret->name, sizeOne, ret->exonCount);
return ret;
}
struct genePred *genePredLoadAll(char *fileName)
/* Load all genePred from a whitespace-separated file.
* Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[10];
while (lineFileRow(lf, row))
{
el = genePredLoad(row);
slAddHead(&list, el);
}
lineFileClose(&lf);
slReverse(&list);
return list;
}
struct genePred *genePredLoadAllByChar(char *fileName, char chopper)
/* Load all genePred from a chopper separated file.
* Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[10];
while (lineFileNextCharRow(lf, chopper, row, ArraySize(row)))
{
el = genePredLoad(row);
slAddHead(&list, el);
}
lineFileClose(&lf);
slReverse(&list);
return list;
}
struct genePred *genePredCommaIn(char **pS, struct genePred *ret)
/* Create a genePred out of a comma separated string.
* This will fill in ret if non-null, otherwise will
* return a new genePred */
{
char *s = *pS;
int i;
if (ret == NULL)
AllocVar(ret);
ret->name = sqlStringComma(&s);
ret->chrom = sqlStringComma(&s);
sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand));
ret->txStart = sqlUnsignedComma(&s);
ret->txEnd = sqlUnsignedComma(&s);
ret->cdsStart = sqlUnsignedComma(&s);
ret->cdsEnd = sqlUnsignedComma(&s);
ret->exonCount = sqlUnsignedComma(&s);
s = sqlEatChar(s, '{');
AllocArray(ret->exonStarts, ret->exonCount);
for (i=0; i<ret->exonCount; ++i)
{
ret->exonStarts[i] = sqlUnsignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
s = sqlEatChar(s, '{');
AllocArray(ret->exonEnds, ret->exonCount);
for (i=0; i<ret->exonCount; ++i)
{
ret->exonEnds[i] = sqlUnsignedComma(&s);
}
s = sqlEatChar(s, '}');
s = sqlEatChar(s, ',');
*pS = s;
return ret;
}
void genePredFree(struct genePred **pEl)
/* Free a single dynamically allocated genePred such as created
* with genePredLoad(). */
{
struct genePred *el;
if ((el = *pEl) == NULL) return;
freeMem(el->name);
freeMem(el->chrom);
freeMem(el->exonStarts);
freeMem(el->exonEnds);
freeMem(el->name2);
freeMem(el->exonFrames);
freez(pEl);
}
void genePredFreeList(struct genePred **pList)
/* Free a list of dynamically allocated genePred's */
{
struct genePred *el, *next;
for (el = *pList; el != NULL; el = next)
{
next = el->next;
genePredFree(&el);
}
*pList = NULL;
}
void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep)
/* Print out genePred. Separate fields with sep. Follow last field with lastSep. */
{
int i;
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->name);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->chrom);
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", el->strand);
if (sep == ',') fputc('"',f);
fputc(sep,f);
fprintf(f, "%u", el->txStart);
fputc(sep,f);
fprintf(f, "%u", el->txEnd);
fputc(sep,f);
fprintf(f, "%u", el->cdsStart);
fputc(sep,f);
fprintf(f, "%u", el->cdsEnd);
fputc(sep,f);
fprintf(f, "%u", el->exonCount);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
{
fprintf(f, "%u", el->exonStarts[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
{
fprintf(f, "%u", el->exonEnds[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
/* optional fields, >= test is used so unspecified coumns can be filled in */
if (el->optFields >= genePredScoreFld)
{
fputc(sep,f);
fprintf(f, "%d", el->score);
}
if (el->optFields >= genePredName2Fld)
{
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", ((el->name2 != NULL) ? el->name2 : ""));
if (sep == ',') fputc('"',f);
}
if (el->optFields >= genePredCdsStatFld)
{
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", genePredCdsStatStr(el->cdsStartStat));
if (sep == ',') fputc('"',f);
fputc(sep,f);
if (sep == ',') fputc('"',f);
fprintf(f, "%s", genePredCdsStatStr(el->cdsEndStat));
if (sep == ',') fputc('"',f);
}
if (el->optFields >= genePredExonFramesFld)
{
fputc(sep,f);
if (sep == ',') fputc('{',f);
for (i=0; i<el->exonCount; ++i)
{
fprintf(f, "%d", el->exonFrames[i]);
fputc(',', f);
}
if (sep == ',') fputc('}',f);
}
fputc(lastSep,f);
}
/* --------- Start of hand generated code. ---------------------------- */
char *genePredCdsStatStr(enum cdsStatus stat)
/* get string value of a cdsStatus */
{
switch (stat) {
case cdsNone:
return "none";
case cdsUnknown:
return "unk";
case cdsIncomplete:
return "incmpl";
case cdsComplete:
return "cmpl";
default:
errAbort("invalid cdsStatus enum value: %d", stat);
return NULL; /* make compiler happy */
}
}
static enum cdsStatus parseCdsStat(char *statStr)
/* parse a cdsStatus string */
{
if ((statStr == NULL) || sameString(statStr, "none"))
return cdsNone;
if (sameString(statStr, "unk"))
return cdsUnknown;
if (sameString(statStr, "incmpl"))
return cdsIncomplete;
if (sameString(statStr, "cmpl"))
return cdsComplete;
errAbort("invalid genePred cdsStatus: \"%s\"", statStr);
return cdsNone; /* make compiler happy */
}
struct genePred *genePredExtLoad(char **row, int numCols)
/* Load a genePred with from a row, with optional fields. The row must
* contain columns in the order in the struct, and they must be present up to
* the last specfied optional field. Missing intermediate fields must have
* zero or empty columns, they may not be omitted. Fields at the end can be
* omitted. Dispose of this with genePredFree(). */
{
struct genePred *ret;
int sizeOne, iCol;
AllocVar(ret);
ret->exonCount = sqlUnsigned(row[7]);
ret->name = cloneString(row[0]);
ret->chrom = cloneString(row[1]);
strcpy(ret->strand, row[2]);
ret->txStart = sqlUnsigned(row[3]);
ret->txEnd = sqlUnsigned(row[4]);
ret->cdsStart = sqlUnsigned(row[5]);
ret->cdsEnd = sqlUnsigned(row[6]);
sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne);
if (sizeOne != ret->exonCount)
errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)",
ret->name, sizeOne, ret->exonCount);
sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne);
if (sizeOne != ret->exonCount)
errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)",
ret->name, sizeOne, ret->exonCount);
iCol=GENEPRED_NUM_COLS;
if (iCol < numCols)
{
ret->score = sqlSigned(row[iCol++]);
ret->optFields |= genePredScoreFld;
}
if (iCol < numCols)
{
ret->name2 = cloneString(row[iCol++]);
ret->optFields |= genePredName2Fld;
}
if (iCol < numCols)
{
ret->cdsStartStat = parseCdsStat(row[iCol++]);
ret->optFields |= genePredCdsStatFld;
}
if (iCol < numCols)
{
ret->cdsEndStat = parseCdsStat(row[iCol++]);
ret->optFields |= genePredCdsStatFld;
}
if (iCol < numCols)
{
sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne);
if (sizeOne != ret->exonCount)
errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)",
ret->name, sizeOne, ret->exonCount);
ret->optFields |= genePredExonFramesFld;
}
return ret;
}
struct genePred *genePredExtLoadAll(char *fileName)
/* Load all genePreds with from tab-separated file, possibly with optional
* fields. Dispose of this with genePredFreeList(). */
{
struct genePred *list = NULL, *el;
struct lineFile *lf = lineFileOpen(fileName, TRUE);
char *row[GENEPREDX_NUM_COLS];
int numCols;
while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0)
{
lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols);
el = genePredExtLoad(row, numCols);
slAddHead(&list, el);
}
lineFileClose(&lf);
slReverse(&list);
return list;
}
int genePredCmp(const void *va, const void *vb)
/* Compare to sort based on chromosome, txStart. */
{
const struct genePred *a = *((struct genePred **)va);
const struct genePred *b = *((struct genePred **)vb);
int dif;
dif = strcmp(a->chrom, b->chrom);
if (dif == 0)
dif = a->txStart - b->txStart;
return dif;
}
static boolean haveStartStopCodons(struct gffFile *gff)
/* For GFFs, determine if any of the annotations use start_codon or
* stop_codon */
{
struct gffFeature *feature;
for(feature = gff->featureList; feature != NULL; feature = feature->next)
{
if(sameWord(feature->name, "start_codon") ||
sameWord(feature->name, "start_codon"))
return TRUE;
}
return FALSE;
}
static boolean isExon(char *feat, boolean isGtf, char *exonSelectWord)
/* determine if a feature is an exon; different criteria for GFF ane GTF */
{
if (isGtf)
{
/* must check for stop_codon here, because GTF put it outside of the
* CDS */
return (sameWord(feat, "CDS") || sameWord(feat, "stop_codon")
|| sameWord(feat, "start_codon") || sameWord(feat, "exon")
|| sameWord(feat, "utr") || sameWord(feat, "5utr")
|| sameWord(feat, "3utr"));
}
else
{
return ((exonSelectWord == NULL) || sameWord(feat, exonSelectWord));
}
}
static boolean isStartStopCodon(char *feat)
/* determine if a feature is GTF style start/stop codon */
{
return sameWord(feat, "stop_codon") || sameWord(feat, "start_codon");
}
static boolean isCds(char *feat)
/* determine if a feature is CDS */
{
return sameWord(feat, "CDS") || isStartStopCodon(feat);
}
static void chkGroupLine(struct gffGroup *group, struct gffLine *gl, struct genePred *gp)
/* check that a gffLine is consistent with the genePred being built. this
* helps detect some problems that lead to corrupt genePreds */
{
if (!(sameString(gl->seq, gp->chrom) && (gl->strand == gp->strand[0])))
{
fprintf(stderr, "invalid gffGroup detected on line: ");
gffTabOut(gl, stderr);
errAbort("GFF/GTF group %s on %s%c, this line is on %s%c, all group members must be on same seq and strand",
group->name, gp->chrom, gp->strand[0],
gl->seq, gl->strand);
}
}
static void fixStopFrame(struct genePred *gp)
/* Nasty corner case: GTF with the all or part of the stop codon as the only
* codon in an exon, we must set the frame on this exon. */
{
int stopPos = (gp->strand[0] == '+') ? gp->cdsEnd-1 : gp->cdsStart;
int iStop = -1, i;
/* find position containing last base */
for (i = 0; (i < gp->exonCount) && (iStop < 0); i++)
{
if ((gp->exonStarts[i] <= stopPos) && (stopPos < gp->exonEnds[i]))
iStop = i;
}
if ((iStop >= 0) && (gp->exonFrames[iStop] < 0))
{
if (gp->strand[0] == '+')
{
/* pos, compute from previous exon */
assert(iStop > 0);
gp->exonFrames[iStop] = (gp->exonFrames[iStop-1]+(gp->exonEnds[iStop-1]-gp->exonStarts[iStop-1])) % 3;
}
else
{
/* neg, compute from next exon */
assert(iStop < gp->exonCount-1);
gp->exonFrames[iStop] = (gp->exonFrames[iStop+1]+(gp->exonEnds[iStop+1]-gp->exonStarts[iStop+1])) % 3;
}
}
}
static int phaseToFrame(char phase)
/* convert GTF/GFF phase to frame */
{
switch (phase)
{
case '0':
return 0;
case '1':
return 2;
case '2':
return 1;
}
return -1;
}
static int findPosExon(struct genePred *gp, int pos)
/* find exon containing the specified position */
{
int i;
for (i = 0; i < gp->exonCount; i++)
{
if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i]))
return i;
}
errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name);
return 0;
}
static boolean adjImpliedStopPos(struct genePred *gp)
/* implied stop codon adjustment for positive strand gene */
{
int iExon = findPosExon(gp, gp->cdsEnd-1);
unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd);
if (space >= 3)
{
// room in current exon
gp->cdsEnd += 3;
return TRUE;
}
else if ((iExon < gp->exonCount)
&& ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3)
{
// room including next exon
gp->cdsEnd = gp->exonStarts[iExon+1] + (3-space);
return TRUE;
}
return FALSE;
}
static boolean adjImpliedStopNeg(struct genePred *gp)
/* implied stop codon adjustment for negative strand gene */
{
int iExon = findPosExon(gp, gp->cdsStart);
unsigned space = (gp->cdsStart-gp->exonStarts[iExon]);
if (space >= 3)
{
// room in current exon
gp->cdsStart -= 3;
return TRUE;
}
else if ((iExon > 0)
&& ((gp->exonEnds[iExon-1]-gp->exonStarts[iExon-1])+space) >= 3)
{
// room including previous exon
gp->cdsStart = gp->exonEnds[iExon-1] - (3-space);
return TRUE;
}
return FALSE;
}
static void adjImpliedStopAfterCds(struct genePred *gp)
/* adjust CDS bounds to include implied stop codon outside of CDS. */
{
if (gp->strand[0] == '+')
{
if (adjImpliedStopPos(gp))
{
if (gp->optFields & genePredCdsStatFld)
gp->cdsEndStat = cdsComplete;
}
}
else
{
if (adjImpliedStopNeg(gp))
{
if (gp->optFields & genePredCdsStatFld)
gp->cdsStartStat = cdsComplete;
}
}
}
static void checkForNoFrames(struct genePred *gp)
/* check for requesting frame from a file without frames. */
{
int i;
/* Complain if we have a CDS but exonFrames are all -1: */
if (gp->cdsStart < gp->cdsEnd)
{
boolean foundReal = FALSE;
for (i = 0; i < gp->exonCount; i++)
{
if (gp->exonFrames[i] >= 0 && gp->exonFrames[i] <= 2)
{
foundReal = TRUE;
break;
}
}
if (! foundReal)
{
errAbort("Error: exonFrames field is being added, but I found a "
"gene (%s) with CDS but no valid frames. "
"This can happen if program is invoked with -genePredExt "
"but no valid frames are given in the file. If the 8th "
"field of GFF/GTF file is always a placeholder, then don't use "
"-genePredExt.",
gp->name);
}
}
}
static boolean isGtfFeature(char *feat)
/* check if a feature is one of the recognized features. All otherse
* should be ignored. http://mblab.wustl.edu/GTF22.html */
{
static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR",
"inter", "inter_CNS", "intron_CNS", "exon", NULL};
int i;
for (i = 0; gtfFeats[i] != NULL; i++)
{
if (sameString(feat, gtfFeats[i]))
return TRUE;
}
return FALSE;
}
static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf)
/* should the specified line be skipped? */
{
if (isGtf)
return !isGtfFeature(gl->feature);
else
return FALSE;
}
static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf)
/* should this feature be used to assign frame? */
{
if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf))
return FALSE;
// for GFF, any features with frame can be used to set frame,
// with GTF, there are some bogus files that set frame on UTR features,
// so only use CDS
if (isGtf)
return sameString(gl->feature, "CDS");
else
return (gl->frame != '.');
}
static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr,
struct gffLine *gl, boolean isGtf)
/* set the frame for an exon, advancing gl past end of this exon */
{
/* skip lines preceeding this exon */
while (gl != NULL)
{
if (allowedFrameFeature(gl, isGtf) && (gl->end >= start))
break;
gl = gl->next;
}
/* set frame from any overlapping records with frame. Don't include
* start/stop_codon, as it isn't a full exon. */
while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0))
{
if (allowedFrameFeature(gl, isGtf))
{
int frame = phaseToFrame(gl->frame);
if (frame >= 0)
*framePtr = frame;
}
gl = gl->next;
}
return gl;
}
static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp)
/* Assign frame from GFF after genePred has been built.*/
{
struct gffLine *gl = group->lineList;
int start, end, i;
boolean haveFrame = FALSE;
for (i = 0; i < gp->exonCount; i++)
{
if (genePredCdsExon(gp, i, &start, &end))
{
gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf);
if (gp->exonFrames[i] >= 0)
haveFrame = TRUE;
}
}
/* make sure stop has face if some exons in the gene had frame */
if (haveFrame)
fixStopFrame(gp);
checkForNoFrames(gp);
}
static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name,
boolean isGtf, char *exonSelectWord, unsigned optFields,
unsigned options)
/* common function to create genePreds from GFFs or GTFs. This is a little
* ugly with to many check of isGtf, however the was way to much identical
* code the other way. Options are from genePredFromGxfOpts */
{
struct genePred *gp;
int stopCodonStart = -1, stopCodonEnd = -1;
int cdsStart = BIGNUM, cdsEnd = -BIGNUM;
int exonCount = 0;
boolean haveStartCodon = FALSE, haveStopCodon = FALSE;
struct gffLine *gl;
unsigned *eStarts, *eEnds;
int i;
/* should we count on start/stop codon annotation in GFFs? */
boolean useStartStops = isGtf || haveStartStopCodons(gff);
/* Count up exons and figure out cdsStart and cdsEnd. */
for (gl = group->lineList; gl != NULL; gl = gl->next)
{
if (ignoreGxfLine(gl, isGtf))
continue;
if (isExon(gl->feature, isGtf, exonSelectWord))
++exonCount;
if (isCds(gl->feature))
{
if (gl->start < cdsStart)
cdsStart = gl->start;
if (gl->end > cdsEnd)
cdsEnd = gl->end;
}
if (sameWord(gl->feature, "start_codon"))
haveStartCodon = TRUE;
if (sameWord(gl->feature, "stop_codon"))
{
/* stop_codon can be split, need bounds for adjusting CDS below */
if ((stopCodonStart < 0) || (gl->start < stopCodonStart))
stopCodonStart = gl->start;
if ((stopCodonEnd < 0) || (gl->end > stopCodonEnd))
stopCodonEnd = gl->end;
haveStopCodon = TRUE;
}
}
if (exonCount == 0)
return NULL;
if (cdsStart > cdsEnd)
{
/* no cds annotated */
cdsStart = 0;
cdsEnd = 0;
}
else if (stopCodonStart >= 0)
{
/* adjust CDS to include stop codon as in GTF */
if (group->strand == '+')
{
if (stopCodonEnd > cdsEnd)
cdsEnd = stopCodonEnd;
}
else
{
if (stopCodonStart < cdsStart)
cdsStart = stopCodonStart;
}
}
/* Allocate genePred and fill in values. */
AllocVar(gp);
gp->name = cloneString(name);
gp->chrom = cloneString(group->seq);
gp->strand[0] = group->strand;
gp->txStart = group->start;
gp->txEnd = group->end;
if (cdsStart < cdsEnd)
{
gp->cdsStart = cdsStart;
gp->cdsEnd = cdsEnd;
}
else
{
// no CDS, set to txEnd
gp->cdsStart = gp->txEnd;
gp->cdsEnd = gp->txEnd;
}
gp->exonStarts = AllocArray(eStarts, exonCount);
gp->exonEnds = AllocArray(eEnds, exonCount);
gp->optFields = optFields;
if (optFields & genePredName2Fld)
{
if (group->lineList->geneId != NULL)
gp->name2 = cloneString(group->lineList->geneId);
else
gp->name2 = cloneString("");
}
if (optFields & genePredCdsStatFld)
{
if (cdsStart < cdsEnd)
{
if (!useStartStops)
{
/* GFF doesn't require start or stop, if not used, assume complete */
gp->cdsStartStat = cdsComplete;
gp->cdsEndStat = cdsComplete;
}
else if (group->strand == '+')
{
gp->cdsStartStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
gp->cdsEndStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
}
else
{
gp->cdsEndStat = (haveStartCodon ? cdsComplete : cdsIncomplete);
gp->cdsStartStat = (haveStopCodon ? cdsComplete : cdsIncomplete);;
}
}
else
{
gp->cdsStartStat = cdsNone;
gp->cdsEndStat = cdsNone;
}
}
if (optFields & genePredExonFramesFld)
{
AllocArray(gp->exonFrames, exonCount);
for (i = 0; i < exonCount; i++)
gp->exonFrames[i] = -1;
}
/* adjust tx range to include stop codon */
if ((group->strand == '+') && (gp->txEnd == stopCodonStart))
gp->txEnd = stopCodonEnd;
else if ((group->strand == '-') && (gp->txStart == stopCodonEnd))
gp->txStart = stopCodonStart;
i = -1; /* before first exon */
/* fill in exons, merging overlaping and adjacent exons */
for (gl = group->lineList; gl != NULL; gl = gl->next)
{
if (ignoreGxfLine(gl, isGtf))
continue;
if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature)))
{
chkGroupLine(group, gl, gp);
if ((i < 0) || (gl->start > eEnds[i]))
{
/* start a new exon */
++i;
assert(i < exonCount);
eStarts[i] = gl->start;
eEnds[i] = gl->end;
}
else
{
/* overlap, extend exon, picking the largest of ends */
assert(i < exonCount);
assert(gl->start >= eStarts[i]);
if (gl->end > eEnds[i])
eEnds[i] = gl->end;
}
}
}
gp->exonCount = i+1;
/* adjust for flybase type of GFFs, with stop codon implied and outside of
* CDS. */
if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon)
adjImpliedStopAfterCds(gp);
if (optFields & genePredExonFramesFld)
assignFrame(isGtf, group, gp);
return gp;
}
struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name,
char *exonSelectWord, unsigned optFields, unsigned options)
/* Convert gff->groupList to genePred list. See genePred.h for details. */
{
struct gffLine *gl;
boolean anyExon = FALSE;
/* Look to see if any exons. If not allow CDS to be used instead. */
if (exonSelectWord)
{
for (gl = group->lineList; gl != NULL; gl = gl->next)
{
if (sameWord(gl->feature, exonSelectWord))
{
anyExon = TRUE;
break;
}
}
}
else
anyExon = TRUE;
if (!anyExon)
exonSelectWord = "CDS";
return mkFromGroupedGxf(gff, group, name, FALSE, exonSelectWord, optFields,
options);
}
struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name,
unsigned optFields, unsigned options)
/* Convert gff->groupList to genePred list, using GTF feature conventions.
* See genePred.h for details. */
{
return mkFromGroupedGxf(gff, group, name, TRUE, NULL, optFields,
options);
}
static void mapCdsToGenome(struct psl *psl, struct genbankCds* cds,
struct genePred* gene)
/* Convert set cdsStart/end from mrna to genomic coordinates. */
{
struct genbankCds genomeCds = genbankCdsToGenome(cds, psl);
if (genomeCds.start < genomeCds.end)
{
gene->cdsStart = genomeCds.start;
gene->cdsEnd = genomeCds.end;
if (gene->optFields & genePredCdsStatFld)
{
if (gene->strand[0] == '+')
{
gene->cdsStartStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
gene->cdsEndStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
}
else
{
gene->cdsStartStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;;
gene->cdsEndStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete;
}
}
}
else
{
/* CDS not aligned */
gene->cdsStart = gene->txEnd;
gene->cdsEnd = gene->txEnd;
if (gene->optFields & genePredCdsStatFld)
{
gene->cdsStartStat = cdsUnknown;
gene->cdsEndStat = cdsUnknown;
}
}
}
void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds,
struct genePred *gene)
/* Convert cdsStart/End from mrna to genomic coordinates.
* Note that the genePred blocks need not be filled in before
* this call. */
{
if (cds == NULL)
{
/* no CDS, set to zero-length at end */
gene->cdsStart = psl->tEnd;
gene->cdsEnd = psl->tEnd;
if (gene->optFields & genePredCdsStatFld)
{
gene->cdsStartStat = cdsNone;
gene->cdsEndStat = cdsNone;
}
}
else if ((cds->start < 0) || (cds->end <= 0))
{
/* unknown CDS, set to end */
gene->cdsStart = psl->tEnd;
gene->cdsEnd = psl->tEnd;
if (gene->optFields & genePredCdsStatFld)
{
gene->cdsStartStat = cdsUnknown;
gene->cdsEndStat = cdsUnknown;
}
}
else
{
/* have CDS annotation, make sure it's in bounds */
struct genbankCds adjCds = *cds;
if (adjCds.end > psl->qSize)
{
adjCds.end = psl->qSize;
adjCds.endComplete = FALSE;
}
mapCdsToGenome(psl, &adjCds, gene);
}
}
static int getFrame(struct psl *psl, int start, int end,
struct genbankCds* cds)
/* get the starting frame for an exon of a mRNA. start and end are
* the bounds of the exon within the mRNA. This may cover multiple psl
* blocks due to merging of small gaps. */
{
/* use the 3' end is used if it's complete, as it is more often accurate when
* genes are defined from mRNAs sequenced with reverse-transcriptase. */
int frame = -1;
/* map to mRNA coords in CDS since frame for an exon is in direction of
* transcription. */
if (psl->strand[0] == '-')
reverseIntRange(&start, &end, psl->qSize);
if (start < cds->start)
start = cds->start;
if (end > cds->end)
end = cds->end;
if (start < end)
{
/* compute from end if it is complete in mRNA */
if (cds->endComplete)
{
int fr = (cds->end-start) % 3;
frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0);
}
else
frame = (start-cds->start) % 3;
}
return frame;
}
static boolean shouldMergeBlocks(struct genePred *gene, int iExon,
unsigned tStart, unsigned options,
int cdsMergeSize, int utrMergeSize)
/* determine if a new block starting at tStart whould be merged with
* the preceeding exon, indicated by iExon.
*/
{
/* nothing to check if no exons have been added */
if (iExon >= 0)
{
boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd);
int gapSize = (tStart - gene->exonEnds[iExon]);
if (inCds)
{
if ((options & genePredPslCdsMod3) && ((gapSize % 3) != 0))
return FALSE; /* not a multiple of three */
if ((cdsMergeSize >= 0) && (gapSize <= cdsMergeSize))
return TRUE;
}
else
{
if ((utrMergeSize >= 0) && (gapSize <= utrMergeSize))
return TRUE;
}
}
return FALSE; /* don't merge */
}
static void pslToExons(struct psl *psl, struct genePred *gene,
struct genbankCds* cds, unsigned options,
int cdsMergeSize, int utrMergeSize)
/* Convert psl alignment blocks to genePred exons, merging together blocks by
* the specified paraemeters. Optionally add frame information. */
{
int iBlk, iExon;
int startIdx, stopIdx, idxIncr;
/* this is bigger than needed if blocks merge */
gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned));
gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned));
if (gene->optFields & genePredExonFramesFld)
{
/* default frames to indicate no frame */
gene->exonFrames = needMem(psl->blockCount*sizeof(unsigned));
for (iExon = 0; iExon < psl->blockCount; iExon++)
gene->exonFrames[iExon] = -1;
}
/* traverse psl in postive target order */
if (psl->strand[1] == '-')
{
startIdx = psl->blockCount-1;
stopIdx = -1;
idxIncr = -1;
}
else
{
startIdx = 0;
stopIdx = psl->blockCount;
idxIncr = 1;
}
iExon = -1; /* indicate none have been added */
for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr)
{
int tStart = psl->tStarts[iBlk];
int tEnd = tStart + psl->blockSizes[iBlk];
if (psl->strand[1] == '-')
reverseIntRange(&tStart, &tEnd, psl->tSize);
if (!shouldMergeBlocks(gene, iExon, tStart, options,
cdsMergeSize, utrMergeSize))
{
/* new exon */
iExon++;
gene->exonStarts[iExon] = tStart;
}
gene->exonEnds[iExon] = tEnd;
/* Set the frame. We have to deal with multiple blocks being merged. For
* positive strand, the first block frame is used, for negative strand,
* the last is used. */
if ((gene->optFields & genePredExonFramesFld) && (cds != NULL)
&& (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0))
|| (psl->strand[0] == '-')))
{
int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds);
if (fr >= 0)
gene->exonFrames[iExon] = fr;
}
}
gene->exonCount = iExon+1;
assert(gene->exonCount <= psl->blockCount);
}
struct genePred *genePredFromPsl3(struct psl *psl, struct genbankCds* cds,
unsigned optFields, unsigned options,
int cdsMergeSize, int utrMergeSize)
/* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS
* specification string to genomic coordinates. Small genomic inserts are
* merged based on the mergeSize parameters. Gaps no larger than the
* specified merge sizes result in the adjacent blocks being merged into a
* single exon. Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize. If
* the genePredPslCdsMod3 option is specified, then CDS gaps are only merged
* if a multiple of three. A negative merge sizes disables merging of blocks.
* This differs from specifying zero in that adjacent blocks will not be
* merged. The optfields field is a set from genePredFields, indicated what
* fields to create. Zero-length CDS, or null cds, creates without CDS
* annotation. If cds is null, it will set status fields to cdsNone. */
{
struct genePred *gene;
#if 0
/* FIXME; the browser calls this on a protein psl when rendering the amino
* acids of a protein alignment track. Some of this code doesn't really make
* sense for proteins and I really don't see how this produces the right
* results, however the check is disabled until this can be investigated
* because it *seems* to work. markd 2006-05-22 */
if (pslIsProtein(psl))
errAbort("can't convert protein psls to genePreds");
#endif
AllocVar(gene);
gene->name = cloneString(psl->qName);
gene->chrom = cloneString(psl->tName);
gene->txStart = psl->tStart;
gene->txEnd = psl->tEnd;
gene->optFields = optFields;
/* get strand in genome that the positive version mRNA aligns to */
if (psl->strand[1] == '\0')
{
/* assumed pos target strand, so neg query would be pos target */
gene->strand[0] = psl->strand[0];
}
else
{
/* query and target strand are different; will be neg when query pos */
gene->strand[0] = ((psl->strand[0] != psl->strand[1]) ? '-' : '+');
}
genePredAddGenbankCds(psl, cds, gene);
pslToExons(psl, gene, cds, options, cdsMergeSize, utrMergeSize);
return gene;
}
struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields,
struct genbankCds* cds, int insertMergeSize)
/* Compatibility function, genePredFromPsl3 is prefered. See that function's
* documentation for details. This calls genePredFromPsl3 with no options
* and insertMergeSize set for CDS and UTR.
*/
{
return genePredFromPsl3(psl, cds, optFields, genePredPslDefaults, insertMergeSize, insertMergeSize);
}
struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd,
int insertMergeSize)
/* Compatibility function, genePredFromPsl3 is prefered. See that function's
* documentation for details. This calls genePredFromPsl3 with no options.
*/
{
struct genbankCds cds;
ZeroVar(&cds);
cds.start = cdsStart;
cds.end = cdsEnd;
return genePredFromPsl3(psl, &cds, 0, genePredPslDefaults, insertMergeSize, insertMergeSize);
}
char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options,
int chromIndexLen)
/* Get SQL required to create a genePred table. optFields is a bit set
* consisting of the genePredFields values. Options are a bit set of
* genePredCreateOpts. Returned string should be freed. This will create all
* optional fields that preceed the highest optFields column. chromIndexLen
* is now ignored.. */
{
/* the >= is used so that we create preceeding fields. */
char sqlCmd[1024];
safef(sqlCmd, sizeof(sqlCmd), createSql, table,
((options & genePredWithBin) ? binFieldSql : noBinIndexSql));
if (optFields >= genePredScoreFld)
safecat(sqlCmd, sizeof(sqlCmd), scoreFieldSql);
if (optFields >= genePredName2Fld)
safecat(sqlCmd, sizeof(sqlCmd), name2FieldSql);
if (optFields >= genePredCdsStatFld)
safecat(sqlCmd, sizeof(sqlCmd), cdsStatFieldSql);
if (optFields >= genePredExonFramesFld)
safecat(sqlCmd, sizeof(sqlCmd), exonFramesFieldSql);
safecat(sqlCmd, sizeof(sqlCmd), ")");
return cloneString(sqlCmd);
}
// FIXME: this really doesn't belong in this module
struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap)
{
/* read all genes from a table find the gene with the biggest overlap.
Cache the list of genes to so we only read it once */
char query[256];
struct genePred *gene;
struct sqlConnection *conn;
struct sqlResult *sr;
char **row;
struct genePred *el = NULL, *bestMatch = NULL, *gp = NULL;
int overlap = 0 , bestOverlap = 0, i;
struct psl *psl;
int *eFrames;
if (*list == NULL)
{
printf("Loading Predictions from %s\n",table);
AllocVar(*list);
conn = hAllocConn(db);
AllocVar(gene);
safef(query, sizeof(query), "select * from %s", table);
sr = sqlGetResult(conn, query);
- while ((row = sqlNextRow(sr)) != NULL){
+ while ((row = sqlNextRow(sr)) != NULL)
+ {
if (!sameString(table,"all_mrna"))
{
el = genePredLoad(row);
}
else
{
psl = pslLoad(row);
el = genePredFromPsl2(psl, 0, NULL, genePredStdInsertMergeSize);
}
slAddHead(list, el);
}
slReverse(list);
sqlFreeResult(&sr);
hFreeConn(&conn);
}
for (el = *list; el != NULL; el = el->next)
{
if (chrom != NULL && el->chrom != NULL)
{
overlap = 0;
if ( sameString(chrom, el->chrom))
{
for (i = 0 ; i<(el->exonCount); i++)
{
overlap += positiveRangeIntersection(cStart,cEnd, el->exonStarts[i], el->exonEnds[i]) ;
}
if (overlap > 20 && sameString(name, el->name))
{
bestMatch = el;
bestOverlap = overlap;
*retOverlap = bestOverlap;
}
if (overlap > bestOverlap)
{
bestMatch = el;
bestOverlap = overlap;
*retOverlap = bestOverlap;
}
}
}
}
if (bestMatch != NULL)
{
/* Allocate genePred and fill in values. */
AllocVar(gp);
gp->name = cloneString(bestMatch->name);
gp->chrom = cloneString(bestMatch->chrom);
gp->strand[1] = bestMatch->strand[1];
gp->strand[0] = bestMatch->strand[0];
gp->txStart = bestMatch->txStart;
gp->txEnd = bestMatch->txEnd;
gp->cdsStart = bestMatch->cdsStart;
gp->cdsEnd = bestMatch->cdsEnd;
gp->exonCount = bestMatch->exonCount;
AllocArray(gp->exonStarts, bestMatch->exonCount);
AllocArray(gp->exonEnds, bestMatch->exonCount);
for (i=0; i<bestMatch->exonCount; ++i)
{
gp->exonStarts[i] = bestMatch->exonStarts[i] ;
gp->exonEnds[i] = bestMatch->exonEnds[i] ;
}
gp->optFields = bestMatch->optFields;
gp->score = bestMatch->score;
if (bestMatch->optFields & genePredName2Fld)
gp->name2 = cloneString(bestMatch->name2);
else
gp->name2 = NULL;
if (bestMatch->optFields & genePredCdsStatFld)
{
gp->cdsStartStat = bestMatch->cdsStartStat;
gp->cdsEndStat = bestMatch->cdsEndStat;
}
if (bestMatch->optFields & genePredExonFramesFld)
{
gp->exonFrames = AllocArray(eFrames, bestMatch->exonCount);
for (i = 0; i < bestMatch->exonCount; i++)
gp->exonFrames[i] = bestMatch->exonFrames[i];
}
eFrames = gp->exonFrames;
}
return gp;
}
int genePredBases(struct genePred *gp)
/* count coding and utr bases in a gene prediction */
{
int count = 0, i;
if (gp == NULL) return 0;
for (i=0; i<gp->exonCount; i++)
{
count += gp->exonEnds[i] - gp->exonStarts[i] ;
}
return count;
}
int genePredCodingBases(struct genePred *gp)
/* Count up the number of coding bases in gene prediction. */
{
int i, exonCount = gp->exonCount;
int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd;
int baseCount = 0;
for (i=0; i<exonCount; ++i)
{
baseCount += positiveRangeIntersection(cdsStart,cdsEnd,
gp->exonStarts[i], gp->exonEnds[i]);
}
return baseCount;
}
boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr)
/* Get the CDS range in an exon. If there is no CDS, return FALSE and then
* set start == end */
{
assert((iExon >= 0) && (iExon < gp->exonCount));
int start = gp->exonStarts[iExon];
int end = gp->exonEnds[iExon];
if (start < gp->cdsStart)
start = gp->cdsStart;
if (end > gp->cdsEnd)
end = gp->cdsEnd;
if (startPtr != NULL)
*startPtr = start;
if (endPtr != NULL)
*endPtr = end;
return (start < end);
}
/* state for genePredCheck */
static int gpErrorCnt = 0; /* error count for gpError */
static FILE *gpErrFh = NULL; /* error output for gpError */
static void gpError(char *format, ...)
/* print and count an error */
{
va_list args;
fprintf(gpErrFh, "Error: ");
va_start(args, format);
vfprintf(gpErrFh, format, args);
va_end(args);
fputc('\n', gpErrFh);
gpErrorCnt++;
}
static void checkExon(char *desc, struct genePred* gp, int iExon)
/* check one exon of a genePred */
{
unsigned exonStart = gp->exonStarts[iExon];
unsigned exonEnd = gp->exonEnds[iExon];
if (exonStart >= exonEnd)
gpError("%s: %s exon %u start %u >= end %u", desc, gp->name,
iExon, exonStart, exonEnd);
if (exonStart < gp->txStart)
gpError("%s: %s exon %u start %u < txStart %u", desc, gp->name,
iExon, exonStart, gp->txStart);
if (exonEnd > gp->txEnd)
gpError("%s: %s exon %u end %u > txEnd %u", desc, gp->name,
iExon, exonEnd, gp->txEnd);
if (iExon > 0)
{
/* other than first exon */
unsigned prevExonEnd = gp->exonEnds[iExon-1];
if (exonStart < prevExonEnd)
gpError("%s: %s exon %u overlaps previous exon", desc,
gp->name, iExon);
}
if (gp->optFields & genePredExonFramesFld)
{
int frame = gp->exonFrames[iExon];
if ((frame < -1) || (frame > 2))
gpError("%s: %s invalid exonFrame: %d", desc, gp->name, frame);
if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd))
{
if (frame == -1)
gpError("%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon);
}
else
{
if (frame != -1)
gpError("%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon);
}
}
}
static void checkCdsBounds(char *desc, FILE* out, struct genePred* gp)
/* check cdsStart/cdsEnd */
{
int iExon;
boolean foundCdsStart = FALSE, foundCdsEnd = FALSE;
if (gp->cdsStart > gp->cdsEnd)
gpError("%s: %s cdsStart %u > cdsEnd %u", desc, gp->name,
gp->cdsStart, gp->cdsEnd);
if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd))
gpError("%s: %s cdsStart %u not in tx bounds %u-%u", desc,
gp->name, gp->cdsStart, gp->txStart, gp->txEnd);
if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd))
gpError("%s: %s cdsEnd %u not in tx bounds %u-%u", desc,
gp->name, gp->cdsEnd, gp->txStart, gp->txEnd);
/* is cdsStart/cdsEnd in an exon? */
for (iExon = 0; iExon < gp->exonCount; iExon++)
{
if ((gp->cdsStart >= gp->exonStarts[iExon])
&& (gp->cdsStart < gp->exonEnds[iExon]))
foundCdsStart = TRUE;
if ((gp->cdsEnd > gp->exonStarts[iExon])
&& (gp->cdsEnd <= gp->exonEnds[iExon]))
foundCdsEnd = TRUE;
}
if (!foundCdsStart)
gpError("%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart);
if (!foundCdsEnd)
gpError("%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd);
}
int genePredCheck(char *desc, FILE* out, int chromSize,
struct genePred* gp)
/* Validate a genePred for consistency. desc is printed the error messages
* to file out (open /dev/null to discard). chromSize should contain
* size of chromosome, or 0 if chrom is not valid, or -1 to not check
* chromosome bounds. Returns count of errors. */
{
int iExon;
gpErrorCnt = 0;
gpErrFh = out;
if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-")))
gpError("%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand);
/* check chrom */
if (chromSize == 0)
gpError("%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom);
else if (chromSize > 0)
{
if (gp->txEnd > chromSize)
gpError("%s: %s txEnd %u >= chromSize %u", desc, gp->name,
gp->txEnd, chromSize);
}
/* check internal consistency */
if (gp->txStart >= gp->txEnd)
gpError("%s: %s txStart %u >= txEnd %u", desc, gp->name,
gp->txStart, gp->txEnd);
/* no CDS is indicated by cdsStart == cdsEnd */
if (gp->cdsStart != gp->cdsEnd)
checkCdsBounds(desc, out, gp);
/* make sure first/last exons match tx range */
if (gp->txStart != gp->exonStarts[0])
gpError("%s: %s first exon start %u doesn't match txStart %u", desc,
gp->name, gp->exonStarts[0], gp->txStart);
if (gp->txEnd != gp->exonEnds[gp->exonCount-1])
gpError("%s: %s last exon end %u doesn't match txEnd %u", desc,
gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd);
/* check each exon */
for (iExon = 0; iExon < gp->exonCount; iExon++)
checkExon(desc, gp, iExon);
return gpErrorCnt;
}
boolean genePredNmdTarget(struct genePred *gp)
/* Return TRUE if cds end is more than 50bp upstream of
last intron. */
{
int gpSize = 0;
int i = 0;
int startDist = 0, endDist = 0;
int blockSize = 0;
if(gp->exonCount < 2)
return FALSE;
for(i = 0; i < gp->exonCount; i++)
{
int blockStart = gp->exonStarts[i];
int blockEnd = gp->exonEnds[i];
blockSize = blockEnd - blockStart;
if( blockStart <= gp->cdsStart && gp->cdsStart <= blockEnd)
startDist += blockSize - (blockEnd - gp->cdsStart);
else if(blockEnd <= gp->cdsStart)
startDist += blockSize;
if( blockStart <= gp->cdsEnd && gp->cdsEnd <= blockEnd)
endDist += blockSize - (blockEnd - gp->cdsEnd);
else if(blockEnd <= gp->cdsEnd)
endDist += blockSize;
gpSize += blockSize;
}
/* xxxxxXXX----XXXXXXXXXXXXX--------XXXXXXX----XXXxxxxxxxx-------xxxxxxxxx */
if(sameString(gp->strand, "+"))
{
blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1];
return endDist < (gpSize - blockSize - 50);
}
else if(sameString(gp->strand, "-"))
{
blockSize = gp->exonEnds[0] - gp->exonStarts[0];
return startDist > (gp->exonEnds[1] + 50);
}
else
errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand);
return FALSE;
}
void genePredAddExonFrames(struct genePred *gp)
/* Add exonFrames array to a genePred that doesn't have it. Frame is assumed
* to be contiguous. */
{
int iExon, start, end, iBase = 0;
int iStart, iEnd, iDir;
/* initial array to all -1, for no frame */
AllocArray(gp->exonFrames, gp->exonCount);
gp->optFields |= genePredExonFramesFld;
for (iExon = 0; iExon < gp->exonCount; iExon++)
gp->exonFrames[iExon] = -1;
if (sameString(gp->strand, "+"))
{
iStart = 0;
iEnd = gp->exonCount;
iDir = 1;
}
else
{
iStart = gp->exonCount-1;
iEnd = -1;
iDir = -1;
}
for (iExon = iStart; iExon != iEnd; iExon += iDir)
{
if (genePredCdsExon(gp, iExon, &start, &end))
{
gp->exonFrames[iExon] = iBase % 3;
iBase += (end - start);
}
}
}
void genePredRc(struct genePred *gp, int chromSize)
/* Reverse complement a genePred (project it to the opposite strand). Useful
* when doing analysis that is simplified by having things on the same strand.
*/
{
int iExon;
gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+';
reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize);
reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize);
for (iExon = 0; iExon < gp->exonCount; iExon++)
reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize);
reverseUnsigned(gp->exonStarts, gp->exonCount);
reverseUnsigned(gp->exonEnds, gp->exonCount);
if (gp->optFields & genePredCdsStatFld)
{
enum cdsStatus hold = gp->cdsStartStat;
gp->cdsStartStat = gp->cdsEndStat;
gp->cdsEndStat = hold;
}
if (gp->optFields & genePredExonFramesFld)
reverseInts(gp->exonFrames, gp->exonCount);
}
int genePredCdsSize(struct genePred *gp)
/* compute the number of bases of CDS */
{
int iExon, start, end, cdsBases = 0;
for (iExon = 0; iExon < gp->exonCount; iExon++)
{
if (genePredCdsExon(gp, iExon, &start, &end))
cdsBases += (end - start);
}
return cdsBases;
}
struct genePred *genePredNew(char *name, char *chrom, char strand,
unsigned txStart, unsigned txEnd,
unsigned cdsStart, unsigned cdsEnd,
unsigned optFields, unsigned exonSpace)
/* create a new gene with space for the specified number of exons allocated.
* genePredGrow maybe used to expand this space if needed. */
{
struct genePred *gp;
AllocVar(gp);
assert(exonSpace > 0);
gp->name = cloneString(name);
gp->chrom = cloneString(chrom);
gp->strand[0] = strand;
gp->txStart = txStart;
gp->txEnd = txEnd;
gp->cdsStart = cdsStart;
gp->cdsEnd = cdsEnd;
gp->optFields = optFields;
AllocArray(gp->exonStarts, exonSpace);
AllocArray(gp->exonEnds, exonSpace);
if (gp->optFields & genePredExonFramesFld)
AllocArray(gp->exonFrames, exonSpace);
return gp;
}
void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr)
/* Increase memory allocated to a psl to hold more exons. exonSpacePtr
* should point the the current maximum number of exons and will be
* updated to with the new amount of space. */
{
int exonSpace = *exonSpacePtr;
int newSpace = 2 * exonSpace;
ExpandArray(gp->exonStarts, exonSpace, newSpace);
ExpandArray(gp->exonEnds, exonSpace, newSpace);
if (gp->optFields & genePredExonFramesFld)
ExpandArray(gp->exonFrames, exonSpace, newSpace);
*exonSpacePtr = newSpace;
}
struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly)
/* Convert genePred into a range tree. */
{
struct rbTree *rangeTree = rangeTreeNew();
int i;
for (i=0; i<gp->exonCount; ++i)
{
int start = gp->exonStarts[i];
int end = gp->exonEnds[i];
if (cdsOnly)
{
start = max(start, gp->cdsStart);
end = min(end, gp->cdsEnd);
if (start >= end)
continue;
}
rangeTreeAdd(rangeTree, start, end);
}
return rangeTree;
}
void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f,
char *type, int id, int minSize)
/* Write out part of gp as bed12. */
{
/* Figure out # of blocks and min/max of area inside start/end */
int blockCount = 0;
int newStart = gp->txEnd, newEnd = gp->txStart;
int size = 0;
int i;
for (i=0; i<gp->exonCount; ++i)
{
int exonStart = gp->exonStarts[i];
int exonEnd = gp->exonEnds[i];
exonStart = max(start, exonStart);
exonEnd = min(end, exonEnd);
if (exonStart < exonEnd)
{
++blockCount;
newStart = min(exonStart, newStart);
newEnd = max(exonEnd, newEnd);
size += exonEnd - exonStart;
}
}
/* Output first 10 fields of bed12. */
if (size > minSize)
{
fprintf(f, "%s\t%d\t%d\t", gp->chrom, newStart, newEnd);
fprintf(f, "%s_%d_%s\t", type, id, gp->name);
fprintf(f, "0\t%s\t0\t0\t0\t%d\t", gp->strand, blockCount);
/* Output blockSizes field */
for (i=0; i<gp->exonCount; ++i)
{
int exonStart = gp->exonStarts[i];
int exonEnd = gp->exonEnds[i];
exonStart = max(start, exonStart);
exonEnd = min(end, exonEnd);
if (exonStart < exonEnd)
fprintf(f, "%d,", exonEnd - exonStart);
}
fprintf(f, "\t");
/* Output chromStarts field */
for (i=0; i<gp->exonCount; ++i)
{
int exonStart = gp->exonStarts[i];
int exonEnd = gp->exonEnds[i];
exonStart = max(start, exonStart);
exonEnd = min(end, exonEnd);
if (exonStart < exonEnd)
fprintf(f, "%d,", exonStart - newStart);
}
fprintf(f, "\n");
}
}