48d92ef665025410a2ee23fac5c6d468fe7cb128 markd Wed Jan 26 10:52:34 2011 -0800 modified refSeqGet to return results in predictiable order to minimize test failures when database changes diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index 76c475b..f5ce54f 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -1,1793 +1,1805 @@ /* genePred.c was originally generated by the autoSql program, which also * generated genePred.h and genePred.sql. This module links the database and the RAM * representation of objects. */ #include "common.h" #include "gff.h" #include "jksql.h" #include "psl.h" #include "linefile.h" #include "genePred.h" #include "genbank.h" #include "rangeTree.h" #include "hdb.h" static char const rcsid[] = "$Id: genePred.c,v 1.103 2010/05/10 08:23:35 kent Exp $"; /* SQL to create a genePred table */ static char *createSql = "CREATE TABLE %s (" " %s" /* bin column goes here */ " name varchar(255) not null," /* mrna accession of gene */ " chrom varchar(255) not null," /* Chromosome name */ " strand char(1) not null," /* + or - for strand */ " txStart int unsigned not null," /* Transcription start position */ " txEnd int unsigned not null," /* Transcription end position */ " cdsStart int unsigned not null," /* Coding region start */ " cdsEnd int unsigned not null," /* Coding region end */ " exonCount int unsigned not null," /* Number of exons */ " exonStarts longblob not null," /* Exon start positions */ " exonEnds longblob not null," /* Exon end positions */ " INDEX(name)"; static char *binFieldSql = " bin smallint unsigned not null," " INDEX(chrom,bin),"; static char *noBinIndexSql = " INDEX(chrom,txStart),"; static char *scoreFieldSql = " ,score int"; static char *name2FieldSql = " ,name2 varchar(255) not null," /* Secondary name. (e.g. name of gene) or NULL if not available */ " INDEX(name2)"; static char *cdsStatFieldSql = " ,cdsStartStat enum('none', 'unk', 'incmpl', 'cmpl') not null," /* Status of cdsStart annotation */ " cdsEndStat enum('none', 'unk', 'incmpl', 'cmpl') not null"; /* Status of cdsEnd annotation */ static char *exonFramesFieldSql = " ,exonFrames longblob not null"; /* List of frame for each exon, or -1 if no frame or not known. NULL if not available. */ struct genePred *genePredLoad(char **row) /* Load a genePred from row fetched with select * from genePred * from database. Dispose of this with genePredFree(). */ { struct genePred *ret; int sizeOne; AllocVar(ret); ret->exonCount = sqlUnsigned(row[7]); ret->name = cloneString(row[0]); ret->chrom = cloneString(row[1]); strcpy(ret->strand, row[2]); ret->txStart = sqlUnsigned(row[3]); ret->txEnd = sqlUnsigned(row[4]); ret->cdsStart = sqlUnsigned(row[5]); ret->cdsEnd = sqlUnsigned(row[6]); sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); return ret; } struct genePred *genePredLoadAll(char *fileName) /* Load all genePred from a whitespace-separated file. * Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[10]; while (lineFileRow(lf, row)) { el = genePredLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genePred *genePredLoadAllByChar(char *fileName, char chopper) /* Load all genePred from a chopper separated file. * Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[10]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = genePredLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genePred *genePredCommaIn(char **pS, struct genePred *ret) /* Create a genePred out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new genePred */ { char *s = *pS; int i; if (ret == NULL) AllocVar(ret); ret->name = sqlStringComma(&s); ret->chrom = sqlStringComma(&s); sqlFixedStringComma(&s, ret->strand, sizeof(ret->strand)); ret->txStart = sqlUnsignedComma(&s); ret->txEnd = sqlUnsignedComma(&s); ret->cdsStart = sqlUnsignedComma(&s); ret->cdsEnd = sqlUnsignedComma(&s); ret->exonCount = sqlUnsignedComma(&s); s = sqlEatChar(s, '{'); AllocArray(ret->exonStarts, ret->exonCount); for (i=0; iexonCount; ++i) { ret->exonStarts[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); s = sqlEatChar(s, '{'); AllocArray(ret->exonEnds, ret->exonCount); for (i=0; iexonCount; ++i) { ret->exonEnds[i] = sqlUnsignedComma(&s); } s = sqlEatChar(s, '}'); s = sqlEatChar(s, ','); *pS = s; return ret; } void genePredFree(struct genePred **pEl) /* Free a single dynamically allocated genePred such as created * with genePredLoad(). */ { struct genePred *el; if ((el = *pEl) == NULL) return; freeMem(el->name); freeMem(el->chrom); freeMem(el->exonStarts); freeMem(el->exonEnds); freeMem(el->name2); freeMem(el->exonFrames); freez(pEl); } void genePredFreeList(struct genePred **pList) /* Free a list of dynamically allocated genePred's */ { struct genePred *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; genePredFree(&el); } *pList = NULL; } void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep) /* Print out genePred. Separate fields with sep. Follow last field with lastSep. */ { int i; if (sep == ',') fputc('"',f); fprintf(f, "%s", el->name); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->chrom); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->strand); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%u", el->txStart); fputc(sep,f); fprintf(f, "%u", el->txEnd); fputc(sep,f); fprintf(f, "%u", el->cdsStart); fputc(sep,f); fprintf(f, "%u", el->cdsEnd); fputc(sep,f); fprintf(f, "%u", el->exonCount); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iexonCount; ++i) { fprintf(f, "%u", el->exonStarts[i]); fputc(',', f); } if (sep == ',') fputc('}',f); fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iexonCount; ++i) { fprintf(f, "%u", el->exonEnds[i]); fputc(',', f); } if (sep == ',') fputc('}',f); /* optional fields, >= test is used so unspecified coumns can be filled in */ if (el->optFields >= genePredScoreFld) { fputc(sep,f); fprintf(f, "%d", el->score); } if (el->optFields >= genePredName2Fld) { fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", ((el->name2 != NULL) ? el->name2 : "")); if (sep == ',') fputc('"',f); } if (el->optFields >= genePredCdsStatFld) { fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", genePredCdsStatStr(el->cdsStartStat)); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", genePredCdsStatStr(el->cdsEndStat)); if (sep == ',') fputc('"',f); } if (el->optFields >= genePredExonFramesFld) { fputc(sep,f); if (sep == ',') fputc('{',f); for (i=0; iexonCount; ++i) { fprintf(f, "%d", el->exonFrames[i]); fputc(',', f); } if (sep == ',') fputc('}',f); } fputc(lastSep,f); } /* --------- Start of hand generated code. ---------------------------- */ char *genePredCdsStatStr(enum cdsStatus stat) /* get string value of a cdsStatus */ { switch (stat) { case cdsNone: return "none"; case cdsUnknown: return "unk"; case cdsIncomplete: return "incmpl"; case cdsComplete: return "cmpl"; default: errAbort("invalid cdsStatus enum value: %d", stat); return NULL; /* make compiler happy */ } } static enum cdsStatus parseCdsStat(char *statStr) /* parse a cdsStatus string */ { if ((statStr == NULL) || sameString(statStr, "none")) return cdsNone; if (sameString(statStr, "unk")) return cdsUnknown; if (sameString(statStr, "incmpl")) return cdsIncomplete; if (sameString(statStr, "cmpl")) return cdsComplete; errAbort("invalid genePred cdsStatus: \"%s\"", statStr); return cdsNone; /* make compiler happy */ } struct genePred *genePredExtLoad(char **row, int numCols) /* Load a genePred with from a row, with optional fields. The row must * contain columns in the order in the struct, and they must be present up to * the last specfied optional field. Missing intermediate fields must have * zero or empty columns, they may not be omitted. Fields at the end can be * omitted. Dispose of this with genePredFree(). */ { struct genePred *ret; int sizeOne, iCol; AllocVar(ret); ret->exonCount = sqlUnsigned(row[7]); ret->name = cloneString(row[0]); ret->chrom = cloneString(row[1]); strcpy(ret->strand, row[2]); ret->txStart = sqlUnsigned(row[3]); ret->txEnd = sqlUnsigned(row[4]); ret->cdsStart = sqlUnsigned(row[5]); ret->cdsEnd = sqlUnsigned(row[6]); sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); iCol=GENEPRED_NUM_COLS; if (iCol < numCols) { ret->score = sqlSigned(row[iCol++]); ret->optFields |= genePredScoreFld; } if (iCol < numCols) { ret->name2 = cloneString(row[iCol++]); ret->optFields |= genePredName2Fld; } if (iCol < numCols) { ret->cdsStartStat = parseCdsStat(row[iCol++]); ret->optFields |= genePredCdsStatFld; } if (iCol < numCols) { ret->cdsEndStat = parseCdsStat(row[iCol++]); ret->optFields |= genePredCdsStatFld; } if (iCol < numCols) { sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); ret->optFields |= genePredExonFramesFld; } return ret; } struct genePred *genePredExtLoadAll(char *fileName) /* Load all genePreds with from tab-separated file, possibly with optional * fields. Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[GENEPREDX_NUM_COLS]; int numCols; while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) { lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols); el = genePredExtLoad(row, numCols); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } int genePredCmp(const void *va, const void *vb) /* Compare to sort based on chromosome, txStart. */ { const struct genePred *a = *((struct genePred **)va); const struct genePred *b = *((struct genePred **)vb); -int dif; +int dif = strcmp(a->chrom, b->chrom); +if (dif == 0) + dif = a->txStart - b->txStart; +return dif; +} + +int genePredNameCmp(const void *va, const void *vb) +/* Compare to sort based on name, then chromosome, txStart. */ +{ +const struct genePred *a = *((struct genePred **)va); +const struct genePred *b = *((struct genePred **)vb); +int dif = strcmp(a->name, b->name); +if (dif == 0) dif = strcmp(a->chrom, b->chrom); if (dif == 0) dif = a->txStart - b->txStart; return dif; } static boolean haveStartStopCodons(struct gffFile *gff) /* For GFFs, determine if any of the annotations use start_codon or * stop_codon */ { struct gffFeature *feature; for(feature = gff->featureList; feature != NULL; feature = feature->next) { if(sameWord(feature->name, "start_codon") || sameWord(feature->name, "start_codon")) return TRUE; } return FALSE; } static boolean isExon(char *feat, boolean isGtf, char *exonSelectWord) /* determine if a feature is an exon; different criteria for GFF ane GTF */ { if (isGtf) { /* must check for stop_codon here, because GTF put it outside of the * CDS */ return (sameWord(feat, "CDS") || sameWord(feat, "stop_codon") || sameWord(feat, "start_codon") || sameWord(feat, "exon") || sameWord(feat, "utr") || sameWord(feat, "5utr") || sameWord(feat, "3utr")); } else { return ((exonSelectWord == NULL) || sameWord(feat, exonSelectWord)); } } static boolean isStartStopCodon(char *feat) /* determine if a feature is GTF style start/stop codon */ { return sameWord(feat, "stop_codon") || sameWord(feat, "start_codon"); } static boolean isCds(char *feat) /* determine if a feature is CDS */ { return sameWord(feat, "CDS") || isStartStopCodon(feat); } static void chkGroupLine(struct gffGroup *group, struct gffLine *gl, struct genePred *gp) /* check that a gffLine is consistent with the genePred being built. this * helps detect some problems that lead to corrupt genePreds */ { if (!(sameString(gl->seq, gp->chrom) && (gl->strand == gp->strand[0]))) { fprintf(stderr, "invalid gffGroup detected on line: "); gffTabOut(gl, stderr); errAbort("GFF/GTF group %s on %s%c, this line is on %s%c, all group members must be on same seq and strand", group->name, gp->chrom, gp->strand[0], gl->seq, gl->strand); } } static int phaseToFrame(char phase) /* convert GTF/GFF phase to frame */ { switch (phase) { case '0': return 0; case '1': return 2; case '2': return 1; } return -1; } #ifdef NOT_CURRENTLY_USED static char frameToPhase(int frame) /* convert GTF/GFF frame to phase */ { switch (frame) { case 0: return '0'; case 1: return '2'; case 2: return '1'; } return -1; } #endif static int incrFrame(int frame, int amt) /* increment frame, no-op if frame is -1 */ { if (frame < 0) return frame; else if (amt >= 0) return (frame+amt) % 3; else { int amt3 = (-amt) % 3; return (frame - (amt - amt3)) % 3; } } static int findPosExon(struct genePred *gp, int pos) /* find exon containing the specified position */ { int i; for (i = 0; i < gp->exonCount; i++) { if ((gp->exonStarts[i] <= pos) && (pos < gp->exonEnds[i])) return i; } errAbort("bug: position %d not found in exon of %s genePred", pos, gp->name); return 0; } static int findLastFramedExon(struct genePred *gp) /* locate the last exon with frame */ { int iStart = (gp->strand[0] == '+') ? 0 : gp->exonCount-1; int iStop = (gp->strand[0] == '+') ? gp->exonCount : -1; int dir = (gp->strand[0] == '+') ? +1 : -1; int iPrevExon = -1, iExon; // find beginning of frames for (iExon = iStart; (iExon != iStop) && (gp->exonFrames[iExon] == -1); iExon += dir) continue; // find last with frame for (; (iExon != iStop) && (gp->exonFrames[iExon] != -1); iExon += dir) iPrevExon = iExon; if (iPrevExon < 0) errAbort("bug: findLastFramedExon should have found an exon with frame"); return iPrevExon; } static void extendFramePos(struct genePred *gp) /* extend frame missing from last exon(s) for positive strand genes, normally * caused by GTF stop_codon */ { int iExon = findLastFramedExon(gp); int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); for (iExon++; (iExon < gp->exonCount) && (gp->exonStarts[iExon] < gp->cdsEnd); iExon++) { assert(gp->exonFrames[iExon] < 0); gp->exonFrames[iExon] = frame; frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); } } static void extendFrameNeg(struct genePred *gp) /* extend frame missing from last exon(s) for negative strand genes, normally * caused by GTF stop_codon */ { int iExon = findLastFramedExon(gp); int frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); for (iExon--; (iExon >= 0) && (gp->exonEnds[iExon] > gp->cdsStart); iExon--) { assert(gp->exonFrames[iExon] < 0); gp->exonFrames[iExon] = frame; frame = incrFrame(gp->exonFrames[iExon], (gp->exonEnds[iExon]-gp->exonStarts[iExon])); } } static void fixStopFrame(struct genePred *gp) /* Handle nasty corner case: GTF with the all or part of the stop codon as the * only codon in an exon, we must set the frame on these exons. Some ENSEMBL * GTFs have single base `exons' with just a base of the stop codon, so must * check every base. While less than ideal, we just extend the frame past the * last exon with frame, so a conflicting frame on the stop codon will be * missed. Just switching to GFF3 fixes this mess. */ { if (gp->strand[0] == '+') extendFramePos(gp); else extendFrameNeg(gp); } static boolean adjImpliedStopPos(struct genePred *gp) /* implied stop codon adjustment for positive strand gene */ { int iExon = findPosExon(gp, gp->cdsEnd-1); unsigned space = (gp->exonEnds[iExon]-gp->cdsEnd); if (space >= 3) { // room in current exon gp->cdsEnd += 3; return TRUE; } else if ((iExon < gp->exonCount) && ((gp->exonEnds[iExon+1]-gp->exonStarts[iExon+1])+space) >= 3) { // room including next exon gp->cdsEnd = gp->exonStarts[iExon+1] + (3-space); return TRUE; } return FALSE; } static boolean adjImpliedStopNeg(struct genePred *gp) /* implied stop codon adjustment for negative strand gene */ { int iExon = findPosExon(gp, gp->cdsStart); unsigned space = (gp->cdsStart-gp->exonStarts[iExon]); if (space >= 3) { // room in current exon gp->cdsStart -= 3; return TRUE; } else if ((iExon > 0) && ((gp->exonEnds[iExon-1]-gp->exonStarts[iExon-1])+space) >= 3) { // room including previous exon gp->cdsStart = gp->exonEnds[iExon-1] - (3-space); return TRUE; } return FALSE; } static void adjImpliedStopAfterCds(struct genePred *gp) /* adjust CDS bounds to include implied stop codon outside of CDS. */ { if (gp->strand[0] == '+') { if (adjImpliedStopPos(gp)) { if (gp->optFields & genePredCdsStatFld) gp->cdsEndStat = cdsComplete; } } else { if (adjImpliedStopNeg(gp)) { if (gp->optFields & genePredCdsStatFld) gp->cdsStartStat = cdsComplete; } } } static void checkForNoFrames(struct genePred *gp) /* check for requesting frame from a file without frames. */ { int i; /* Complain if we have a CDS but exonFrames are all -1: */ if (gp->cdsStart < gp->cdsEnd) { boolean foundReal = FALSE; for (i = 0; i < gp->exonCount; i++) { if (gp->exonFrames[i] >= 0 && gp->exonFrames[i] <= 2) { foundReal = TRUE; break; } } if (! foundReal) { errAbort("Error: exonFrames field is being added, but I found a " "gene (%s) with CDS but no valid frames. " "This can happen if program is invoked with -genePredExt " "but no valid frames are given in the file. If the 8th " "field of GFF/GTF file is always a placeholder, then don't use " "-genePredExt.", gp->name); } } } static boolean isGtfFeature(char *feat) /* check if a feature is one of the recognized features. All otherse * should be ignored. http://mblab.wustl.edu/GTF22.html */ { static char *gtfFeats[] = {"CDS", "start_codon", "stop_codon", "5UTR", "3UTR", "inter", "inter_CNS", "intron_CNS", "exon", NULL}; int i; for (i = 0; gtfFeats[i] != NULL; i++) { if (sameString(feat, gtfFeats[i])) return TRUE; } return FALSE; } static boolean ignoreGxfLine(struct gffLine *gl, boolean isGtf) /* should the specified line be skipped? */ { if (isGtf) return !isGtfFeature(gl->feature); else return FALSE; } static boolean allowedFrameFeature(struct gffLine *gl, boolean isGtf) /* should this feature be used to assign frame? */ { if (isStartStopCodon(gl->feature) || ignoreGxfLine(gl, isGtf)) return FALSE; // for GFF, any features with frame can be used to set frame, // with GTF, there are some bogus files that set frame on UTR features, // so only use CDS if (isGtf) return sameString(gl->feature, "CDS"); else return (gl->frame != '.'); } static struct gffLine *assignFrameForCdsExon(int start, int end, int *framePtr, struct gffLine *gl, boolean isGtf) /* set frame from any overlapping records with frame. Don't include * start/stop_codon, as it isn't a full exon. */ { /* skip lines preceeding this exon */ while (gl != NULL) { if (allowedFrameFeature(gl, isGtf) && (gl->end >= start)) break; gl = gl->next; } /* set frame from any overlapping records with frame. */ while ((gl != NULL) && (rangeIntersection(gl->start, gl->end, start, end) > 0)) { if (allowedFrameFeature(gl, isGtf)) { int frame = phaseToFrame(gl->frame); if (frame >= 0) *framePtr = frame; } gl = gl->next; } return gl; } static void assignFrame(boolean isGtf, struct gffGroup *group, struct genePred *gp) /* Assign frame from GFF after genePred has been built.*/ { struct gffLine *gl = group->lineList; int start, end, i; boolean haveFrame = FALSE; for (i = 0; i < gp->exonCount; i++) { if (genePredCdsExon(gp, i, &start, &end)) { gl = assignFrameForCdsExon(start, end, &(gp->exonFrames[i]), gl, isGtf); if (gp->exonFrames[i] >= 0) haveFrame = TRUE; } } /* make sure stop has frame if some exons in the gene had frame */ if (haveFrame) fixStopFrame(gp); checkForNoFrames(gp); } static struct genePred *mkFromGroupedGxf(struct gffFile *gff, struct gffGroup *group, char *name, boolean isGtf, char *exonSelectWord, unsigned optFields, unsigned options) /* common function to create genePreds from GFFs or GTFs. This is a little * ugly with to many check of isGtf, however the was way to much identical * code the other way. Options are from genePredFromGxfOpts */ { struct genePred *gp; int stopCodonStart = -1, stopCodonEnd = -1; int cdsStart = BIGNUM, cdsEnd = -BIGNUM; int exonCount = 0; boolean haveStartCodon = FALSE, haveStopCodon = FALSE; struct gffLine *gl; unsigned *eStarts, *eEnds; int i; /* should we count on start/stop codon annotation in GFFs? */ boolean useStartStops = isGtf || haveStartStopCodons(gff); /* Count up exons and figure out cdsStart and cdsEnd. */ for (gl = group->lineList; gl != NULL; gl = gl->next) { if (ignoreGxfLine(gl, isGtf)) continue; if (isExon(gl->feature, isGtf, exonSelectWord)) ++exonCount; if (isCds(gl->feature)) { if (gl->start < cdsStart) cdsStart = gl->start; if (gl->end > cdsEnd) cdsEnd = gl->end; } if (sameWord(gl->feature, "start_codon")) haveStartCodon = TRUE; if (sameWord(gl->feature, "stop_codon")) { /* stop_codon can be split, need bounds for adjusting CDS below */ if ((stopCodonStart < 0) || (gl->start < stopCodonStart)) stopCodonStart = gl->start; if ((stopCodonEnd < 0) || (gl->end > stopCodonEnd)) stopCodonEnd = gl->end; haveStopCodon = TRUE; } } if (exonCount == 0) return NULL; if (cdsStart > cdsEnd) { /* no cds annotated */ cdsStart = 0; cdsEnd = 0; } else if (stopCodonStart >= 0) { /* adjust CDS to include stop codon as in GTF */ if (group->strand == '+') { if (stopCodonEnd > cdsEnd) cdsEnd = stopCodonEnd; } else { if (stopCodonStart < cdsStart) cdsStart = stopCodonStart; } } /* Allocate genePred and fill in values. */ AllocVar(gp); gp->name = cloneString(name); gp->chrom = cloneString(group->seq); gp->strand[0] = group->strand; gp->txStart = group->start; gp->txEnd = group->end; if (cdsStart < cdsEnd) { gp->cdsStart = cdsStart; gp->cdsEnd = cdsEnd; } else { // no CDS, set to txEnd gp->cdsStart = gp->txEnd; gp->cdsEnd = gp->txEnd; } gp->exonStarts = AllocArray(eStarts, exonCount); gp->exonEnds = AllocArray(eEnds, exonCount); gp->optFields = optFields; if (optFields & genePredName2Fld) { if (options & genePredGxfGeneNameAsName2) { if (group->lineList->geneName != NULL) gp->name2 = cloneString(group->lineList->geneName); } else { if (group->lineList->geneId != NULL) gp->name2 = cloneString(group->lineList->geneId); } if (gp->name2 == NULL) gp->name2 = cloneString(""); } if (optFields & genePredCdsStatFld) { if (cdsStart < cdsEnd) { if (!useStartStops) { /* GFF doesn't require start or stop, if not used, assume complete */ gp->cdsStartStat = cdsComplete; gp->cdsEndStat = cdsComplete; } else if (group->strand == '+') { gp->cdsStartStat = (haveStartCodon ? cdsComplete : cdsIncomplete); gp->cdsEndStat = (haveStopCodon ? cdsComplete : cdsIncomplete);; } else { gp->cdsEndStat = (haveStartCodon ? cdsComplete : cdsIncomplete); gp->cdsStartStat = (haveStopCodon ? cdsComplete : cdsIncomplete);; } } else { gp->cdsStartStat = cdsNone; gp->cdsEndStat = cdsNone; } } if (optFields & genePredExonFramesFld) { AllocArray(gp->exonFrames, exonCount); for (i = 0; i < exonCount; i++) gp->exonFrames[i] = -1; } /* adjust tx range to include stop codon */ if ((group->strand == '+') && (gp->txEnd == stopCodonStart)) gp->txEnd = stopCodonEnd; else if ((group->strand == '-') && (gp->txStart == stopCodonEnd)) gp->txStart = stopCodonStart; i = -1; /* before first exon */ /* fill in exons, merging overlaping and adjacent exons */ for (gl = group->lineList; gl != NULL; gl = gl->next) { if (ignoreGxfLine(gl, isGtf)) continue; if (isExon(gl->feature, isGtf, exonSelectWord) || (isGtf && isCds(gl->feature))) { chkGroupLine(group, gl, gp); if ((i < 0) || (gl->start > eEnds[i])) { /* start a new exon */ ++i; assert(i < exonCount); eStarts[i] = gl->start; eEnds[i] = gl->end; } else { /* overlap, extend exon, picking the largest of ends */ assert(i < exonCount); assert(gl->start >= eStarts[i]); if (gl->end > eEnds[i]) eEnds[i] = gl->end; } } } gp->exonCount = i+1; /* adjust for flybase type of GFFs, with stop codon implied and outside of * CDS. */ if ((options & genePredGxfImpliedStopAfterCds) && !haveStopCodon) adjImpliedStopAfterCds(gp); if (optFields & genePredExonFramesFld) assignFrame(isGtf, group, gp); return gp; } struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, char *name, char *exonSelectWord, unsigned optFields, unsigned options) /* Convert gff->groupList to genePred list. See genePred.h for details. */ { struct gffLine *gl; boolean anyExon = FALSE; /* Look to see if any exons. If not allow CDS to be used instead. */ if (exonSelectWord) { for (gl = group->lineList; gl != NULL; gl = gl->next) { if (sameWord(gl->feature, exonSelectWord)) { anyExon = TRUE; break; } } } else anyExon = TRUE; if (!anyExon) exonSelectWord = "CDS"; return mkFromGroupedGxf(gff, group, name, FALSE, exonSelectWord, optFields, options); } struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name, unsigned optFields, unsigned options) /* Convert gff->groupList to genePred list, using GTF feature conventions. * See genePred.h for details. */ { return mkFromGroupedGxf(gff, group, name, TRUE, NULL, optFields, options); } static void mapCdsToGenome(struct psl *psl, struct genbankCds* cds, struct genePred* gene) /* Convert set cdsStart/end from mrna to genomic coordinates. */ { struct genbankCds genomeCds = genbankCdsToGenome(cds, psl); if (genomeCds.start < genomeCds.end) { gene->cdsStart = genomeCds.start; gene->cdsEnd = genomeCds.end; if (gene->optFields & genePredCdsStatFld) { if (gene->strand[0] == '+') { gene->cdsStartStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete; gene->cdsEndStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;; } else { gene->cdsStartStat = (genomeCds.endComplete) ? cdsComplete : cdsIncomplete;; gene->cdsEndStat = (genomeCds.startComplete) ? cdsComplete : cdsIncomplete; } } } else { /* CDS not aligned */ gene->cdsStart = gene->txEnd; gene->cdsEnd = gene->txEnd; if (gene->optFields & genePredCdsStatFld) { gene->cdsStartStat = cdsUnknown; gene->cdsEndStat = cdsUnknown; } } } void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, struct genePred *gene) /* Convert cdsStart/End from mrna to genomic coordinates. * Note that the genePred blocks need not be filled in before * this call. */ { if (cds == NULL) { /* no CDS, set to zero-length at end */ gene->cdsStart = psl->tEnd; gene->cdsEnd = psl->tEnd; if (gene->optFields & genePredCdsStatFld) { gene->cdsStartStat = cdsNone; gene->cdsEndStat = cdsNone; } } else if ((cds->start < 0) || (cds->end <= 0)) { /* unknown CDS, set to end */ gene->cdsStart = psl->tEnd; gene->cdsEnd = psl->tEnd; if (gene->optFields & genePredCdsStatFld) { gene->cdsStartStat = cdsUnknown; gene->cdsEndStat = cdsUnknown; } } else { /* have CDS annotation, make sure it's in bounds */ struct genbankCds adjCds = *cds; if (adjCds.end > psl->qSize) { adjCds.end = psl->qSize; adjCds.endComplete = FALSE; } mapCdsToGenome(psl, &adjCds, gene); } } static int getFrame(struct psl *psl, int start, int end, struct genbankCds* cds) /* get the starting frame for an exon of a mRNA. start and end are * the bounds of the exon within the mRNA. This may cover multiple psl * blocks due to merging of small gaps. */ { /* use the 3' end is used if it's complete, as it is more often accurate when * genes are defined from mRNAs sequenced with reverse-transcriptase. */ int frame = -1; /* map to mRNA coords in CDS since frame for an exon is in direction of * transcription. */ if (psl->strand[0] == '-') reverseIntRange(&start, &end, psl->qSize); if (start < cds->start) start = cds->start; if (end > cds->end) end = cds->end; if (start < end) { /* compute from end if it is complete in mRNA */ if (cds->endComplete) { int fr = (cds->end-start) % 3; frame = (fr == 2) ? 1 : ((fr == 1) ? 2 : 0); } else frame = (start-cds->start) % 3; } return frame; } static boolean shouldMergeBlocks(struct genePred *gene, int iExon, unsigned tStart, unsigned options, int cdsMergeSize, int utrMergeSize) /* determine if a new block starting at tStart whould be merged with * the preceeding exon, indicated by iExon. */ { /* nothing to check if no exons have been added */ if (iExon >= 0) { boolean inCds = (gene->cdsStart <= tStart) && (tStart < gene->cdsEnd); int gapSize = (tStart - gene->exonEnds[iExon]); if (inCds) { if ((options & genePredPslCdsMod3) && ((gapSize % 3) != 0)) return FALSE; /* not a multiple of three */ if ((cdsMergeSize >= 0) && (gapSize <= cdsMergeSize)) return TRUE; } else { if ((utrMergeSize >= 0) && (gapSize <= utrMergeSize)) return TRUE; } } return FALSE; /* don't merge */ } static void pslToExons(struct psl *psl, struct genePred *gene, struct genbankCds* cds, unsigned options, int cdsMergeSize, int utrMergeSize) /* Convert psl alignment blocks to genePred exons, merging together blocks by * the specified paraemeters. Optionally add frame information. */ { int iBlk, iExon; int startIdx, stopIdx, idxIncr; /* this is bigger than needed if blocks merge */ gene->exonStarts = needMem(psl->blockCount*sizeof(unsigned)); gene->exonEnds = needMem(psl->blockCount*sizeof(unsigned)); if (gene->optFields & genePredExonFramesFld) { /* default frames to indicate no frame */ gene->exonFrames = needMem(psl->blockCount*sizeof(unsigned)); for (iExon = 0; iExon < psl->blockCount; iExon++) gene->exonFrames[iExon] = -1; } /* traverse psl in postive target order */ if (psl->strand[1] == '-') { startIdx = psl->blockCount-1; stopIdx = -1; idxIncr = -1; } else { startIdx = 0; stopIdx = psl->blockCount; idxIncr = 1; } iExon = -1; /* indicate none have been added */ for (iBlk = startIdx; iBlk != stopIdx; iBlk += idxIncr) { int tStart = psl->tStarts[iBlk]; int tEnd = tStart + psl->blockSizes[iBlk]; if (psl->strand[1] == '-') reverseIntRange(&tStart, &tEnd, psl->tSize); if (!shouldMergeBlocks(gene, iExon, tStart, options, cdsMergeSize, utrMergeSize)) { /* new exon */ iExon++; gene->exonStarts[iExon] = tStart; } gene->exonEnds[iExon] = tEnd; /* Set the frame. We have to deal with multiple blocks being merged. For * positive strand, the first block frame is used, for negative strand, * the last is used. */ if ((gene->optFields & genePredExonFramesFld) && (cds != NULL) && (((psl->strand[0] == '+') && (gene->exonFrames[iExon] < 0)) || (psl->strand[0] == '-'))) { int fr = getFrame(psl, psl->qStarts[iBlk], psl->qStarts[iBlk]+psl->blockSizes[iBlk], cds); if (fr >= 0) gene->exonFrames[iExon] = fr; } } gene->exonCount = iExon+1; assert(gene->exonCount <= psl->blockCount); } struct genePred *genePredFromPsl3(struct psl *psl, struct genbankCds* cds, unsigned optFields, unsigned options, int cdsMergeSize, int utrMergeSize) /* Convert a PSL of an mRNA alignment to a genePred, converting a genbank CDS * specification string to genomic coordinates. Small genomic inserts are * merged based on the mergeSize parameters. Gaps no larger than the * specified merge sizes result in the adjacent blocks being merged into a * single exon. Gaps in CDS use cdsMergeSize, in UTR use utrMergeSize. If * the genePredPslCdsMod3 option is specified, then CDS gaps are only merged * if a multiple of three. A negative merge sizes disables merging of blocks. * This differs from specifying zero in that adjacent blocks will not be * merged. The optfields field is a set from genePredFields, indicated what * fields to create. Zero-length CDS, or null cds, creates without CDS * annotation. If cds is null, it will set status fields to cdsNone. */ { struct genePred *gene; #if 0 /* FIXME; the browser calls this on a protein psl when rendering the amino * acids of a protein alignment track. Some of this code doesn't really make * sense for proteins and I really don't see how this produces the right * results, however the check is disabled until this can be investigated * because it *seems* to work. markd 2006-05-22 */ if (pslIsProtein(psl)) errAbort("can't convert protein psls to genePreds"); #endif AllocVar(gene); gene->name = cloneString(psl->qName); gene->chrom = cloneString(psl->tName); gene->txStart = psl->tStart; gene->txEnd = psl->tEnd; gene->optFields = optFields; /* get strand in genome that the positive version mRNA aligns to */ if (psl->strand[1] == '\0') { /* assumed pos target strand, so neg query would be pos target */ gene->strand[0] = psl->strand[0]; } else { /* query and target strand are different; will be neg when query pos */ gene->strand[0] = ((psl->strand[0] != psl->strand[1]) ? '-' : '+'); } genePredAddGenbankCds(psl, cds, gene); pslToExons(psl, gene, cds, options, cdsMergeSize, utrMergeSize); return gene; } struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields, struct genbankCds* cds, int insertMergeSize) /* Compatibility function, genePredFromPsl3 is prefered. See that function's * documentation for details. This calls genePredFromPsl3 with no options * and insertMergeSize set for CDS and UTR. */ { return genePredFromPsl3(psl, cds, optFields, genePredPslDefaults, insertMergeSize, insertMergeSize); } struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd, int insertMergeSize) /* Compatibility function, genePredFromPsl3 is prefered. See that function's * documentation for details. This calls genePredFromPsl3 with no options. */ { struct genbankCds cds; ZeroVar(&cds); cds.start = cdsStart; cds.end = cdsEnd; return genePredFromPsl3(psl, &cds, 0, genePredPslDefaults, insertMergeSize, insertMergeSize); } char* genePredGetCreateSql(char* table, unsigned optFields, unsigned options, int chromIndexLen) /* Get SQL required to create a genePred table. optFields is a bit set * consisting of the genePredFields values. Options are a bit set of * genePredCreateOpts. Returned string should be freed. This will create all * optional fields that preceed the highest optFields column. chromIndexLen * is now ignored.. */ { /* the >= is used so that we create preceeding fields. */ char sqlCmd[1024]; safef(sqlCmd, sizeof(sqlCmd), createSql, table, ((options & genePredWithBin) ? binFieldSql : noBinIndexSql)); if (optFields >= genePredScoreFld) safecat(sqlCmd, sizeof(sqlCmd), scoreFieldSql); if (optFields >= genePredName2Fld) safecat(sqlCmd, sizeof(sqlCmd), name2FieldSql); if (optFields >= genePredCdsStatFld) safecat(sqlCmd, sizeof(sqlCmd), cdsStatFieldSql); if (optFields >= genePredExonFramesFld) safecat(sqlCmd, sizeof(sqlCmd), exonFramesFieldSql); safecat(sqlCmd, sizeof(sqlCmd), ")"); return cloneString(sqlCmd); } // FIXME: this really doesn't belong in this module struct genePred *getOverlappingGene(char *db, struct genePred **list, char *table, char *chrom, int cStart, int cEnd, char *name, int *retOverlap) { /* read all genes from a table find the gene with the biggest overlap. Cache the list of genes to so we only read it once */ char query[256]; struct genePred *gene; struct sqlConnection *conn; struct sqlResult *sr; char **row; struct genePred *el = NULL, *bestMatch = NULL, *gp = NULL; int overlap = 0 , bestOverlap = 0, i; struct psl *psl; int *eFrames; if (*list == NULL) { printf("Loading Predictions from %s\n",table); AllocVar(*list); conn = hAllocConn(db); AllocVar(gene); safef(query, sizeof(query), "select * from %s", table); sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { if (!sameString(table,"all_mrna")) { el = genePredLoad(row); } else { psl = pslLoad(row); el = genePredFromPsl2(psl, 0, NULL, genePredStdInsertMergeSize); } slAddHead(list, el); } slReverse(list); sqlFreeResult(&sr); hFreeConn(&conn); } for (el = *list; el != NULL; el = el->next) { if (chrom != NULL && el->chrom != NULL) { overlap = 0; if ( sameString(chrom, el->chrom)) { for (i = 0 ; i<(el->exonCount); i++) { overlap += positiveRangeIntersection(cStart,cEnd, el->exonStarts[i], el->exonEnds[i]) ; } if (overlap > 20 && sameString(name, el->name)) { bestMatch = el; bestOverlap = overlap; *retOverlap = bestOverlap; } if (overlap > bestOverlap) { bestMatch = el; bestOverlap = overlap; *retOverlap = bestOverlap; } } } } if (bestMatch != NULL) { /* Allocate genePred and fill in values. */ AllocVar(gp); gp->name = cloneString(bestMatch->name); gp->chrom = cloneString(bestMatch->chrom); gp->strand[1] = bestMatch->strand[1]; gp->strand[0] = bestMatch->strand[0]; gp->txStart = bestMatch->txStart; gp->txEnd = bestMatch->txEnd; gp->cdsStart = bestMatch->cdsStart; gp->cdsEnd = bestMatch->cdsEnd; gp->exonCount = bestMatch->exonCount; AllocArray(gp->exonStarts, bestMatch->exonCount); AllocArray(gp->exonEnds, bestMatch->exonCount); for (i=0; iexonCount; ++i) { gp->exonStarts[i] = bestMatch->exonStarts[i] ; gp->exonEnds[i] = bestMatch->exonEnds[i] ; } gp->optFields = bestMatch->optFields; gp->score = bestMatch->score; if (bestMatch->optFields & genePredName2Fld) gp->name2 = cloneString(bestMatch->name2); else gp->name2 = NULL; if (bestMatch->optFields & genePredCdsStatFld) { gp->cdsStartStat = bestMatch->cdsStartStat; gp->cdsEndStat = bestMatch->cdsEndStat; } if (bestMatch->optFields & genePredExonFramesFld) { gp->exonFrames = AllocArray(eFrames, bestMatch->exonCount); for (i = 0; i < bestMatch->exonCount; i++) gp->exonFrames[i] = bestMatch->exonFrames[i]; } eFrames = gp->exonFrames; } return gp; } int genePredBases(struct genePred *gp) /* count coding and utr bases in a gene prediction */ { int count = 0, i; if (gp == NULL) return 0; for (i=0; iexonCount; i++) { count += gp->exonEnds[i] - gp->exonStarts[i] ; } return count; } int genePredCodingBases(struct genePred *gp) /* Count up the number of coding bases in gene prediction. */ { int i, exonCount = gp->exonCount; int cdsStart = gp->cdsStart, cdsEnd = gp->cdsEnd; int baseCount = 0; for (i=0; iexonStarts[i], gp->exonEnds[i]); } return baseCount; } boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr) /* Get the CDS range in an exon. If there is no CDS, return FALSE and then * set start == end */ { assert((iExon >= 0) && (iExon < gp->exonCount)); int start = gp->exonStarts[iExon]; int end = gp->exonEnds[iExon]; if (start < gp->cdsStart) start = gp->cdsStart; if (end > gp->cdsEnd) end = gp->cdsEnd; if (startPtr != NULL) *startPtr = start; if (endPtr != NULL) *endPtr = end; return (start < end); } /* state for genePredCheck */ static int gpErrorCnt = 0; /* error count for gpError */ static FILE *gpErrFh = NULL; /* error output for gpError */ static void gpError(char *format, ...) /* print and count an error */ { va_list args; fprintf(gpErrFh, "Error: "); va_start(args, format); vfprintf(gpErrFh, format, args); va_end(args); fputc('\n', gpErrFh); gpErrorCnt++; } static void checkExon(char *desc, struct genePred* gp, int iExon) /* check one exon of a genePred */ { unsigned exonStart = gp->exonStarts[iExon]; unsigned exonEnd = gp->exonEnds[iExon]; if (exonStart >= exonEnd) gpError("%s: %s exon %u start %u >= end %u", desc, gp->name, iExon, exonStart, exonEnd); if (exonStart < gp->txStart) gpError("%s: %s exon %u start %u < txStart %u", desc, gp->name, iExon, exonStart, gp->txStart); if (exonEnd > gp->txEnd) gpError("%s: %s exon %u end %u > txEnd %u", desc, gp->name, iExon, exonEnd, gp->txEnd); if (iExon > 0) { /* other than first exon */ unsigned prevExonEnd = gp->exonEnds[iExon-1]; if (exonStart < prevExonEnd) gpError("%s: %s exon %u overlaps previous exon", desc, gp->name, iExon); } if (gp->optFields & genePredExonFramesFld) { int frame = gp->exonFrames[iExon]; if ((frame < -1) || (frame > 2)) gpError("%s: %s invalid exonFrame: %d", desc, gp->name, frame); if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd)) { if (frame == -1) gpError("%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon); } else { if (frame != -1) gpError("%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon); } } } static void checkCdsBounds(char *desc, FILE* out, struct genePred* gp) /* check cdsStart/cdsEnd */ { int iExon; boolean foundCdsStart = FALSE, foundCdsEnd = FALSE; if (gp->cdsStart > gp->cdsEnd) gpError("%s: %s cdsStart %u > cdsEnd %u", desc, gp->name, gp->cdsStart, gp->cdsEnd); if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd)) gpError("%s: %s cdsStart %u not in tx bounds %u-%u", desc, gp->name, gp->cdsStart, gp->txStart, gp->txEnd); if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd)) gpError("%s: %s cdsEnd %u not in tx bounds %u-%u", desc, gp->name, gp->cdsEnd, gp->txStart, gp->txEnd); /* is cdsStart/cdsEnd in an exon? */ for (iExon = 0; iExon < gp->exonCount; iExon++) { if ((gp->cdsStart >= gp->exonStarts[iExon]) && (gp->cdsStart < gp->exonEnds[iExon])) foundCdsStart = TRUE; if ((gp->cdsEnd > gp->exonStarts[iExon]) && (gp->cdsEnd <= gp->exonEnds[iExon])) foundCdsEnd = TRUE; } if (!foundCdsStart) gpError("%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart); if (!foundCdsEnd) gpError("%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd); } int genePredCheck(char *desc, FILE* out, int chromSize, struct genePred* gp) /* Validate a genePred for consistency. desc is printed the error messages * to file out (open /dev/null to discard). chromSize should contain * size of chromosome, or 0 if chrom is not valid, or -1 to not check * chromosome bounds. Returns count of errors. */ { int iExon; gpErrorCnt = 0; gpErrFh = out; if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-"))) gpError("%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand); /* check chrom */ if (chromSize == 0) gpError("%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom); else if (chromSize > 0) { if (gp->txEnd > chromSize) gpError("%s: %s txEnd %u >= chromSize %u", desc, gp->name, gp->txEnd, chromSize); } /* check internal consistency */ if (gp->txStart >= gp->txEnd) gpError("%s: %s txStart %u >= txEnd %u", desc, gp->name, gp->txStart, gp->txEnd); /* no CDS is indicated by cdsStart == cdsEnd */ if (gp->cdsStart != gp->cdsEnd) checkCdsBounds(desc, out, gp); /* make sure first/last exons match tx range */ if (gp->txStart != gp->exonStarts[0]) gpError("%s: %s first exon start %u doesn't match txStart %u", desc, gp->name, gp->exonStarts[0], gp->txStart); if (gp->txEnd != gp->exonEnds[gp->exonCount-1]) gpError("%s: %s last exon end %u doesn't match txEnd %u", desc, gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd); /* check each exon */ for (iExon = 0; iExon < gp->exonCount; iExon++) checkExon(desc, gp, iExon); return gpErrorCnt; } boolean genePredNmdTarget(struct genePred *gp) /* Return TRUE if cds end is more than 50bp upstream of last intron. */ { int gpSize = 0; int i = 0; int startDist = 0, endDist = 0; int blockSize = 0; if(gp->exonCount < 2) return FALSE; for(i = 0; i < gp->exonCount; i++) { int blockStart = gp->exonStarts[i]; int blockEnd = gp->exonEnds[i]; blockSize = blockEnd - blockStart; if( blockStart <= gp->cdsStart && gp->cdsStart <= blockEnd) startDist += blockSize - (blockEnd - gp->cdsStart); else if(blockEnd <= gp->cdsStart) startDist += blockSize; if( blockStart <= gp->cdsEnd && gp->cdsEnd <= blockEnd) endDist += blockSize - (blockEnd - gp->cdsEnd); else if(blockEnd <= gp->cdsEnd) endDist += blockSize; gpSize += blockSize; } /* xxxxxXXX----XXXXXXXXXXXXX--------XXXXXXX----XXXxxxxxxxx-------xxxxxxxxx */ if(sameString(gp->strand, "+")) { blockSize = gp->exonEnds[gp->exonCount-1] - gp->exonStarts[gp->exonCount-1]; return endDist < (gpSize - blockSize - 50); } else if(sameString(gp->strand, "-")) { blockSize = gp->exonEnds[0] - gp->exonStarts[0]; return startDist > (gp->exonEnds[1] + 50); } else errAbort("genePredNmdsTarget() - Don't recognize strand: %s\n", gp->strand); return FALSE; } void genePredAddExonFrames(struct genePred *gp) /* Add exonFrames array to a genePred that doesn't have it. Frame is assumed * to be contiguous. */ { int iExon, start, end, iBase = 0; int iStart, iEnd, iDir; /* initial array to all -1, for no frame */ AllocArray(gp->exonFrames, gp->exonCount); gp->optFields |= genePredExonFramesFld; for (iExon = 0; iExon < gp->exonCount; iExon++) gp->exonFrames[iExon] = -1; if (sameString(gp->strand, "+")) { iStart = 0; iEnd = gp->exonCount; iDir = 1; } else { iStart = gp->exonCount-1; iEnd = -1; iDir = -1; } for (iExon = iStart; iExon != iEnd; iExon += iDir) { if (genePredCdsExon(gp, iExon, &start, &end)) { gp->exonFrames[iExon] = iBase % 3; iBase += (end - start); } } } void genePredRc(struct genePred *gp, int chromSize) /* Reverse complement a genePred (project it to the opposite strand). Useful * when doing analysis that is simplified by having things on the same strand. */ { int iExon; gp->strand[0] = (gp->strand[0] == '+') ? '-' : '+'; reverseUnsignedRange(&gp->txStart, &gp->txEnd, chromSize); reverseUnsignedRange(&gp->cdsStart, &gp->cdsEnd, chromSize); for (iExon = 0; iExon < gp->exonCount; iExon++) reverseUnsignedRange(&gp->exonStarts[iExon], &gp->exonEnds[iExon], chromSize); reverseUnsigned(gp->exonStarts, gp->exonCount); reverseUnsigned(gp->exonEnds, gp->exonCount); if (gp->optFields & genePredCdsStatFld) { enum cdsStatus hold = gp->cdsStartStat; gp->cdsStartStat = gp->cdsEndStat; gp->cdsEndStat = hold; } if (gp->optFields & genePredExonFramesFld) reverseInts(gp->exonFrames, gp->exonCount); } int genePredCdsSize(struct genePred *gp) /* compute the number of bases of CDS */ { int iExon, start, end, cdsBases = 0; for (iExon = 0; iExon < gp->exonCount; iExon++) { if (genePredCdsExon(gp, iExon, &start, &end)) cdsBases += (end - start); } return cdsBases; } struct genePred *genePredNew(char *name, char *chrom, char strand, unsigned txStart, unsigned txEnd, unsigned cdsStart, unsigned cdsEnd, unsigned optFields, unsigned exonSpace) /* create a new gene with space for the specified number of exons allocated. * genePredGrow maybe used to expand this space if needed. */ { struct genePred *gp; AllocVar(gp); assert(exonSpace > 0); gp->name = cloneString(name); gp->chrom = cloneString(chrom); gp->strand[0] = strand; gp->txStart = txStart; gp->txEnd = txEnd; gp->cdsStart = cdsStart; gp->cdsEnd = cdsEnd; gp->optFields = optFields; AllocArray(gp->exonStarts, exonSpace); AllocArray(gp->exonEnds, exonSpace); if (gp->optFields & genePredExonFramesFld) AllocArray(gp->exonFrames, exonSpace); return gp; } void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr) /* Increase memory allocated to a psl to hold more exons. exonSpacePtr * should point the the current maximum number of exons and will be * updated to with the new amount of space. */ { int exonSpace = *exonSpacePtr; int newSpace = 2 * exonSpace; ExpandArray(gp->exonStarts, exonSpace, newSpace); ExpandArray(gp->exonEnds, exonSpace, newSpace); if (gp->optFields & genePredExonFramesFld) ExpandArray(gp->exonFrames, exonSpace, newSpace); *exonSpacePtr = newSpace; } struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly) /* Convert genePred into a range tree. */ { struct rbTree *rangeTree = rangeTreeNew(); int i; for (i=0; iexonCount; ++i) { int start = gp->exonStarts[i]; int end = gp->exonEnds[i]; if (cdsOnly) { start = max(start, gp->cdsStart); end = min(end, gp->cdsEnd); if (start >= end) continue; } rangeTreeAdd(rangeTree, start, end); } return rangeTree; } void gpPartOutAsBed(struct genePred *gp, int start, int end, FILE *f, char *type, int id, int minSize) /* Write out part of gp as bed12. */ { /* Figure out # of blocks and min/max of area inside start/end */ int blockCount = 0; int newStart = gp->txEnd, newEnd = gp->txStart; int size = 0; int i; for (i=0; iexonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; exonStart = max(start, exonStart); exonEnd = min(end, exonEnd); if (exonStart < exonEnd) { ++blockCount; newStart = min(exonStart, newStart); newEnd = max(exonEnd, newEnd); size += exonEnd - exonStart; } } /* Output first 10 fields of bed12. */ if (size > minSize) { fprintf(f, "%s\t%d\t%d\t", gp->chrom, newStart, newEnd); fprintf(f, "%s_%d_%s\t", type, id, gp->name); fprintf(f, "0\t%s\t0\t0\t0\t%d\t", gp->strand, blockCount); /* Output blockSizes field */ for (i=0; iexonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; exonStart = max(start, exonStart); exonEnd = min(end, exonEnd); if (exonStart < exonEnd) fprintf(f, "%d,", exonEnd - exonStart); } fprintf(f, "\t"); /* Output chromStarts field */ for (i=0; iexonCount; ++i) { int exonStart = gp->exonStarts[i]; int exonEnd = gp->exonEnds[i]; exonStart = max(start, exonStart); exonEnd = min(end, exonEnd); if (exonStart < exonEnd) fprintf(f, "%d,", exonStart - newStart); } fprintf(f, "\n"); } }