be8645fb43ba545dc342deb80cff297c5b677a5e braney Tue Sep 6 11:11:15 2016 -0700 allow bigGenePred to be used to optimize knownGene on hgTracks #15259 diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index a00c210..6165e05 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -289,30 +289,57 @@ /* parse a cdsStatus string */ { if ((statStr == NULL) || sameString(statStr, "none")) return cdsNone; if (sameString(statStr, "unk")) return cdsUnknown; if (sameString(statStr, "incmpl")) return cdsIncomplete; if (sameString(statStr, "cmpl")) return cdsComplete; errAbort("invalid genePred cdsStatus: \"%s\"", statStr); return cdsNone; /* make compiler happy */ } +struct genePred *genePredKnownLoad(char **row, int numCols) +/* Load a genePred from a row in knownGene format. Columns 0-9 supply name, chrom, strand, txStart, txEnd, cdsStart, cdsEnd, exonCount, exonStarts and exonEnds; column 10 is skipped and column 11 is stored as name2. numCols is not examined here, so the caller must supply at least 12 columns. Aborts with errAbort() if the exonStarts or exonEnds list length differs from exonCount. */ +{ +struct genePred *ret; +int sizeOne; + +AllocVar(ret); +ret->exonCount = sqlUnsigned(row[7]); +ret->name = cloneString(row[0]); +ret->chrom = cloneString(row[1]); +strcpy(ret->strand, row[2]); +ret->txStart = sqlUnsigned(row[3]); +ret->txEnd = sqlUnsigned(row[4]); +ret->cdsStart = sqlUnsigned(row[5]); +ret->cdsEnd = sqlUnsigned(row[6]); +sqlUnsignedDynamicArray(row[8], &ret->exonStarts, &sizeOne); +if (sizeOne != ret->exonCount) + errAbort("genePred: %s number of exonStarts (%d) != number of exons (%d)", + ret->name, sizeOne, ret->exonCount); +sqlUnsignedDynamicArray(row[9], &ret->exonEnds, &sizeOne); +if (sizeOne != ret->exonCount) + errAbort("genePred: %s number of exonEnds (%d) != number of exons (%d)", + ret->name, sizeOne, ret->exonCount); + +ret->name2 = cloneString(row[11]); +return ret; +} struct genePred *genePredExtLoad(char **row, int numCols) /* Load a genePred from a row, with optional fields. The row must * contain columns in the order in the struct, and they must be present up to * the last specified optional field. Missing intermediate fields must have * zero or empty columns, they may not be omitted. Fields at the end can be * omitted. 
Dispose of this with genePredFree(). */ { struct genePred *ret; int sizeOne, iCol; AllocVar(ret); ret->exonCount = sqlUnsigned(row[7]); ret->name = cloneString(row[0]); ret->chrom = cloneString(row[1]); strcpy(ret->strand, row[2]); @@ -351,30 +378,49 @@ { ret->cdsEndStat = parseCdsStat(row[iCol++]); ret->optFields |= genePredCdsStatFld; } if (iCol < numCols) { sqlSignedDynamicArray(row[iCol++], &ret->exonFrames, &sizeOne); if (sizeOne != ret->exonCount) errAbort("genePred: %s number of exonFrames (%d) != number of exons (%d)", ret->name, sizeOne, ret->exonCount); ret->optFields |= genePredExonFramesFld; } return ret; } +struct genePred *genePredKnownLoadAll(char *fileName) +/* Load all genePreds from a tab-separated file in knownGene format. Every line must have at least 12 columns, because genePredKnownLoad() reads row[11]; shorter lines are rejected by lineFileExpectAtLeast() instead of being parsed with an unset column pointer. The list is returned in file order. */ +{ +struct genePred *list = NULL, *el; +struct lineFile *lf = lineFileOpen(fileName, TRUE); +char *row[GENEPREDX_NUM_COLS]; +int numCols; + +while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) + { + lineFileExpectAtLeast(lf, 12, numCols); + el = genePredKnownLoad(row, numCols); + slAddHead(&list, el); + } +lineFileClose(&lf); +slReverse(&list); +return list; +} + struct genePred *genePredExtLoadAll(char *fileName) /* Load all genePreds from a tab-separated file, possibly with optional * fields. Dispose of this with genePredFreeList(). */ { struct genePred *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[GENEPREDX_NUM_COLS]; int numCols; while ((numCols = lineFileChopNextTab(lf, row, ArraySize(row))) > 0) { lineFileExpectAtLeast(lf, GENEPRED_NUM_COLS, numCols); el = genePredExtLoad(row, numCols); slAddHead(&list, el); } @@ -2083,40 +2129,40 @@ // Yet to reach the target base... accumulate exon worth codingBasesSoFar += (exonEnd - exonStart); } exonIx += (reverse ? 
-1 : 1); } if (isCoding != NULL && codingBasesSoFar > 0) { *isCoding = FALSE; return codingBasesSoFar; } return -1; // introns not okay } -struct genePred *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb) +struct genePredExt *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb) /* Build a genePredExt from a bigGenePred interval. A copy of bb->rest is split on tabs and exactly 17 fields are expected. The result's chrom points at the caller's chrom string (it is not cloned), so that string must outlive the result; name, name2, type, geneName and geneName2 are cloned. NOTE(review): the cloned copy of bb->rest (extra) is not freed in the code visible here - confirm whether that is intended. */ { char *extra = cloneString(bb->rest); int numCols = 12 + 8 - 3; char *row[numCols]; int wordCount = chopByChar(extra, '\t', row, numCols); assert(wordCount == numCols); -struct genePred *gp; +struct genePredExt *gp; AllocVar(gp); gp->chrom = chrom; gp->txStart = bb->start; gp->txEnd = bb->end; gp->name = cloneString(row[ 0]); gp->strand[0] = row[ 2][0]; gp->strand[1] = row[ 2][1]; gp->cdsStart = atoi(row[ 3]); gp->cdsEnd = atoi(row[ 4]); gp->exonCount = atoi(row[ 6]); int numBlocks; sqlUnsignedDynamicArray(row[ 8], &gp->exonStarts, &numBlocks); assert (numBlocks == gp->exonCount); sqlUnsignedDynamicArray(row[ 7], &gp->exonEnds, &numBlocks); @@ -2125,62 +2171,69 @@ int ii; for(ii=0; ii < numBlocks; ii++) { gp->exonStarts[ii] += bb->start; gp->exonEnds[ii] += gp->exonStarts[ii]; } gp->name2 = cloneString(row[ 9]); gp->cdsStartStat = parseCdsStat(row[ 10]); gp->cdsEndStat = parseCdsStat(row[ 11]); sqlSignedDynamicArray(row[ 12], &gp->exonFrames, &numBlocks); gp->optFields |= genePredExonFramesFld; assert (numBlocks == gp->exonCount); +gp->type = cloneString(row[13]); +gp->geneName = cloneString(row[14]); +gp->geneName2 = cloneString(row[15]); + return gp; } static void sqlUnsignedDynamicArrayNoClobber(char *s, unsigned **retArray, int *retSize) /* Make a copy of s on stack and chop that up so we don't mangle s. 
*/ { char copy[strlen(s)+1]; safecpy(copy, sizeof(copy), s); sqlUnsignedDynamicArray(copy, retArray, retSize); } -struct genePred *genePredFromBigGenePredRow(char **row) +struct genePredExt *genePredFromBigGenePredRow(char **row) /* Build a genePredExt from a full bigGenePred row, reading chrom from row[0] through geneName2 in row[18]; all string fields, including chrom, are cloned. The relative exon starts (row[11]) and block sizes (row[10]) are parsed from stack copies so those row strings are left intact. NOTE(review): row[15] (exonFrames) is passed to sqlSignedDynamicArray directly rather than through a copy - confirm whether that call modifies the string. */ { -struct genePred *gp; +struct genePredExt *gp; AllocVar(gp); gp->chrom = cloneString(row[0]); gp->txStart = sqlUnsigned(row[1]); gp->txEnd = sqlUnsigned(row[2]); gp->name = cloneString(row[3]); gp->strand[0] = row[5][0]; gp->strand[1] = row[5][1]; gp->cdsStart = sqlUnsigned(row[6]); gp->cdsEnd = sqlUnsigned(row[7]); gp->exonCount = sqlUnsigned(row[9]); int numBlocks; sqlUnsignedDynamicArrayNoClobber(row[11], &gp->exonStarts, &numBlocks); assert (numBlocks == gp->exonCount); // First put blockSizes in exonEnds: sqlUnsignedDynamicArrayNoClobber(row[10], &gp->exonEnds, &numBlocks); assert (numBlocks == gp->exonCount); // Then add in txStart to relative starts, and add starts to block sizes to get ends: int ii; for(ii=0; ii < numBlocks; ii++) { gp->exonStarts[ii] += gp->txStart; gp->exonEnds[ii] += gp->exonStarts[ii]; } gp->name2 = cloneString(row[12]); gp->cdsStartStat = parseCdsStat(row[13]); gp->cdsEndStat = parseCdsStat(row[14]); gp->optFields |= genePredCdsStatFld; sqlSignedDynamicArray(row[15], &gp->exonFrames, &numBlocks); assert (numBlocks == gp->exonCount); gp->optFields |= genePredExonFramesFld; +gp->type = cloneString(row[16]); +gp->geneName = cloneString(row[17]); +gp->geneName2 = cloneString(row[18]); return gp; }