e2467a639cc1e98174ffbd9d0da399b3b75bc9ae markd Thu Jul 26 21:33:38 2012 -0700 highlighting by attribute functionality for GENCODE diff --git src/hg/lib/genePredReader.c src/hg/lib/genePredReader.c index 75d5c47..917e394 100644 --- src/hg/lib/genePredReader.c +++ src/hg/lib/genePredReader.c @@ -1,343 +1,309 @@ /* genePredReader - object to read genePred objects from database tables * or files. */ #include "common.h" #include "genePredReader.h" #include "jksql.h" #include "hdb.h" #include "linefile.h" #include "genePred.h" #include "binRange.h" /* Aliases for name2 field */ static char *name2Aliases[] = { "geneName", /* refFlat */ "proteinID", /* knownGene */ NULL }; /* Field names in the order that they must be passed to genePredLoadExt, with * the index and optFields flags and aliases for the fields in certain * tables. Also default values to use. */ struct field { char *fld; int fldIdx; unsigned optFlag; char **aliases; char *defaultVal; }; static struct field fieldTbl[] = { {"name", 0, 0, NULL, NULL}, {"chrom", 1, 0, NULL, NULL}, {"strand", 2, 0, NULL, NULL}, {"txStart", 3, 0, NULL, NULL}, {"txEnd", 4, 0, NULL, NULL}, {"cdsStart", 5, 0, NULL, NULL}, {"cdsEnd", 6, 0, NULL, NULL}, {"exonCount", 7, 0, NULL, NULL}, {"exonStarts", 8, 0, NULL, NULL}, {"exonEnds", 9, 0, NULL, NULL}, {"score", 10, genePredScoreFld, NULL, "0"}, {"name2", 11, genePredName2Fld, name2Aliases, ""}, {"cdsStartStat", 12, genePredCdsStatFld, NULL, "none"}, {"cdsEndStat", 13, genePredCdsStatFld, NULL, "none"}, {"exonFrames", 14, genePredExonFramesFld, NULL, NULL}, {NULL, 0, 0, NULL, NULL}, }; struct genePredReader /* Object to read genePred objects from database tables or files. */ { char *table; /* name of table or file */ /* for DB access */ unsigned optFields; /* optional fields being included from DB */ unsigned numFields; unsigned queryCols; struct sqlResult *sr; /* results if reading from a DB */ int rowOffset; /* offset if have a bin column */ int queryToFldMap[GENEPREDX_NUM_COLS+1]; /* map of columns in query order * to field order, may include * bin, hence =1 */ /* for file access */ struct lineFile *lf; /* lineFile when reading from a file */ char* chrom; /* chrom restriction for files */ }; static int optFieldsToNumFields(unsigned optFields) /* determine the required number of columns, given the desired optional * fields */ { int numFields = GENEPRED_NUM_COLS; if (optFields & genePredScoreFld) numFields = GENEPRED_NUM_COLS+1; if (optFields & genePredName2Fld) numFields = GENEPRED_NUM_COLS+2; if (optFields & genePredCdsStatFld) numFields = GENEPRED_NUM_COLS+4; /* two columns */ if (optFields & genePredExonFramesFld) numFields = GENEPRED_NUM_COLS+5; return numFields; } static struct field* findField(char *fname) /* search fieldTbl for the specified field */ { int iFld, iAlias; for (iFld = 0; fieldTbl[iFld].fld != NULL; iFld++) { if (sameString(fieldTbl[iFld].fld, fname)) return &(fieldTbl[iFld]); if (fieldTbl[iFld].aliases != NULL) { for (iAlias = 0; fieldTbl[iFld].aliases[iAlias] != NULL; iAlias++) { if (sameString(fieldTbl[iFld].aliases[iAlias], fname)) return &(fieldTbl[iFld]); } } } return NULL; } static void buildResultFieldMap(struct genePredReader* gpr) /* determine indices of fields for current result and fill in the mapping * table. */ { int iCol = 0, iFld; char *fname; /* initialize to not used */ for (iFld = 0; iFld < GENEPREDX_NUM_COLS+1; iFld++) gpr->queryToFldMap[iFld] = -1; /* build sparse field map */ while ((fname = sqlFieldName(gpr->sr)) != NULL) { if (sameString(fname, "bin")) { if (iCol != 0) errAbort("bin column not first column in %s", gpr->table); gpr->rowOffset = 1; } else { struct field* field = findField(fname); if (field != NULL) { gpr->queryToFldMap[iCol] = field->fldIdx; gpr->optFields |= field->optFlag; } } gpr->queryCols++; iCol++; } gpr->numFields = optFieldsToNumFields(gpr->optFields); } struct genePredReader *genePredReaderQuery(struct sqlConnection* conn, char* table, char* where) /* Create a new genePredReader to read from the given table in the database. * If where is not null, it is added as a where clause. It will determine if * extended genePred columns are in the table. */ { char query[1024]; struct genePredReader* gpr; AllocVar(gpr); gpr->table = cloneString(table); if (where != NULL) safef(query, sizeof(query), "select * from %s where %s", table, where); else safef(query, sizeof(query), "select * from %s", table); gpr->sr = sqlGetResult(conn, query); buildResultFieldMap(gpr); return gpr; } struct genePredReader *genePredReaderRangeQuery(struct sqlConnection* conn, char* table, char* chrom, int start, int end, char* extraWhere) /* Create a new genePredReader to read a chrom range in a database table. If * extraWhere is not null, it is added as an additional where condition. It * will determine if extended genePred columns are in the table. */ { struct genePredReader* gpr; int rowOffset; AllocVar(gpr); gpr->table = cloneString(table); /* non-existant table will return null */ gpr->sr = hRangeQuery(conn, table, chrom, start, end, extraWhere, &rowOffset); if (gpr->sr != NULL) buildResultFieldMap(gpr); assert(gpr->rowOffset == rowOffset); return gpr; } struct genePredReader *genePredReaderFile(char* gpFile, char* chrom) /* Create a new genePredReader to read from a file. If chrom is not null, * only this chromsome is read. The rows must contain columns in the order in * the struct, and they must be present up to the last specfied optional * field. Missing intermediate fields must have zero or empty columns, they * may not be omitted. */ { struct genePredReader* gpr; AllocVar(gpr); gpr->table = cloneString(gpFile); if (chrom != NULL) gpr->chrom = cloneString(chrom); gpr->lf = lineFileOpen(gpFile, TRUE); return gpr; } static struct genePred *queryNext(struct genePredReader* gpr) /* read the next record from a query */ { int iFld, iCol; char **row = sqlNextRow(gpr->sr); char *reorderedRow[GENEPREDX_NUM_COLS]; if (row == NULL) return NULL; /* fill in row defaults */ for (iFld = 0; iFld < GENEPREDX_NUM_COLS; iFld++) reorderedRow[iFld] = fieldTbl[iFld].defaultVal; /* reorder row */ for (iCol = 0; iCol < gpr->queryCols; iCol++) { iFld = gpr->queryToFldMap[iCol]; if (iFld >= 0) reorderedRow[iFld] = row[iCol]; } return genePredExtLoad(reorderedRow, gpr->numFields); } static struct genePred *fileNext(struct genePredReader* gpr) /* read the next record from a file */ { char *row[GENEPREDX_NUM_COLS]; int numFields; while ((numFields = lineFileChopNextTab(gpr->lf, row, GENEPREDX_NUM_COLS)) > 0) { lineFileExpectAtLeast(gpr->lf, GENEPRED_NUM_COLS, numFields); if ((gpr->chrom == NULL) || (sameString(row[1], gpr->chrom))) return genePredExtLoad(row, numFields); } return NULL; } struct genePred *genePredReaderNext(struct genePredReader* gpr) /* Read the next genePred, returning NULL if no more */ { if (gpr->lf != NULL) return fileNext(gpr); else return queryNext(gpr); } struct genePred *genePredReaderAll(struct genePredReader* gpr) /* Read the all of genePreds */ { struct genePred* gpList = NULL, *gp; while ((gp = genePredReaderNext(gpr)) != NULL) slAddHead(&gpList, gp); slReverse(&gpList); return gpList; } void genePredReaderFree(struct genePredReader** gprPtr) /* Free the genePredRead object. */ { struct genePredReader* gpr = *gprPtr; if (gpr != NULL) { freeMem(gpr->table); sqlFreeResult(&gpr->sr); lineFileClose(&gpr->lf); freeMem(gpr->chrom); freez(gprPtr); } } struct genePred *genePredReaderLoadQuery(struct sqlConnection* conn, char* table, char* where) /* Function that encapsulates doing a query and loading the results */ { struct genePredReader *gpr = genePredReaderQuery(conn, table, where); struct genePred *gpList = genePredReaderAll(gpr); genePredReaderFree(&gpr); return gpList; } struct genePred *genePredReaderLoadRangeQuery(struct sqlConnection* conn, char* table, char* chrom, int start, int end, char* extraWhere) /* Function that encapsulates doing a range query and loading the results */ { struct genePredReader *gpr = genePredReaderRangeQuery(conn, table, chrom, start, end, extraWhere); struct genePred *gpList = genePredReaderAll(gpr); genePredReaderFree(&gpr); return gpList; } struct genePred *genePredReaderLoadFile(char* gpFile, char* chrom) /* Function that encapsulates reading a genePred file */ { struct genePredReader *gpr = genePredReaderFile(gpFile, chrom); struct genePred *gpList = genePredReaderAll(gpr); genePredReaderFree(&gpr); return gpList; } - -struct hash *genePredToBinKeeper(char *sizeFileName, char *gpFile) -/* read a list of genePreds and return results in hash of binKeeper structure for fast query*/ -{ -struct binKeeper *bk; -struct genePred *gp , *gpList = NULL; -struct lineFile *sf = lineFileOpen(sizeFileName, TRUE); -struct hash *hash = newHash(0); -char *chromRow[2]; - -while (lineFileRow(sf, chromRow)) - { - char *name = chromRow[0]; - int size = lineFileNeedNum(sf, chromRow, 1); - - if (hashLookup(hash, name) != NULL) - warn("Duplicate %s, ignoring all but first\n", name); - else - { - bk = binKeeperNew(0, size); - assert(size > 1); - hashAdd(hash, name, bk); - } - } -gpList = genePredReaderLoadFile(gpFile, NULL); -for (gp = gpList ; gp!= NULL; gp=gp->next) - { - bk = hashMustFindVal(hash, gp->chrom); - binKeeperAdd(bk, gp->txStart, gp->txEnd, gp); - } -lineFileClose(&sf); -return hash; -} -