be8645fb43ba545dc342deb80cff297c5b677a5e braney Tue Sep 6 11:11:15 2016 -0700 allow bigGenePred to be used to optimize knownGene on hgTracks #15259 diff --git src/hg/inc/genePred.h src/hg/inc/genePred.h index 71dd402..f60b8a3 100644 --- src/hg/inc/genePred.h +++ src/hg/inc/genePred.h @@ -55,30 +55,63 @@ * ids if available */ }; enum genePredFields /* Bit set to indicate which optional fields are used. * N.B. value order must match order in genePred */ { genePredNoOptFld = 0x00, /* use for no opt fields */ genePredScoreFld = 0x01, /* score field */ genePredName2Fld = 0x02, /* name2 field */ genePredCdsStatFld = 0x04, /* cdsStart/EndStat fields */ genePredExonFramesFld = 0x08, /* exonFrames field */ genePredAllFlds = 0xFF /* include all extended fields */ }; +struct genePredExt +/* A gene prediction, with extended fields. */ +{ + struct genePredExt *next; /* Next in singly linked list. */ + char *name; /* Name of loci, transcript, mRNA, etc */ + char *chrom; /* Chromosome name */ + char strand[2]; /* + or - for strand */ + unsigned txStart; /* Transcription start position */ + unsigned txEnd; /* Transcription end position */ + unsigned cdsStart; /* Coding region start */ + unsigned cdsEnd; /* Coding region end */ + unsigned exonCount; /* Number of exons */ + unsigned *exonStarts; /* Exon start positions */ + unsigned *exonEnds; /* Exon end positions */ + + /* optional fields */ + unsigned optFields; /* which optional fields are used (not in + * database) */ + int score; /* score */ + char *name2; /* Secondary name. (e.g. name of gene), or + * empty if none, NULL if field not + * requested */ + enum cdsStatus cdsStartStat; /* Status of cdsStart annotation */ + enum cdsStatus cdsEndStat; /* Status of cdsEnd annotation */ + int *exonFrames; /* List of frame for each exon, or -1 + * if no frame or not known. NULL if not + * available. */ + char *type; + char *geneName; + char *geneName2; + char *geneType; +}; + struct genePred /* A gene prediction, with optional fields. 
*/ { struct genePred *next; /* Next in singly linked list. */ char *name; /* Name of loci, transcript, mRNA, etc */ char *chrom; /* Chromosome name */ char strand[2]; /* + or - for strand */ unsigned txStart; /* Transcription start position */ unsigned txEnd; /* Transcription end position */ unsigned cdsStart; /* Coding region start */ unsigned cdsEnd; /* Coding region end */ unsigned exonCount; /* Number of exons */ unsigned *exonStarts; /* Exon start positions */ unsigned *exonEnds; /* Exon end positions */ @@ -132,37 +165,43 @@ void genePredFreeList(struct genePred **pList); /* Free a list of dynamically allocated genePred's */ void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep); /* Print out genePred. Separate fields with sep. Follow last field with lastSep. */ #define genePredTabOut(el,f) genePredOutput(el,f,'\t','\n') /* Print out genePred as a line in a tab-separated file. */ #define genePredCommaOut(el,f) genePredOutput(el,f,',',',') /* Print out genePred as a comma separated list including final comma. */ /* --------- Start of hand generated code. ---------------------------- */ +struct genePred *genePredKnownLoad(char **row, int numCols); +/* Load a genePred from a row in knownGene format */ + struct genePred *genePredExtLoad(char **row, int numCols); /* Load a genePred from a row, with optional fields. The row must * contain columns in the order in the struct, and they must be present up to * the last specified optional field. Missing intermediate fields must have * zero or empty columns; they may not be omitted. Fields at the end can be * omitted. Dispose of this with genePredFree(). */ +struct genePred *genePredKnownLoadAll(char *fileName); +/* Load all genePreds from a tab-separated file in knownGene format */ + struct genePred *genePredExtLoadAll(char *fileName); /* Load all genePreds from a tab-separated file, possibly with optional * fields. Dispose of this with genePredFreeList(). 
*/ char *genePredCdsStatStr(enum cdsStatus stat); /* get string value of a cdsStatus */ enum cdsStatus parseCdsStat(char *statStr); /* parse a cdsStatus string */ void genePredAddGenbankCds(struct psl *psl, struct genbankCds* cds, struct genePred *gene); /* Convert cdsStart/End from mrna to genomic coordinates. * Note that the genePred blocks need not be filled in before * this call. */ @@ -315,22 +354,22 @@ // Return asObject describing fields of genePred struct dnaSeq *genePredGetDna(char *database, struct genePred *gp, boolean coding, enum dnaCase dnaCase); // Returns the DNA sequence associated with gene prediction. // Negative strand genes will return the sequence as read from the negative strand. // Optionally restrict to coding sequence only int genePredBaseToCodingPos(struct genePred *gp, int basePos, boolean stranded, boolean *isCoding); // Given a genePred model and a single (0 based) base position, predict the 0-based // DNA (stranded) coding sequence pos. Dividing this number by 3 should give the AA position! // Returns -1 when outside of coding exons unless OPTIONAL isCoding pointer to boolean is // provided. In that case, returns last valid position and sets isCoding to FALSE. -struct genePred *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb); +struct genePredExt *genePredFromBigGenePred( char *chrom, struct bigBedInterval *bb); /* build a genePred from a bigGenePred interval */ -struct genePred *genePredFromBigGenePredRow(char **row); +struct genePredExt *genePredFromBigGenePredRow(char **row); /* build a genePred from a bigGenePred row */ #endif /* GENEPRED_H */