 /* genePred.h was originally generated by the autoSql program, which also 
  * generated genePred.c and genePred.sql.  This header links the database and the RAM 
  * representation of objects. */
+/* Copyright (C) 2013 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
 #ifndef GENEPRED_H
 #define GENEPRED_H
 #include "dnaseq.h"
 struct gff;
 struct gffFile;
 struct gffGroup;
 struct psl;
 struct genbankCds;
 struct rbTree;
 enum cdsStatus
 /* Status of the CDS annotation at one end (either start or end).  The
  * quoted word in each comment is the string form given for that value. */
     {
     cdsNone,        /* "none" - No CDS (non-coding)  */
     cdsUnknown,     /* "unk" - CDS is unknown (coding, but not known)  */
     cdsIncomplete,  /* "incmpl" - CDS is not complete at this end  */
     cdsComplete,    /* "cmpl" - CDS is complete at this end  */
     };
 enum genePredCreateOpts
 /* Bit set of options for genePredGetCreateSql. */
     {
     genePredBasicSql = 0x00, /* used if nothing special */
     genePredWithBin = 0x01   /* create bin column */
     };
 enum genePredFromPslOpts
 /* Bit set of options for genePredFromPsl3. */
     {
     genePredPslDefaults = 0x00,  /* used if nothing special */
     genePredPslCdsMod3  = 0x01   /* only merge gaps in CDS if mod 3 */
     };
 enum genePredFromGxfOpts
 /* Bit set of options for genePredFromGroupedGff/genePredFromGroupedGtf. */
     {
     genePredGxfDefaults = 0x00,            /* used if nothing special */
     genePredGxfImpliedStopAfterCds = 0x01, /* stop codon is implied outside of
                                             * the annotated CDS bounds  */
     genePredGxfGeneNameAsName2 = 0x02      /* use gene_name instead of gene_id
                                             * for name2 */
     };
 enum genePredFields
 /* Bit set to indicate which optional fields are used.
  * N.B. value order must match order in genePred */
     {
     genePredNoOptFld      = 0x00,  /* use for no opt fields */
     genePredScoreFld      = 0x01,  /* score field */
     genePredName2Fld      = 0x02,  /* name2 field */
     genePredCdsStatFld    = 0x04,  /* cdsStart/EndStat fields */
     genePredExonFramesFld = 0x08,  /* exonFrames field */
     genePredAllFlds       = 0xFF   /* include all extended fields */
     };
 struct genePred
 /* A gene prediction, with optional fields. */
     struct genePred *next;  /* Next in singly linked list. */
     char *name;	/* Name of loci, transcript, mRNA, etc */
     char *chrom;	/* Chromosome name */
     char strand[2];	/* + or - for strand */
     unsigned txStart;	/* Transcription start position */
     unsigned txEnd;	/* Transcription end position */
     unsigned cdsStart;	/* Coding region start */
     unsigned cdsEnd;	/* Coding region end */
     unsigned exonCount;	/* Number of exons */
     unsigned *exonStarts;	/* Exon start positions */
     unsigned *exonEnds;	/* Exon end positions */
     /* optional fields */
     unsigned optFields;           /* which optional fields are used (not in
                                    * database) */
     int score;                    /* score */
     char *name2;                  /* Secondary name. (e.g. name of gene), or
                                    * empty if none, NULL if field not
                                    * requested */
     enum cdsStatus cdsStartStat;  /* Status of cdsStart annotation */
     enum cdsStatus cdsEndStat;    /* Status of cdsEnd annotation */
     int *exonFrames;              /* List of frame for each exon, or -1
                                    * if no frame or not known. NULL if not
                                    * available. */
 /* Standard value to use for insertMergeSize when creating genePred.
  * Set to 8 due to microdeletions.
  */
 #define genePredStdInsertMergeSize 8
 #define GENEPRED_NUM_COLS 10  /* number of columns in a genePred */
 #define GENEPREDX_NUM_COLS 15  /* max number of columns in extended genePred */
 struct genePred *genePredLoad(char **row);
 /* Load a genePred from a row fetched with "select * from genePred"
  * from the database.  Dispose of the result with genePredFree().
  * NOTE: cannibalizes the row argument. */
 struct genePred *genePredLoadAll(char *fileName);
 /* Load all genePred records from a whitespace-separated file.
  * Dispose of the returned list with genePredFreeList(). */
 struct genePred *genePredLoadAllByChar(char *fileName, char chopper);
 /* Load all genePred records from a file whose fields are separated by the
  * chopper character.  Dispose of the returned list with genePredFreeList(). */
 #define genePredLoadAllByTab(a) genePredLoadAllByChar((a), '\t')
 /* Load all genePred from tab separated file.
  * Dispose of this with genePredFreeList().
  * The expansion is a plain expression with no trailing semicolon, so the
  * macro can be used inside larger expressions; callers write their own ';'. */
 struct genePred *genePredCommaIn(char **pS, struct genePred *ret);
 /* Create a genePred out of a comma-separated string.
  * Fills in ret if it is non-NULL; otherwise returns a newly
  * created genePred. */
 void genePredFree(struct genePred **pEl);
 /* Free a single dynamically allocated genePred, such as one created
  * with genePredLoad(). */
 void genePredFreeList(struct genePred **pList);
 /* Free a list of dynamically allocated genePreds. */
 void genePredOutput(struct genePred *el, FILE *f, char sep, char lastSep);
 /* Print out a genePred to f.  Fields are separated with sep and the last
  * field is followed by lastSep. */
 #define genePredTabOut(el, f) genePredOutput(el, f, '\t', '\n')
 /* Write a genePred as one line of a tab-separated file. */
 #define genePredCommaOut(el, f) genePredOutput(el, f, ',', ',')
 /* Write a genePred as a comma-separated list, final comma included. */
 /* ---------  Start of hand generated code. ---------------------------- */
 struct genePred *genePredExtLoad(char **row, int numCols);
 /* Load a genePred from a row, with optional fields.  The row must
  * contain columns in the order in the struct, and they must be present up to
  * the last specified optional field.  Missing intermediate fields must have
  * zero or empty columns; they may not be omitted.  Fields at the end can be
  * omitted.  Dispose of the result with genePredFree(). */
 struct genePred *genePredExtLoadAll(char *fileName);
 /* Load all genePreds from a tab-separated file, possibly with optional
  * fields.  Dispose of the returned list with genePredFreeList(). */
 char *genePredCdsStatStr(enum cdsStatus stat);
 /* Get the string value of a cdsStatus. */
 void genePredAddGenbankCds(struct psl *psl,
                            struct genbankCds *cds,
                            struct genePred *gene);
 /* Translate cdsStart/End from mRNA coordinates into genomic coordinates.
  * The genePred blocks do not have to be filled in before making
  * this call. */
 int genePredCmp(const void *va, const void *vb);
 /* Comparison function for sorting by chromosome, then txStart. */
 int genePredNameCmp(const void *va, const void *vb);
 /* Comparison function for sorting by name, then chromosome, then txStart. */
 struct genePred *genePredFromGroupedGff(struct gffFile *gff, struct gffGroup *group, 
                                         char *name, char *exonSelectWord, unsigned optFields,
                                         unsigned options);
 /* Convert gff->groupList to genePred list.   Only put lines where feature type  matches
  * exonSelectWord into the gene.  (If exonSelectWord is NULL, all go in)
  * optFields contains the bit set of optional fields to add to the genePred.
  * If genePredCdsStatFld is set, then the CDS status information is
  * set based on the presence of start_codon, stop_codon, and CDS features.
  * If genePredExonFramesFld is set, then frame is set as specified in the GTF.
  * Options are from genePredFromGxfOpts.  If genePredGxfImpliedStopAfterCds
  * is specified, it is treated as if a stop_codon annotation was found,
  * if there isn't one.  If genePredGxfGeneNameAsName2 is specified, use
  * gene_name for the name2 field otherwise gene_id.
  */
 struct genePred *genePredFromGroupedGtf(struct gffFile *gff, struct gffGroup *group, char *name,
                                         unsigned optFields, unsigned options);
 /* Convert gff->groupList to genePred list, using GTF feature conventions;
  * including the stop codon in the 3' UTR, not the CDS (grr).  Assumes
  * gffGroup is sorted in ascending coords, with overlapping starts sorted by
  * end coords, which is true if it was created by gffGroupLines().
  * optFields contains the bit set of optional fields to add to the genePred.
  * If genePredName2Fld is specified, then the gene_id is used for the name2
  * field.  If genePredCdsStatFld is set, then the CDS status information is
  * set based on the presence of start_codon, stop_codon, and CDS features.
  * If genePredExonFramesFld is set, then frame is set as specified in the GTF.
  * Options are from genePredFromGxfOpts.  If genePredGxfImpliedStopAfterCds
  * is specified, it is treated as if a stop_codon annotation was found,
  * if there isn't one.
  */
 struct genePred *genePredFromPsl3(struct psl *psl,
                                   struct genbankCds *cds,
                                   unsigned optFields,
                                   unsigned options,
                                   int cdsMergeSize,
                                   int utrMergeSize);
 /* Build a genePred from the PSL of an mRNA alignment, converting a genbank
  * CDS specification string to genomic coordinates.  Small genomic inserts
  * are merged according to the mergeSize parameters: a gap no larger than
  * the relevant merge size causes the adjacent blocks to be joined into a
  * single exon.  Gaps in CDS are compared against cdsMergeSize, gaps in UTR
  * against utrMergeSize.  When the genePredPslCdsMod3 option is given, CDS
  * gaps are only merged if they are a multiple of three.  A negative merge
  * size disables merging of blocks; this is different from zero in that
  * adjacent blocks will not be merged.  optFields is a set from
  * genePredFields, indicating which fields to create.  A zero-length CDS,
  * or a null cds, creates the genePred without CDS annotation.  If cds is
  * null, the status fields are set to cdsNone. */
 struct genePred *genePredFromPsl2(struct psl *psl, unsigned optFields,
                                   struct genbankCds* cds, int insertMergeSize);
 /* Compatibility function, genePredFromPsl3 is preferred.  See that function's
  * documentation for details. This calls genePredFromPsl3 with no options
  * and insertMergeSize set for CDS and UTR.
  */
 struct genePred *genePredFromPsl(struct psl *psl, int cdsStart, int cdsEnd,
                                  int insertMergeSize);
 /* Compatibility function, genePredFromPsl3 is preferred.  See that function's
  * documentation for details. This calls genePredFromPsl3 with no options.
  */
 char *genePredGetCreateSql(char *table,
                            unsigned optFields,
                            unsigned options,
                            int chromIndexLen);
 /* Return the SQL needed to create a genePred table.  optFields is a bit
  * set made up of genePredFields values; options is a bit set of
  * genePredCreateOpts.  The returned string should be freed.  All optional
  * fields that precede the highest optFields column are created.
  * chromIndexLen is now ignored. */
 struct genePred *getOverlappingGene(char *db, struct genePred **list,
                                     char *table, char *chrom,
                                     int cStart, int cEnd,
                                     char *name, int *retOverlap);
 /* Read all genes from a table and find the gene with the biggest overlap.
  * The list of genes is cached so that it is only read once.
  * If there are multiple hits and one has a name that matches exactly,
  * that match overrides the biggest overlap. */
 int genePredBases(struct genePred *gp);
 /* Count the coding and UTR bases in a gene prediction. */
 int genePredCodingBases(struct genePred *gp);
 /* Count up the number of coding bases in a gene prediction. */
 boolean genePredCdsExon(struct genePred *gp, int iExon, int *startPtr, int *endPtr);
 /* Get the CDS range in an exon.  If there is no CDS, return FALSE and
  * set start == end (presumably reported through startPtr/endPtr). */
 int genePredCheck(char *desc, FILE *out, int chromSize, struct genePred *gp);
 /* Check a genePred for consistency.  desc is printed with the error
  * messages, which are written to file out (open /dev/null to discard).
  * chromSize should contain the size of the chromosome, or 0 if chrom is
  * not valid, or -1 to not check chromosome bounds.  Returns the count of
  * errors. */
 boolean genePredNmdTarget(struct genePred *gp);
 /* Return TRUE if the CDS end is more than 50bp upstream of
  * the last intron. */
 void genePredAddExonFrames(struct genePred *gp);
 /* Add an exonFrames array to a genePred that doesn't have one.  Frame is
  * assumed to be contiguous. */
 void genePredRc(struct genePred *gp, int chromSize);
 /* Reverse complement a genePred (project it to the opposite strand).  Useful
  * when doing analysis that is simplified by having things on the same strand.
  */
 int genePredCdsSize(struct genePred *gp);
 /* Compute the number of bases of CDS. */
 struct genePred *genePredNew(char *name, char *chrom, char strand,
                              unsigned txStart, unsigned txEnd,
                              unsigned cdsStart, unsigned cdsEnd,
                              unsigned optFields,
                              unsigned exonSpace);
 /* Create a new gene with room allocated for the specified number of exons.
  * genePredGrow may be used to expand this space if needed. */
 void genePredGrow(struct genePred *gp, unsigned *exonSpacePtr);
 /* Increase the memory allocated to gp so it can hold more exons.
  * exonSpacePtr should point to the current maximum number of exons and
  * will be updated with the new amount of space. */
 struct rbTree *genePredToRangeTree(struct genePred *gp, boolean cdsOnly);
 /* Convert a genePred into a range tree. */
 void gpPartOutAsBed(struct genePred *gp, int start, int end,
                     FILE *f, char *type, int id, int minSize);
 /* Write part of gp out as bed12. */
 boolean codonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd);
 // Map a 1-based codon number to genomic coordinates.  If the codon crosses an
 // exon junction, just the beginning (LHS) of the codon is returned.
 // Returns true if the codon is found in the given gene prediction; chromStart
 // and chromEnd are set to the appropriate three base region.
 boolean exonToPos(struct genePred *gp, unsigned num, int *chromStart, int *chromEnd);
 // Map a 1-based exon number to genomic coordinates.
 // Returns true if the exon is found in the given gene prediction; chromStart
 // and chromEnd are set to the appropriate region.
 struct asObject *genePredAsObj(void);
 // Return asObject describing fields of genePred.
 // Declared with (void) so that this is a true prototype: in C an empty
 // parameter list leaves the arguments unchecked.
 struct dnaSeq *genePredGetDna(char *database,
                               struct genePred *gp,
                               boolean coding,
                               enum dnaCase dnaCase);
 // Return the DNA sequence associated with a gene prediction.
 // For negative strand genes the sequence is returned as read from the
 // negative strand.  Can optionally be restricted to coding sequence only.
 int genePredBaseToCodingPos(struct genePred *gp,
                             int basePos,
                             boolean stranded,
                             boolean *isCoding);
 // For a genePred model and a single (0 based) base position, predict the
 // 0-based DNA (stranded) coding sequence pos.  Dividing this number by 3
 // should give the AA position!
 // Returns -1 when outside of coding exons, unless the OPTIONAL isCoding
 // pointer to boolean is provided; in that case, returns the last valid
 // position and sets isCoding to FALSE.
 #endif /* GENEPRED_H */