874c24c932ee906b6332b5df5b6f4e1c93bf8b92
markd
  Mon Feb 9 22:15:49 2015 -0800
Created a program to filter genePreds file.  Currently filters
based on genePredCheck validation errors.

diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index a6d06ce..8abeefe 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -2,30 +2,31 @@
  * generated genePred.h and genePred.sql.  This module links the database and the RAM 
  * representation of objects. */
 
 /* Copyright (C) 2014 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 
 #include "common.h"
 #include "gff.h"
 #include "jksql.h"
 #include "psl.h"
 #include "linefile.h"
 #include "genePred.h"
 #include "genbank.h"
 #include "rangeTree.h"
 #include "hdb.h"
+#include "chromInfo.h"
 
 
 /* SQL to create a genePred table */
 static char *createSql = 
 "CREATE TABLE %s ("
 "   %-s"                                 /* bin column goes here */
 "   name varchar(255) not null,"	/* mrna accession of gene */
 "   chrom varchar(255) not null,"	/* Chromosome name */
 "   strand char(1) not null,"		/* + or - for strand */
 "   txStart int unsigned not null,"	/* Transcription start position */
 "   txEnd int unsigned not null,"	/* Transcription end position */
 "   cdsStart int unsigned not null,"	/* Coding region start */
 "   cdsEnd int unsigned not null,"	/* Coding region end */
 "   exonCount int unsigned not null,"	/* Number of exons */
 "   exonStarts longblob not null,"	/* Exon start positions */
@@ -1454,163 +1455,180 @@
 assert((iExon >= 0) && (iExon < gp->exonCount));
 int start = gp->exonStarts[iExon];
 int end = gp->exonEnds[iExon];
 
 if (start < gp->cdsStart)
     start = gp->cdsStart;
 if (end > gp->cdsEnd)
     end = gp->cdsEnd;
 if (startPtr != NULL)
     *startPtr = start;
 if (endPtr != NULL)
     *endPtr = end;
 return (start < end);
 }
 
-/* state for genePredCheck */
-static int gpErrorCnt = 0;  /* error count for gpError */
-static FILE *gpErrFh = NULL;  /* error output for gpError */
-
-static void gpError(char *format, ...)
+static void gpError(FILE* errFh, int* errorCnt, char *format, ...)
 /* print and count an error */
 {
 va_list args;
-fprintf(gpErrFh, "Error: ");
+fprintf(errFh, "Error: ");
 va_start(args, format);
-vfprintf(gpErrFh, format, args);
+vfprintf(errFh, format, args);
 va_end(args);
-fputc('\n', gpErrFh);
-gpErrorCnt++;
+fputc('\n', errFh);
+(*errorCnt)++;
 }
 
-static void checkExon(char *desc, struct genePred* gp, int iExon)
+static void checkExon(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp, int iExon)
 /* check one exon of a genePred */
 {
 unsigned exonStart = gp->exonStarts[iExon];
 unsigned exonEnd = gp->exonEnds[iExon];
 if (exonStart >= exonEnd)
-    gpError("%s: %s exon %u start %u >= end %u", desc, gp->name,
-            iExon, exonStart, exonEnd);
+    gpError(errFh, errorCnt, "%s: %s exon %u start %u >= end %u", desc, gp->name, iExon, exonStart, exonEnd);
 if (exonStart < gp->txStart)
-    gpError("%s: %s exon %u start %u < txStart %u", desc, gp->name,
-            iExon, exonStart, gp->txStart);
+    gpError(errFh, errorCnt, "%s: %s exon %u start %u < txStart %u", desc, gp->name, iExon, exonStart, gp->txStart);
 if (exonEnd > gp->txEnd)
-    gpError("%s: %s exon %u end %u > txEnd %u", desc, gp->name,
-            iExon, exonEnd, gp->txEnd);
+    gpError(errFh, errorCnt, "%s: %s exon %u end %u > txEnd %u", desc, gp->name, iExon, exonEnd, gp->txEnd);
 if (iExon > 0)
     {
     /* other than first exon */
     unsigned prevExonEnd = gp->exonEnds[iExon-1];
     if (exonStart < prevExonEnd)
-        gpError("%s: %s exon %u overlaps previous exon", desc,
-                gp->name, iExon);
+        gpError(errFh, errorCnt, "%s: %s exon %u overlaps previous exon", desc, gp->name, iExon);
     }
 
 if (gp->optFields & genePredExonFramesFld)
     {
     int frame = gp->exonFrames[iExon];
     if ((frame < -1) || (frame > 2))
-        gpError("%s: %s invalid exonFrame: %d", desc, gp->name, frame);
+        gpError(errFh, errorCnt, "%s: %s invalid exonFrame: %d", desc, gp->name, frame);
     if ((exonEnd > gp->cdsStart) && (exonStart < gp->cdsEnd))
         {
         if (frame == -1)
-            gpError("%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon);
+            gpError(errFh, errorCnt, "%s: %s no exonFrame on CDS exon %d", desc, gp->name, iExon);
         }
     else
         {
         if (frame != -1)
-            gpError("%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon);
+            gpError(errFh, errorCnt, "%s: %s exonFrame on non-CDS exon %d", desc, gp->name, iExon);
         }
     }
 }
 
-static void checkCdsBounds(char *desc, FILE* out, struct genePred* gp)
+static void checkCdsBounds(FILE* errFh, int* errorCnt, char *desc, struct genePred* gp)
 /* check cdsStart/cdsEnd */
 {
 int iExon;
 boolean foundCdsStart = FALSE, foundCdsEnd = FALSE;
 if (gp->cdsStart > gp->cdsEnd)
-    gpError("%s: %s cdsStart %u > cdsEnd %u", desc, gp->name,
-            gp->cdsStart, gp->cdsEnd);
+    gpError(errFh, errorCnt, "%s: %s cdsStart %u > cdsEnd %u", desc, gp->name, gp->cdsStart, gp->cdsEnd);
 if ((gp->cdsStart < gp->txStart) || (gp->cdsStart > gp->txEnd))
-    gpError("%s: %s cdsStart %u not in tx bounds %u-%u", desc,
-            gp->name, gp->cdsStart, gp->txStart, gp->txEnd);
+    gpError(errFh, errorCnt, "%s: %s cdsStart %u not in tx bounds %u-%u", desc, gp->name, gp->cdsStart, gp->txStart, gp->txEnd);
 if ((gp->cdsEnd < gp->txStart) || (gp->cdsEnd > gp->txEnd))
-    gpError("%s: %s cdsEnd %u not in tx bounds %u-%u", desc, 
-            gp->name, gp->cdsEnd, gp->txStart, gp->txEnd);
+    gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in tx bounds %u-%u", desc, gp->name, gp->cdsEnd, gp->txStart, gp->txEnd);
 /*  is cdsStart/cdsEnd in an exon?  */
 for (iExon = 0; iExon < gp->exonCount; iExon++)
     {
     if ((gp->cdsStart >= gp->exonStarts[iExon])
         && (gp->cdsStart < gp->exonEnds[iExon]))
         foundCdsStart = TRUE;
     if ((gp->cdsEnd > gp->exonStarts[iExon])
         && (gp->cdsEnd <= gp->exonEnds[iExon]))
         foundCdsEnd = TRUE;
     }
 if (!foundCdsStart)
-    gpError("%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart);
+    gpError(errFh, errorCnt, "%s: %s cdsStart %u not in an exon", desc, gp->name, gp->cdsStart);
 if (!foundCdsEnd)
-    gpError("%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd);
+    gpError(errFh, errorCnt, "%s: %s cdsEnd %u not in an exon", desc, gp->name, gp->cdsEnd);
 }
 
-int genePredCheck(char *desc, FILE* out, int chromSize, 
-                  struct genePred* gp)
+int genePredCheck(char *desc, FILE* errFh, int chromSize, struct genePred* gp)
 /* Validate a genePred for consistency.  desc is printed the error messages
- * to file out (open /dev/null to discard).  chromSize should contain
+ * to file errFh (open /dev/null to discard).  chromSize should contain
  * size of chromosome, or 0 if chrom is not valid, or -1 to not check
  * chromosome bounds. Returns count of errors. */
 {
 int iExon;
-gpErrorCnt = 0;
-gpErrFh = out;
+int errorCnt = 0;
 if (!(sameString(gp->strand, "+") || sameString(gp->strand, "-")))
-    gpError("%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand);
+    gpError(errFh, &errorCnt, "%s: %s invalid strand: \"%s\"", desc, gp->name, gp->strand);
 
 /* check chrom */
 if (chromSize == 0)
-    gpError("%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom);
+    gpError(errFh, &errorCnt, "%s: %s chrom not a valid chromosome: \"%s\"", desc, gp->name, gp->chrom);
 else if (chromSize > 0)
     {
     if (gp->txEnd > chromSize)
-        gpError("%s: %s txEnd %u >= chromSize %u", desc, gp->name,
-                gp->txEnd, chromSize);
+        gpError(errFh, &errorCnt, "%s: %s txEnd %u >= chromSize %u", desc, gp->name, gp->txEnd, chromSize);
     }
 
 /* check internal consistency */
 if (gp->txStart >= gp->txEnd)
-    gpError("%s: %s txStart %u >= txEnd %u", desc, gp->name,
-            gp->txStart, gp->txEnd);
+    gpError(errFh, &errorCnt, "%s: %s txStart %u >= txEnd %u", desc, gp->name, gp->txStart, gp->txEnd);
 
 /* no CDS is indicated by cdsStart == cdsEnd */
 if (gp->cdsStart != gp->cdsEnd)
-    checkCdsBounds(desc, out, gp);
+    checkCdsBounds(errFh, &errorCnt, desc, gp);
 
 /* make sure first/last exons match tx range */
 if (gp->txStart != gp->exonStarts[0])
-    gpError("%s: %s first exon start %u doesn't match txStart %u", desc, 
-            gp->name, gp->exonStarts[0], gp->txStart);
+    gpError(errFh, &errorCnt, "%s: %s first exon start %u doesn't match txStart %u", desc, gp->name, gp->exonStarts[0], gp->txStart);
 if (gp->txEnd != gp->exonEnds[gp->exonCount-1])
-    gpError("%s: %s last exon end %u doesn't match txEnd %u", desc, 
-            gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd);
+    gpError(errFh, &errorCnt, "%s: %s last exon end %u doesn't match txEnd %u", desc, gp->name, gp->exonEnds[gp->exonCount-1], gp->txEnd);
 
 
 /* check each exon */
 for (iExon = 0; iExon < gp->exonCount; iExon++)
-    checkExon(desc, gp, iExon);
+    checkExon(errFh, &errorCnt, desc, gp, iExon);
+
+return errorCnt;
+}
+
+static int lookupChromSize(char *desc, FILE* errFh, char* db, struct genePred* gp, int *errorCnt)
+/* lookup the chromosome size in the database, -1 if invalid */
+{
+// hGetChromInfo is case independent
+struct chromInfo *ci = hGetChromInfo(db, gp->chrom);
+if (ci == NULL)
+    {
+    fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom);
+    (*errorCnt)++;
+    return -1;  // don't validate
+    }
+else if (differentString(gp->chrom, ci->chrom)) // verify case dependent equal
+    {
+    fprintf(errFh, "Error: %s: %s has invalid chrom for %s: %s\n", desc, gp->name, db, gp->chrom);
+    (*errorCnt)++;
+    return -1;  // don't validate
+    }
+else
+    return ci->size;
+}
 
-return gpErrorCnt;
+int genePredCheckDb(char *desc, FILE* errFh, char* db, struct genePred* gp)
+/* Validate a genePred for consistency.  desc is printed the error messages
+ * to file errFh (open /dev/null to discard).  Lookup chromosome size in database if
+ * db is not NULL. Returns count of errors. */
+{
+int errorCnt = 0;
+int chromSize = -1;  /* default to not checking */
+if (db != NULL)
+    chromSize = lookupChromSize(desc, errFh, db, gp, &errorCnt);
+errorCnt += genePredCheck(desc, errFh, chromSize, gp);
+return errorCnt;
 }
 
 boolean genePredNmdTarget(struct genePred *gp) 
 /* Return TRUE if cds end is more than 50bp upstream of
    last intron. */
 {
 int gpSize = 0;
 int i = 0;
 int startDist = 0, endDist = 0;
 int blockSize = 0;
 if(gp->exonCount < 2 || gp->cdsStart == gp->cdsEnd)
     return FALSE;
 for(i = 0; i < gp->exonCount; i++)
     {
     int blockStart = gp->exonStarts[i];