src/hg/genePredToFakePsl/genePredToFakePsl.c 1.4

1.4 2010/04/28 04:46:37 markd
Fixed incorrect calculation of CDS refactor
Index: src/hg/genePredToFakePsl/genePredToFakePsl.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/genePredToFakePsl/genePredToFakePsl.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/genePredToFakePsl/genePredToFakePsl.c	3 Sep 2008 19:18:39 -0000	1.3
+++ src/hg/genePredToFakePsl/genePredToFakePsl.c	28 Apr 2010 04:46:37 -0000	1.4
@@ -1,125 +1,123 @@
 /* genePredToFakePsl - create fake .psl of mRNA aligned to dna from genePred file or table. */
 #include "common.h"
 #include "options.h"
 #include "portable.h"
 #include "hdb.h"
 #include "genePred.h"
 #include "genePredReader.h"
 #include "psl.h"
 
 static char const rcsid[] = "$Id$";
 
 /* Command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {NULL, 0}
 };
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "genePredToFakePsl - Create a psl of fake-mRNA aligned to gene-preds from a file or table.\n"
   "usage:\n"
   "   genePredToFakePsl db fileTbl pslOut cdsOut\n"
   "\n"
   "If fileTbl is an existing file, then it is used.\n"
   "Otherwise, the table by this name is used.\n"
   "\n"
   "pslOut specifies the fake-mRNA output psl filename.\n"
   "\n"
   "cdsOut specifies the output cds tab-separated file which contains\n"
   "genbank-style CDS records showing cdsStart..cdsEnd\n"  
   "e.g. NM_123456 34..305\n"
   "\n");
 }
 
-void fakePslFromGenePred(char *db, char *fileTbl, char *pslOut, char *cdsOut)
-/* check a genePred */
+static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh)
+/* determine CDS and output */
 {
-struct sqlConnection *conn = NULL;
-struct genePredReader *gpr;
-struct genePred *gp;
-int iRec = 0;
-int chromSize = -1; 
-FILE *out = mustOpen(pslOut, "w");
-FILE *cds = mustOpen(cdsOut, "w");
+int e, off = 0;
+int qCdsStart = -1, qCdsEnd = -1;
+int eCdsStart, eCdsEnd;
 
-conn = hAllocConn(db);
-
-if (fileExists(fileTbl))
+for (e = 0; e < gp->exonCount; ++e)
     {
-    gpr = genePredReaderFile(fileTbl, NULL);
-    }
-else
+    if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd))
     {
-    gpr = genePredReaderQuery(conn, fileTbl, NULL);
+        if (qCdsStart < 0)
+            qCdsStart = off + (eCdsStart - gp->exonStarts[e]);
+        qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]);
+        }
+    off += gp->exonEnds[e] - gp->exonStarts[e];
     }
+if (gp->strand[0] == '-')
+    reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
+fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */
+}
 
-while ((gp = genePredReaderNext(gpr)) != NULL)
-    {
-    struct psl *psl;
-    int e = 0, qSize=0, qCdsStart=0, qCdsEnd=0;
+static void cnvGenePred(char *db, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
+/* convert a genePred to a psl and CDS */
+{
+int chromSize = hChromSize(db, gp->chrom);
+int e = 0, qSize=0;
     
-    iRec++;
-    chromSize = hChromSize(db, gp->chrom);
-    for (e = 0; e < gp->exonCount; ++e)
+for (e = 0; e < gp->exonCount; ++e)
 	qSize+=(gp->exonEnds[e] - gp->exonStarts[e]);
-    psl = pslNew(gp->name, qSize, 0, qSize,
+struct psl *psl = pslNew(gp->name, qSize, 0, qSize,
                  gp->chrom, chromSize, gp->txStart, gp->txEnd,
                  gp->strand, gp->exonCount, 0);
-    psl->blockCount = gp->exonCount;		    
-    for (e = 0; e < gp->exonCount; ++e)
+psl->blockCount = gp->exonCount;		    
+for (e = 0; e < gp->exonCount; ++e)
 	{
 	psl->blockSizes[e] = (gp->exonEnds[e] - gp->exonStarts[e]);
 	psl->qStarts[e] = e==0 ? 0 : psl->qStarts[e-1] + psl->blockSizes[e-1];
 	psl->tStarts[e] = gp->exonStarts[e];
 	}
-    psl->match = qSize;	
-    psl->tNumInsert = psl->blockCount-1; 
-    psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize;
-    pslTabOut(psl, out);
-    qCdsStart = gp->cdsStart - gp->txStart;
-    qCdsEnd = gp->cdsEnd - gp->txStart;
+psl->match = qSize;	
+psl->tNumInsert = psl->blockCount-1; 
+psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize;
+pslTabOut(psl, pslFh);
+pslFree(&psl);
+if (gp->cdsStart < gp->cdsEnd)
+    cnvGenePredCds(gp, qSize, cdsFh);
+}
     
-    for (e = 1; e < gp->exonCount; ++e)
+static void fakePslFromGenePred(char *db, char *fileTbl, char *pslOut, char *cdsOut)
+/* check a genePred */
+{
+struct sqlConnection *conn = NULL;
+struct genePredReader *gpr;
+struct genePred *gp;
+FILE *pslFh = mustOpen(pslOut, "w");
+FILE *cdsFh = mustOpen(cdsOut, "w");
+
+conn = hAllocConn(db);
+
+if (fileExists(fileTbl))
 	{
-	int intronSize = gp->exonStarts[e] - gp->exonEnds[e-1];
-	if (gp->exonStarts[e] <= gp->cdsStart)
-	    qCdsStart -= intronSize;
-	if (gp->exonStarts[e] <= gp->cdsEnd)
-	    qCdsEnd -= intronSize;
+    gpr = genePredReaderFile(fileTbl, NULL);
 	} 
-    
-    if (gp->strand[0] == '-')
+else
 	{
-	int temp = qCdsEnd;
-	qCdsEnd = qSize - qCdsStart;
-	qCdsStart = qSize - temp;
+    gpr = genePredReaderQuery(conn, fileTbl, NULL);
 	}
 	
-    fprintf(cds,"%s\t", gp->name);
-    //if (gp->strand[0] == '-')
-    //	fprintf(cds,"complement(");
-    
-    fprintf(cds,"%d..%d", qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */
-    
-    //if (gp->strand[0] == '-')
-    //	fprintf(cds,")");
-    fprintf(cds,"\n");
-    
-    pslFree(&psl);
+while ((gp = genePredReaderNext(gpr)) != NULL)
+    {
+    cnvGenePred(db, gp, pslFh, cdsFh);
     }
 genePredReaderFree(&gpr);
-if (conn != NULL)
-    hFreeConn(&conn);
+hFreeConn(&conn);
+carefulClose(&pslFh);
+carefulClose(&cdsFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 if (argc != 5)
     usage();
 fakePslFromGenePred(argv[1],argv[2],argv[3],argv[4]);
 return 0;
 }