b642fe3045bbc0df86e56957d61d7851fc5a127c
hiram
  Tue Aug 30 14:10:06 2016 -0700
add qSizes argument so the qSizes can be adjusted to their true RNA size refs #13673

diff --git src/hg/genePredToFakePsl/genePredToFakePsl.c src/hg/genePredToFakePsl/genePredToFakePsl.c
index 346f50e..8b38107 100644
--- src/hg/genePredToFakePsl/genePredToFakePsl.c
+++ src/hg/genePredToFakePsl/genePredToFakePsl.c
@@ -1,58 +1,63 @@
 /* genePredToFakePsl - create fake .psl of mRNA aligned to dna from genePred file or table. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "options.h"
 #include "portable.h"
 #include "hash.h"
 #include "hdb.h"
 #include "genePred.h"
 #include "genePredReader.h"
 #include "psl.h"
 
 
 /* Command line switches. */
-char *chromSizes = NULL;  /* read chrom sizes from file instead of database . */
+static char *chromSizes = NULL;  /* read chrom sizes from file instead of database . */
+static char *qSizes = NULL;  /* read query sizes from file */
+static struct hash *qSizeHash = NULL;  /* key is name, value is size */
 
 /* Command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"chromSize", OPTION_STRING},
+    {"qSizes", OPTION_STRING},
     {NULL, 0}
 };
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "genePredToFakePsl - Create a psl of fake-mRNA aligned to gene-preds from a file or table.\n"
   "usage:\n"
   "   genePredToFakePsl [options] db fileTbl pslOut cdsOut\n"
   "\n"
   "If fileTbl is an existing file, then it is used.\n"
   "Otherwise, the table by this name is used.\n"
   "\n"
   "pslOut specifies the fake-mRNA output psl filename.\n"
   "\n"
   "cdsOut specifies the output cds tab-separated file which contains\n"
   "genbank-style CDS records showing cdsStart..cdsEnd\n"  
   "e.g. NM_123456 34..305\n"
   "options:\n"
   "   -chromSize=sizefile\tRead chrom sizes from file instead of database\n"
   "             sizefile contains two white space separated fields per line:\n"
   "		chrom name and size\n"
+  "   -qSizes=qSizesFile\tRead in query sizes to fixup qSize and qStarts\n"
+
   "\n");
 }
 
 static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh)
 /* determine CDS and output */
 {
 /*
  * Warning: Genbank CDS does't have the ability to represent
  * partial codons.  If we have genePreds created from GFF/GTF, they can have
  * partial codons, which is indicated in frame.  This code does not correctly handle
  * this case, or frame shifting indels.
  */
 int e, off = 0;
 int qCdsStart = -1, qCdsEnd = -1;
 int eCdsStart, eCdsEnd;
@@ -68,40 +73,53 @@
     off += gp->exonEnds[e] - gp->exonStarts[e];
     } 
 if (gp->strand[0] == '-')
     reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
 fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */
 }
 
 static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
 /* convert a genePred to a psl and CDS */
 {
 int chromSize = hashIntValDefault(chromHash, gp->chrom, 0);
 if (chromSize == 0)
     errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom);
 int e = 0, qSize=0;
 
+
 for (e = 0; e < gp->exonCount; ++e)
     qSize+=(gp->exonEnds[e] - gp->exonStarts[e]);
-struct psl *psl = pslNew(gp->name, qSize, 0, qSize,
+
+int sizeAdjust = 0;
+if (qSizes != NULL)
+    {
+    int realSize = hashIntValDefault(qSizeHash, gp->name, 0);
+    if (realSize > 0)
+       sizeAdjust = realSize - qSize;
+    }
+
+struct psl *psl = pslNew(gp->name, qSize+sizeAdjust, 0, qSize,
                          gp->chrom, chromSize, gp->txStart, gp->txEnd,
                          gp->strand, gp->exonCount, 0);
+/* no size adjustment to qStarts for positive strand */
+if (gp->strand[0] == '+')
+   sizeAdjust = 0;
 psl->blockCount = gp->exonCount;		    
 for (e = 0; e < gp->exonCount; ++e)
     {
     psl->blockSizes[e] = (gp->exonEnds[e] - gp->exonStarts[e]);
-    psl->qStarts[e] = e==0 ? 0 : psl->qStarts[e-1] + psl->blockSizes[e-1];
+    psl->qStarts[e] = e==0 ? 0 + sizeAdjust : psl->qStarts[e-1] + psl->blockSizes[e-1];
     psl->tStarts[e] = gp->exonStarts[e];
     }
 psl->match = qSize;	
 psl->tNumInsert = psl->blockCount-1; 
 psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize;
 pslTabOut(psl, pslFh);
 pslFree(&psl);
 if (gp->cdsStart < gp->cdsEnd)
     cnvGenePredCds(gp, qSize, cdsFh);
 }
 
 static struct hash *getChromHash(char *db)
 /* Return a hash of chrom names and sizes, from either -chromSize=file or db */
 {
 struct hash *chromHash = NULL;
@@ -135,20 +153,23 @@
 
 while ((gp = genePredReaderNext(gpr)) != NULL)
     {
     cnvGenePred(chromHash, gp, pslFh, cdsFh);
     }
 genePredReaderFree(&gpr);
 carefulClose(&pslFh);
 carefulClose(&cdsFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 chromSizes = optionVal("chromSize", NULL);
+qSizes = optionVal("qSizes", NULL);
 if (argc != 5)
     usage();
+if (qSizes != NULL)
+    qSizeHash = hChromSizeHashFromFile(qSizes);
 fakePslFromGenePred(argv[1],argv[2],argv[3],argv[4]);
 return 0;
 }