bc3bbb2794e3067f0ff35ac2003fd69362e2ed58
angie
  Thu Mar 8 11:34:17 2018 -0800
Libified genePredToFakePsl's code for extracting PSL and CDS from genePred so I can use it to infer PSL+CDS from genePred tracks like Gencode for HGVS.
refs #21076

diff --git src/hg/genePredToFakePsl/genePredToFakePsl.c src/hg/genePredToFakePsl/genePredToFakePsl.c
index a4e37b1..9fc57a2 100644
--- src/hg/genePredToFakePsl/genePredToFakePsl.c
+++ src/hg/genePredToFakePsl/genePredToFakePsl.c
@@ -1,24 +1,25 @@
 /* genePredToFakePsl - create fake .psl of mRNA aligned to dna from genePred file or table. */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "options.h"
 #include "portable.h"
 #include "hash.h"
 #include "hdb.h"
+#include "genbank.h"
 #include "genePred.h"
 #include "genePredReader.h"
 #include "psl.h"
 
 
 /* Command line switches. */
 static char *chromSizes = NULL;  /* read chrom sizes from file instead of database . */
 static char *qSizes = NULL;  /* read query sizes from file */
 static struct hash *qSizeHash = NULL;  /* key is name, value is size */
 
 /* Command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"chromSize", OPTION_STRING},
     {"qSizes", OPTION_STRING},
     {NULL, 0}
@@ -40,105 +41,45 @@
   "cdsOut specifies the output cds tab-separated file which contains\n"
   "genbank-style CDS records showing cdsStart..cdsEnd\n"  
   "e.g. NM_123456 34..305\n"
   "options:\n"
   "   -chromSize=sizefile\tRead chrom sizes from file instead of database\n"
   "             sizefile contains two white space separated fields per line:\n"
   "		chrom name and size\n"
   "   -qSizes=qSizesFile\tRead in query sizes to fixup qSize and qStarts\n"
 
   "\n");
 }
 
 static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh)
 /* determine CDS and output */
 {
-/*
- * Warning: Genbank CDS does't have the ability to represent
- * partial codons.  If we have genePreds created from GFF/GTF, they can have
- * partial codons, which is indicated in frame.  This code does not correctly handle
- * this case, or frame shifting indels.
- */
-int e, off = 0;
-int qCdsStart = -1, qCdsEnd = -1;
-int eCdsStart, eCdsEnd;
-
-for (e = 0; e < gp->exonCount; ++e)
-    {
-    if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd))
-        {
-        if (qCdsStart < 0)
-            qCdsStart = off + (eCdsStart - gp->exonStarts[e]);
-        qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]);
-        }
-    off += gp->exonEnds[e] - gp->exonStarts[e];
-    } 
-if (gp->strand[0] == '-')
-    reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
-fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */
+struct genbankCds cds;
+genePredToCds(gp, &cds);
+fprintf(cdsFh,"%s\t%d..%d\n", gp->name, cds.start+1, cds.end); /* genbank cds is closed 1-based */
 }
 
 static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh)
 /* convert a genePred to a psl and CDS */
 {
 int chromSize = hashIntValDefault(chromHash, gp->chrom, 0);
 if (chromSize == 0)
     errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom);
-int e = 0, qSize=0;
-
-
-for (e = 0; e < gp->exonCount; ++e)
-    qSize+=(gp->exonEnds[e] - gp->exonStarts[e]);
-
-int realSize = 0;
-int sizeAdjust = 0;
+int qSize = 0;
 if (qSizes != NULL)
-    {
-    realSize = hashIntValDefault(qSizeHash, gp->name, 0);
-    // If there is a realSize (>0), but realSize is less than qSize, this subtraction would
-    // cause unsigned underflow so don't do it.  qSize > realSize implies that one of the genePred
-    // "exons" is glossing over a query gap.
-    if (realSize > qSize)
-       sizeAdjust = realSize - qSize;
-    }
-
-struct psl *psl = pslNew(gp->name, realSize ? realSize : qSize, 0, qSize,
-                         gp->chrom, chromSize, gp->txStart, gp->txEnd,
-                         gp->strand, gp->exonCount, 0);
-/* no size adjustment to qStarts for positive strand */
-if (gp->strand[0] == '+')
-   sizeAdjust = 0;
-int i = -1;
-for (e = 0; e < gp->exonCount; ++e)
-    {
-    if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1])
-        {
-        i++;
-        psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]);
-        psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1];
-        psl->tStarts[i] = gp->exonStarts[e];
-        }
-    else
-        {
-        // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure
-        psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]);
-        }
-    }
-psl->blockCount = i+1;
-psl->match = qSize;	
-psl->tNumInsert = psl->blockCount-1; 
-psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize;
+    qSize = hashIntValDefault(qSizeHash, gp->name, 0);
+struct psl *psl = genePredToPsl(gp, chromSize, qSize);
 pslTabOut(psl, pslFh);
 pslFree(&psl);
 if (gp->cdsStart < gp->cdsEnd)
     cnvGenePredCds(gp, qSize, cdsFh);
 }
 
 static struct hash *getChromHash(char *db)
 /* Return a hash of chrom names and sizes, from either -chromSize=file or db */
 {
 struct hash *chromHash = NULL;
 if (chromSizes != NULL)
     chromHash = hChromSizeHashFromFile(chromSizes);
 else
     chromHash = hChromSizeHash(db);
 return chromHash;