bc3bbb2794e3067f0ff35ac2003fd69362e2ed58
angie
  Thu Mar 8 11:34:17 2018 -0800
Libified genePredToFakePsl's code for extracting PSL and CDS from genePred so I can use it to infer PSL+CDS from genePred tracks like Gencode for HGVS.
refs #21076

diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index 98fec64..c66f33b 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -2398,15 +2398,89 @@
 char* cds = getCdsCodons(gp, genomeSeqs);
 char *prot = translateCds(gp->chrom, cds, options);
 
 if (protRet != NULL)
     *protRet = prot;
 else
     freeMem(prot);
 if (cdsRet != NULL)
     *cdsRet = cds;
 else
     freeMem(cds);
 
 if (!haveFrames)
     freez(&gp->exonFrames);
 }
+
+void genePredToCds(struct genePred *gp, struct genbankCds *cds)
+/* Fill in cds with transcript offsets computed from genePred. */
+{
+/*
+ * Warning: Genbank CDS does't have the ability to represent
+ * partial codons.  If we have genePreds created from GFF/GTF, they can have
+ * partial codons, which is indicated in frame.  This code does not correctly handle
+ * this case, or frame shifting indels.
+ */
+cds->start = cds->end = -1;
+cds->startComplete = cds->endComplete = cds->complement = FALSE;
+if (gp->cdsEnd > gp->cdsStart)
+    {
+    int e, off = 0;
+    int qCdsStart = -1, qCdsEnd = -1;
+    for (e = 0; e < gp->exonCount; ++e)
+        {
+        int eCdsStart, eCdsEnd;
+        if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd))
+            {
+            if (qCdsStart < 0)
+                qCdsStart = off + (eCdsStart - gp->exonStarts[e]);
+            qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]);
+            }
+        off += gp->exonEnds[e] - gp->exonStarts[e];
+        }
+    int qSize = off;
+    if (gp->strand[0] == '-')
+        reverseIntRange(&qCdsStart, &qCdsEnd, qSize);
+    cds->start = qCdsStart;
+    cds->end = qCdsEnd;
+    cds->startComplete = (gp->cdsStartStat != cdsIncomplete);
+    cds->endComplete = (gp->cdsEndStat != cdsIncomplete);
+    }
+}
+
+struct psl *genePredToPsl(struct genePred *gp, int chromSize, int qSize)
+/* Convert a genePred to psl, assuming perfect concordance between target & query.
+ * If qSize is 0 then the number of aligned bases will be used as qSize. */
+{
+int e = 0, aliSize = 0;
+for (e = 0; e < gp->exonCount; ++e)
+    aliSize += (gp->exonEnds[e] - gp->exonStarts[e]);
+struct psl *psl = pslNew(gp->name, qSize ? qSize : aliSize, 0, aliSize,
+                         gp->chrom, chromSize, gp->txStart, gp->txEnd,
+                         gp->strand, gp->exonCount, 0);
+// If qSize is greater than aliSize then we assume the extra bases are at the end of
+// the transcript (poly-A tail).  If the alignment is on the '-' strand, then we need
+// to offset the reversed qStarts by the number of extra bases.
+int sizeAdjust = (gp->strand[0] == '-' && qSize > aliSize) ? (qSize - aliSize) : 0;
+int i = -1;
+for (e = 0; e < gp->exonCount; ++e)
+    {
+    if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1])
+        {
+        i++;
+        psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]);
+        psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1];
+//#*** TODO: use exonFrame to detect ribosomal frameshift that makes us skip a base on t
+        psl->tStarts[i] = gp->exonStarts[e];
+        }
+    else
+        {
+        // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure
+        psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]);
+        }
+    }
+psl->blockCount = i+1;
+psl->match = aliSize;
+psl->tNumInsert = psl->blockCount-1;
+psl->tBaseInsert = (gp->txEnd - gp->txStart) - aliSize;
+return psl;
+}