cb4ac73d3a456f2b757fd10112a44c2abffbb54f
tdreszer
  Tue Apr 23 13:46:43 2013 -0700
Added routines for getting genePred coding DNA and determining coding position.  To be used in haplotypes code about to be checked in.
diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c
index 8295832..89224f7 100644
--- src/hg/lib/genePred.c
+++ src/hg/lib/genePred.c
@@ -1931,15 +1931,107 @@
     "    char[1] strand;     \"+ or - for strand\"\n"
     "    uint txStart;	\"Transcription start position\"\n"
     "    uint txEnd;         \"Transcription end position\"\n"
     "    uint cdsStart;	\"Coding region start\"\n"
     "    uint cdsEnd;        \"Coding region end\"\n"
     "    uint exonCount;     \"Number of exons\"\n"
     "    uint[exonCount] exonStarts; \"Exon start positions\"\n"
     "    uint[exonCount] exonEnds;   \"Exon end positions\"\n"
     "    )\n";
 
 struct asObject *genePredAsObj()
 // Return asObject describing fields of genePred
 {
 return asParseText(genePredAutoSqlString);
 }
+
+struct dnaSeq *genePredGetDna(char *database, struct genePred *gp,
+                              boolean coding, enum dnaCase dnaCase)
+// Returns the DNA sequence associated with gene prediction.
+// Negative strand genes will return the sequence as read from the negative strand.
+// Optionally restrict to coding sequence only
+{
+struct dnaSeq *dnaSeq = hDnaFromSeq(database, gp->chrom, gp->txStart, gp->txEnd,  dnaCase);
+if (dnaSeq == NULL)
+    return NULL;
+DNA *seq = dnaSeq->dna;
+int len = dnaSeq->size;
+if (coding)
+    {
+    int cdnaOffset = 0, exonIx;
+    for (exonIx = 0; exonIx < gp->exonCount; exonIx++)
+        {
+        int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart);
+        int exonEnd   = min(gp->exonEnds[  exonIx],gp->cdsEnd  );
+        if (exonStart >= exonEnd) // non-coding exon
+            continue;
+        exonStart -= gp->txStart;
+        exonEnd   -= gp->txStart;
+        int exonSize  = exonEnd - exonStart;
+
+        assert(cdnaOffset <= exonStart && exonEnd <= len);
+        if (cdnaOffset < exonStart)
+            memmove(seq + cdnaOffset, seq + exonStart, exonSize);
+        cdnaOffset += exonSize;
+        }
+    seq[cdnaOffset] = '\0';
+    dnaSeq->size = cdnaOffset;
+    }
+
+if (gp->strand[0] == '-')
+    reverseComplement(seq,dnaSeq->size);
+
+return dnaSeq;
+}
+
+int genePredBaseToCodingPos(struct genePred *gp, int basePos,
+                            boolean stranded, boolean *isCoding)
+// Given a genePred model and a single (0 based) base position, predict the 0-based
+// DNA (stranded) coding sequence pos.  Dividing this number by 3 should give the AA position!
+// Returns -1 when outside of coding exons unless OPTIONAL isCoding pointer to boolean is
+// provided. In that case, returns last valid position and sets isCoding to FALSE.
+{
+if (isCoding == NULL && (basePos < gp->cdsStart || basePos >= gp->cdsEnd))
+    return -1;
+
+boolean reverse = (stranded && gp->strand[0] == '-');
+assert(gp->exonStarts != NULL && gp->exonEnds != NULL);
+
+// Walk through exons, advancing depending upon strand
+int codingBasesSoFar = 0;
+int exonIx = (reverse ? (gp->exonCount - 1) : 0);
+while (0 <= exonIx && exonIx < gp->exonCount)
+    {
+    int exonStart = max(gp->exonStarts[exonIx],gp->cdsStart);
+    int exonEnd   = min(gp->exonEnds[  exonIx],gp->cdsEnd  );
+    if (exonStart < exonEnd) // coding exon
+        {
+        // Within this exon?
+        if (exonStart <= basePos && basePos < exonEnd)
+            {
+            if (isCoding != NULL)
+                *isCoding = TRUE;
+            if (reverse)
+                return (codingBasesSoFar + (exonEnd - basePos));
+            else
+                return (codingBasesSoFar + (basePos - exonStart));
+            }
+
+        // Past the target base already?
+        if (( reverse && basePos >= exonEnd  )
+        ||  (!reverse && basePos <  exonStart))
+            break;
+
+        // Yet to reach the target base... accumulate exon worth
+        codingBasesSoFar += (exonEnd - exonStart);
+        }
+
+    exonIx += (reverse ? -1 : 1);
+    }
+
+if (isCoding != NULL && codingBasesSoFar > 0)
+    {
+    *isCoding = FALSE;
+    return codingBasesSoFar;
+    }
+return -1;  // introns not okay
+}