6085f4dbce05570d9daefdec328b2fcb6fc9bf9f
hartera
  Mon Jul 16 14:19:11 2012 -0700
Added code so that if the utr option is set, then alignments are retrieved for all exons not just the CDS region.
diff --git src/hg/lib/mafGene.c src/hg/lib/mafGene.c
index 38bcd9f..4b5da13 100644
--- src/hg/lib/mafGene.c
+++ src/hg/lib/mafGene.c
@@ -1,866 +1,877 @@
 
 #include "common.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "hdb.h"
 #include "maf.h"
 #include "obscure.h"
 #include "genePred.h"
 #include "mafGene.h"
 
 
 struct exonInfo
 {
     struct exonInfo *next;
     struct mafAli *ali;
     int exonStart;
     int exonSize;
     int chromStart;
     int chromEnd;
     int frame;
     char strand;
     char *name;
 };
 
 struct speciesInfo
 {
     struct speciesInfo *next;
     char *name;
     int size;
     char *nucSequence;
     char *aaSequence;
     int aaSize;
     struct slName *posStrings;
     struct slName *curPosString;
     char *posString;
     char *chrom;
     int start, end;
     char strand;
     char frameStrand;
 };
 
 /* rather than allocate space for these every time,
  * just re-use a global resource.  It's not 
  * re-entrant safe, but....
  */
 #define MAX_EXON_SIZE 100 * 1024
 static char exonBuffer[MAX_EXON_SIZE];
 
 static char bigBuffer[100 * 1024];
 
 #define MAX_COMPS  	5000 
 char *compText[MAX_COMPS]; 
 char *siText[MAX_COMPS]; 
 
 /* is the sequence all dashes ? */
 static boolean allDashes(char *seq)
 {
 while (*seq)
     if (*seq++ != '-')
 	return FALSE;
 
 return TRUE;
 }
 
 /* translate a nuc sequence into amino acids. If there
  * are any dashes in any of the three nuc positions
  * make the AA a dash.
  */
 static aaSeq *doTranslate(struct dnaSeq *inSeq, unsigned offset, 
     unsigned inSize, boolean stop)
 {
 aaSeq *seq;
 DNA *dna = inSeq->dna;
 AA *pep, aa;
 int i, lastCodon;
 int actualSize = 0;
 
 assert(offset <= inSeq->size);
 if ((inSize == 0) || (inSize > (inSeq->size - offset)))
     inSize = inSeq->size - offset;
 lastCodon = offset + inSize - 3;
 
 AllocVar(seq);
 seq->dna = pep = needLargeMem(inSize/3+1);
 for (i=offset; i <= lastCodon; i += 3)
     {
     aa = lookupCodon(dna+i);
     if (aa == 'X')
 	{
 	if ((dna[i] == '-') ||
 	    (dna[i+1] == '-') ||
 	    (dna[i+2] == '-'))
 	    aa = '-';
 	}
     if (aa == 0)
 	{
         if (stop)
 	    break;
 	else
 	    aa = 'Z';
 	}
     *pep++ = aa;
     ++actualSize;
     }
 *pep = 0;
 assert(actualSize <= inSize/3+1);
 seq->size = actualSize;
 return seq;
 }
 
 static struct mafAli *getRefAli(char *database, char *chrom, int start, int end)
 {
 struct mafAli *ali;
 struct mafComp *comp;
 char buffer[1024];
 
 AllocVar(ali);
 AllocVar(comp);
 ali->components = comp;
 ali->textSize = end - start;
 
 safef(buffer, sizeof buffer, "%s.%s", database, chrom);
 comp->src = cloneString(buffer);
 comp->start = start;
 comp->strand = '+';
 comp->size = end - start;
 struct dnaSeq *seq = hChromSeqMixed(database, chrom, start , end);
 comp->text = cloneString(seq->dna);
 freeDnaSeq(&seq);
 
 return ali;
 }
 
 /* make sure we have the whole range even if
  * there isn't a maf loaded in this region
  */
 static struct mafAli *padOutAli(struct mafAli *list, char *database, 
     char *chrom, int start, int end)
 {
 if (list == NULL)
     {
     struct mafAli *ali = getRefAli(database, chrom, start, end);
     return ali;
     }
 
 int aliStart = list->components->start;
 if (start != aliStart)
     {
     struct mafAli *ali = getRefAli(database, chrom, start, aliStart);
     slAddHead(&list, ali);
     }
 
 struct mafAli *last = list;
 for(; last->next; last = last->next)
     ;
 
 int aliEnd = last->components->start + last->components->size;
 if (end != aliEnd)
     {
     struct mafAli *ali = getRefAli(database, chrom, aliEnd, end);
     slAddTail(&list, ali);
     }
 
 return list;
 }
 
 static struct mafAli *getAliForRange(char *database, char *mafTable, 
     char *chrom, int start, int end)
 {
 struct sqlConnection *conn = hAllocConn(database);
 struct mafAli *aliAll = mafLoadInRegion(conn, mafTable,
 	chrom, start, end);
 struct mafAli *ali;
 struct mafAli *list = NULL;
 struct mafAli *nextAli;
 
 hFreeConn(&conn);
 
 for(ali = aliAll; ali; ali = nextAli)
     {
     nextAli = ali->next;
     ali->next = NULL;
 
     char *masterSrc = ali->components->src;
     struct mafAli *subAli = NULL;
 
     if (mafNeedSubset(ali, masterSrc, start, end))
 	{
 	subAli = mafSubset( ali, masterSrc, start, end);
 	if (subAli == NULL)
 	    continue;
 	}
 
     if (subAli)
 	{
 	slAddHead(&list, subAli);
 	mafAliFree(&ali);
 	}
     else
 	slAddHead(&list, ali);
     }
 slReverse(&list);
 
 list = padOutAli(list, database, chrom, start, end);
 
 return list;
 }
 
 
 /* allocate space for the nuc and aa sequence for each species */
 static struct speciesInfo *getSpeciesInfo(struct exonInfo *giList, 
     struct slName *speciesNames, struct hash *siHash)
 {
 struct exonInfo *gi;
 int size = 0;
 struct speciesInfo *siList = NULL;
 
 for(gi = giList ; gi ; gi = gi->next)
     size += gi->exonSize;
 
 struct slName *name = speciesNames;
 
 for(; name ; name = name->next)
     {
     struct speciesInfo *si;
 
     AllocVar(si);
     si->frameStrand = giList->strand;
     si->name = name->name;
     verbose(2, "size %d\n", size);
     si->size = size;
     si->nucSequence = needMem(size + 1);
     memset(si->nucSequence, '-', size);
     si->aaSequence = needMem(size/3 + 1);
     hashAdd(siHash, si->name, si);
     slAddHead(&siList, si);
     }
 slReverse(&siList);
 
 return siList;
 }
 
 
 static void outSpeciesExons(FILE *f, char *dbName, struct speciesInfo *si, 
     struct exonInfo *giList, boolean doBlank, boolean doTable, int numCols)
 {
 int exonNum = 1;
 struct dnaSeq thisSeq;
 aaSeq *outSeq;
 int exonCount = 0;
 struct exonInfo *gi = giList;
 
 for(; gi; gi = gi->next)
     {
     if (gi->exonSize > 1)
 	exonCount++;
     }
 
 for(gi = giList; gi; gi = gi->next, exonNum++)
     {
     struct speciesInfo *siTemp = si;
     
     if (gi->exonSize == 1)
 	continue;
 
     for(; siTemp ; siTemp = siTemp->next)
 	{
 	char *ptr = exonBuffer;
 
 	switch(gi->frame)
 	    {
 	    case 0: /* just copy the sequence over */
 		memcpy(ptr, 
 		    &siTemp->nucSequence[gi->exonStart], gi->exonSize);
 		ptr += gi->exonSize;
 		break;
 	    case 1: /* we need to grab one nuc from the end 
 	             * of the previous exon */
 		*ptr++ = siTemp->nucSequence[gi->exonStart - 1];
 		memcpy(ptr, 
 		    &siTemp->nucSequence[gi->exonStart], gi->exonSize);
 		ptr += gi->exonSize;
 		break;
 
 	    case 2: /* we need to delete the first nuc from this exon
 	             * because we included it on the last exon */
 		memcpy(ptr, 
 		    &siTemp->nucSequence[gi->exonStart+1], gi->exonSize - 1);
 		ptr += gi->exonSize - 1;
 		break;
 	    }
 
 	int lastFrame = (gi->frame + gi->exonSize) % 3;
 	if (lastFrame == 1) /* delete the last nucleotide */
 	    --ptr;
 	else if (lastFrame == 2) /* add one more nucleotide from
 	                          * the next exon */
 	    *ptr++ = siTemp->nucSequence[gi->exonStart + gi->exonSize];
 	*ptr++ = 0;   /* null terminate */
 
 	thisSeq.dna = exonBuffer;
 	thisSeq.size = ptr - exonBuffer;
 	outSeq =  doTranslate(&thisSeq, 0,  0, FALSE);
 	char buffer[10 * 1024];
 
 	safef(buffer, sizeof buffer,  "%s_%s_%d_%d %d %d %d %s",
 	    gi->name, 
 	    siTemp->name, exonNum, exonCount, 
 	    outSeq->size,
 	    gi->frame, lastFrame,
 	    siTemp->curPosString->name);
 
 	if (doBlank || !allDashes(outSeq->dna))
 	    {
 	    if (doTable)
 		{
 		if (numCols == -1)
 		    fprintf(f, "%s ", buffer);
 		else
 		    {
 		    if (strlen(buffer) > numCols)
 			buffer[numCols] = 0;
 		    fprintf(f, "%-*s ", numCols, buffer);
 		    }
 		}
 	    else
 		fprintf(f, ">%s\n", buffer);
 
 	    fprintf(f, "%s\n", outSeq->dna);
 	    }
 	siTemp->curPosString = siTemp->curPosString->next;
 	}
     fprintf(f, "\n");
     }
 fprintf(f, "\n");
 }
 
 static void outSpeciesExonsNoTrans(FILE *f, char *dbName, 
     struct speciesInfo *si, struct exonInfo *giList, boolean doBlank,
     boolean doTable, int numCols)
 {
 int exonNum = 1;
 int exonCount = 0;
 struct exonInfo *gi;
 
 for(gi = giList; gi; gi = gi->next)
     exonCount++;
 
 for(gi = giList; gi; gi = gi->next, exonNum++)
     {
     struct speciesInfo *siTemp = si;
 
     int lastFrameNum = (gi->frame + gi->exonSize) % 3;
 
     for(; siTemp ; siTemp = siTemp->next)
 	{
 	int start = gi->exonStart;
 	int end = start + gi->exonSize;
 	char *ptr = &siTemp->nucSequence[gi->exonStart];
 
 	for (; start < end; start++, ptr++)
 	    if (*ptr != '-')
 		break;
 
 	if (!doBlank && (start == end))
 	    {
 	    siTemp->curPosString = siTemp->curPosString->next;
 	    continue;
 	    }
 
 	start = gi->exonStart;
 	ptr = &siTemp->nucSequence[gi->exonStart];
 	char buffer[10 * 1024];
 
 	safef(buffer, sizeof buffer, "%s_%s_%d_%d %d %d %d %s",
 	    gi->name, 
 	    siTemp->name, exonNum, exonCount, 
 	    gi->exonSize,
 	    gi->frame, lastFrameNum,
 	    siTemp->curPosString->name);
 
 	if (doTable)
 	    {
 	    if (numCols == -1)
 		fprintf(f, "%s ", buffer);
 	    else
 		{
 		if (strlen(buffer) > numCols)
 		    buffer[numCols] = 0;
 		fprintf(f, "%-*s ", numCols, buffer);
 		}
 	    }
 	else
 	    fprintf(f, ">%s\n", buffer);
 
 	siTemp->curPosString = siTemp->curPosString->next;
 
 	for (; start < end; start++)
 	    fprintf(f, "%c", *ptr++);
 	fprintf(f, "\n");
 	}
     fprintf(f, "\n");
     }
 fprintf(f, "\n");
 }
 
 /* translate nuc sequence into an sequence of amino acids */
 static void translateProtein(struct speciesInfo *si)
 {
 struct dnaSeq thisSeq;
 aaSeq *outSeq;
 
 thisSeq.dna = si->nucSequence;
 thisSeq.size = si->size;
 outSeq =  doTranslate(&thisSeq, 0,  0, FALSE);
 si->aaSequence  = outSeq->dna;
 si->aaSize = outSeq->size;
 }
 
 static char *allPos(struct speciesInfo *si)
 {
 char *ptr = bigBuffer;
 struct slName *names = si->posStrings;
 int size = sizeof bigBuffer;
 
 for(; names ; names = names->next)
     {
     int sz = safef(ptr, size, "%s", names->name);
     ptr += sz;
     size -= sz;
     if (names->next)
 	{
 	safef(ptr, size, ";");
 	ptr++;
 	size--;
 	}
     }
 
 return bigBuffer;
 }
 
 /* output a particular species sequence to the file stream */
 static void writeOutSpecies(FILE *f, char *dbName, struct speciesInfo *si, 
     struct exonInfo *giList, unsigned options, int numCols)
 {
 boolean inExons = options & MAFGENE_EXONS;
 boolean noTrans = options & MAFGENE_NOTRANS;
 boolean doBlank = options & MAFGENE_OUTBLANK;
 boolean doTable = options & MAFGENE_OUTTABLE;
 
 if (inExons)
     {
     if (noTrans)
 	outSpeciesExonsNoTrans(f, dbName, si, giList, doBlank, 
 	    doTable, numCols);
     else
 	outSpeciesExons(f, dbName, si, giList, doBlank, doTable, numCols);
     return;
     }
 
 struct exonInfo *lastGi;
 
 for(lastGi = giList; lastGi->next ; lastGi = lastGi->next)
     ;
 
 if (noTrans)
     {
     for(; si ; si = si->next)
 	{
 	if (doBlank || !allDashes(si->nucSequence))
 	    {
 	    char buffer[10 * 1024];
 
 	    safef(buffer, sizeof buffer, "%s_%s %d %s",
 		giList->name, si->name, si->size, allPos(si));
 
 	    if (doTable)
 		{
 		if (numCols == -1)
 		    fprintf(f, "%s ", buffer);
 		else
 		    {
 		    if (strlen(buffer) > numCols)
 			buffer[numCols] = 0;
 		    fprintf(f, "%-*s ", numCols, buffer);
 		    }
 		}
 	    else
 		fprintf(f, ">%s\n", buffer);
 
 	    fprintf(f, "%s\n", si->nucSequence);
 	    }
 	}
     fprintf(f, "\n\n");
     }
 else
     {
     for(; si ; si = si->next)
 	{
 	translateProtein(si);
 
 	char buffer[10 * 1024];
 
 	safef(buffer, sizeof buffer, "%s_%s %d %s",
 	    giList->name, si->name, si->aaSize, allPos(si));
 
 	if (doBlank || !allDashes(si->aaSequence))
 	    {
 	    if (doTable)
 		{
 		if (numCols == -1)
 		    fprintf(f, "%s ", buffer);
 		else
 		    {
 		    if (strlen(buffer) > numCols)
 			buffer[numCols] = 0;
 		    fprintf(f, "%-*s ", numCols, buffer);
 		    }
 		}
 	    else
 		fprintf(f, ">%s\n", buffer);
 
 	    fprintf(f, "%s\n", si->aaSequence);
 	    }
 	}
     fprintf(f, "\n\n");
     }
 }
 
 static void flushPosString(struct speciesInfo *si)
 {
 if (si->chrom != NULL)
     {
     char buffer[10*1024];
     char strand = '+';
 
     if (si->strand != si->frameStrand)
 	strand = '-';
 
     if (si->posString == NULL)
 	{
 	safef(buffer, sizeof buffer, "%s:%d-%d%c", si->chrom,
 	    si->start+1, si->end, strand);
 	}
     else
 	{
 	safef(buffer, sizeof buffer, "%s;%s:%d-%d%c", si->posString,
 	    si->chrom, si->start+1, si->end, strand);
 	freez(&si->posString);
 	}
     si->posString = cloneString(buffer);
     }
 
 si->chrom = NULL;
 }
 
 static void pushPosString(struct speciesInfo *si)
 {
 flushPosString(si);
 
 struct slName *newName = newSlName(si->posString);
 slAddTail(&si->posStrings, newName);
 
 freez(&si->posString);
 }
 
 static void updatePosString(struct speciesInfo *si, char *chrom, 
 		char strand, int start, int end)
 {
 if (start == end)
     return;
 
 if ((si->chrom == NULL) || 
     !sameString(si->chrom, chrom) ||
     si->strand != strand)
     {
     flushPosString(si);
 
     si->chrom = chrom;
     si->strand = strand;
     si->start = start;
     si->end = end;
     }
 
 if (strand == '+')
     {
     si->end = end;
     }
 else
     {
     si->start = start;
     }
 }
 
 /* copy the maf alignments into the species sequence buffers.
  * remove all the dashes from the reference sequence, and collapse
  * all the separate maf blocks into one sequence
  */
 static int copyAli(struct hash *siHash, struct mafAli *ali, int start)
 {
 struct mafComp *comp = ali->components;
 int jj;
 
 for(; comp; comp = comp->next)
     {
     char *chrom = strchr(comp->src, '.');
 
     if (chrom == NULL)
 	errAbort("all components must have a '.'");
 
     *chrom++ = 0;
 
     struct speciesInfo *si = hashFindVal(siHash, comp->src);
     if (si == NULL)
 	continue;
 
     if (comp->strand == '+')
 	updatePosString(si, chrom, comp->strand, 
 	    comp->start, comp->start + comp->size);
     else
 	updatePosString(si, chrom, comp->strand, 
 	    comp->srcSize - (comp->start + comp->size), 
 	    comp->srcSize - comp->start);
 
 
     char *tptr = ali->components->text;
     int size = 0;
     for(jj = 0 ; jj < ali->textSize; jj++)
 	if (*tptr++ != '-')
 	    size++;
 
     /* check to make sure maf is sane (no overlaps) */
     if (start + size >= si->size + 1)
 	errAbort("bad maf, nucSequence buffer overflow %d %d %d\n", 
 	    start,size, si->size);
 
     char *cptr = comp->text;
     char *sptr = &si->nucSequence[start];
     char *mptr = ali->components->text;
     if (cptr != NULL)
 	{
 	for(jj = 0 ; jj < ali->textSize; jj++)
 	    {
 	    if (*mptr++ != '-')
 		{
 		if (cptr != NULL)
 		    *sptr++ = *cptr++;
 		}
 	    else 
 		cptr++;
 	    }
 	}
     }
 
 char *mptr = ali->components->text;
 int count = 0;
 
 for(jj = 0 ; jj < ali->textSize; jj++)
     if (*mptr++ != '-')
 	count++;
 
 return start + count;
 }
 
 /* copyMafs - copy all the maf alignments into 
  * one sequence for each species
  */
 static void copyMafs(struct hash *siHash, struct exonInfo **giList, 
     boolean inExons)
 {
 int start = 0;
 struct exonInfo *gi = *giList;
 
 for(; gi; gi = gi->next)
     {
     int thisSize = 0;
     struct mafAli *ali = gi->ali;
 
     for(; ali; ali = ali->next)
 	{
 	int newStart = copyAli(siHash, ali, start);
 	thisSize += newStart - start;
 	start = newStart;
 	}
     struct hashCookie cookie =  hashFirst(siHash);
     struct hashEl *hel;
 
     if (inExons || (gi->next == NULL))
 	while ((hel = hashNext(&cookie)) != NULL)
 	    {
 	    struct speciesInfo *si = hel->val;
 
 	    pushPosString(si);
 	    }
     }
 
 boolean frameNeg = ((*giList)->strand == '-');
 
 if (frameNeg)
     {
     int size = 0;
     struct hashCookie cookie =  hashFirst(siHash);
     struct hashEl *hel;
     
     verbose(3, "flipping exons\n");
     while ((hel = hashNext(&cookie)) != NULL)
 	{
 	struct speciesInfo *si = hel->val;
 
 	if (si == NULL)
 	    continue;
 
 	if (size != 0)
 	    assert(size == si->size);
 	else
 	    size = si->size;
 	reverseComplement(si->nucSequence, si->size);
 
 	slReverse(&si->posStrings);
 	}
 
     gi = *giList;
     for(; gi; gi = gi->next)
 	{
 	verbose(3, "old start %d ",gi->exonStart);
 	gi->exonStart = size - (gi->exonStart + gi->exonSize);
 	verbose(3, "new start %d size %d \n",gi->exonStart, gi->exonSize);
 	if (gi->next == NULL)
 	    assert(gi->exonStart == 0);
 	}
 
     slReverse(giList);
     }
 }
 
 /* free species info list */
 static void freeSpeciesInfo(struct speciesInfo *list)
 {
 struct speciesInfo *siNext;
 
 for(; list ; list = siNext)
     {
     siNext = list->next;
 
     freez(&list->nucSequence);
     freez(&list->aaSequence);
     slNameFreeList(&list->posStrings);
     }
 }
 
 /* free exonInfo list */
 static void freeGIList(struct exonInfo *list)
 {
 struct exonInfo *giNext;
 
 for(; list ; list = giNext)
     {
     giNext = list->next;
 
     mafAliFreeList(&list->ali);
     }
 }
  
 static struct exonInfo *buildGIList(char *database, struct genePred *pred, 
-    char *mafTable)
+    char *mafTable, unsigned options)
 {
 struct exonInfo *giList = NULL;
 unsigned *exonStart = pred->exonStarts;
 unsigned *lastStart = &exonStart[pred->exonCount];
 unsigned *exonEnd = pred->exonEnds;
 int *frames = pred->exonFrames;
 
+boolean utr = options & MAFGENE_UTR;
+
 if (frames == NULL)
     {
     genePredAddExonFrames(pred);
     frames = pred->exonFrames;
     }
 
 assert(frames != NULL);
 
 int start = 0;
 
 
-/* first skip 5' UTR */
+/* first skip 5' UTR if the includeUtr option is not set */
+if (!utr) 
+    {
 for(; exonStart < lastStart; exonStart++, exonEnd++, frames++)
     {
     int size = *exonEnd - *exonStart;
 
     if (*exonStart + size > pred->cdsStart)
 	break;
     }
+    }
 
 for(; exonStart < lastStart; exonStart++, exonEnd++, frames++)
     {
     struct exonInfo *gi;
     int thisStart = *exonStart;
+    int thisEnd = *exonEnd;
 
+    if (!utr) 
+        {
     if (thisStart > pred->cdsEnd)
 	break;
 
     if (thisStart < pred->cdsStart)
 	thisStart = pred->cdsStart;
 
-    int thisEnd = *exonEnd;
 
     if (thisEnd > pred->cdsEnd)
 	thisEnd = pred->cdsEnd;
 
+	}
     int thisSize = thisEnd - thisStart;
+    if (!utr)
     verbose(3, "in %d %d cds %d %d\n",*exonStart,*exonEnd, thisStart, thisEnd);
-
     AllocVar(gi);
     gi->frame = *frames;
     gi->name = pred->name;
     gi->ali = getAliForRange(database, mafTable, pred->chrom, 
 	thisStart, thisEnd);
     gi->chromStart = thisStart;
     gi->chromEnd = thisEnd;
     gi->exonStart = start;
     gi->exonSize = thisSize;
     verbose(3, "exon size %d\n", thisSize);
     gi->strand = pred->strand[0];
     start += gi->exonSize;
     slAddHead(&giList, gi);
 
+    if (!utr) 
+        {
     if (thisEnd == pred->cdsEnd)
 	break;
     }
+    }
 slReverse(&giList);
 
 return giList;
 }
 
 void mafGeneOutPred(FILE *f, struct genePred *pred, char *dbName, 
     char *mafTable,  struct slName *speciesNameList, unsigned options,
     int numCols) 
 {
 boolean inExons = options & MAFGENE_EXONS;
 
 if (pred->cdsStart == pred->cdsEnd)
     return;
 
 if (numCols < -1)
     errAbort("Number of columns must be zero or greater.");
 
-struct exonInfo *giList = buildGIList(dbName, pred, mafTable);
+struct exonInfo *giList = buildGIList(dbName, pred, mafTable, options);
 
 if (giList == NULL)
     return;
 
 struct hash *speciesInfoHash = newHash(5);
 struct speciesInfo *speciesList = getSpeciesInfo(giList, speciesNameList, 
     speciesInfoHash);
 
 copyMafs(speciesInfoHash, &giList, inExons);
 
 struct speciesInfo *si = speciesList;
 for(; si ; si = si->next)
     si->curPosString = si->posStrings;
 
 writeOutSpecies(f, dbName, speciesList, giList, options, numCols);
 
 freeSpeciesInfo(speciesList);
 freeGIList(giList);
 }