e70152e44cc66cc599ff6b699eb8adc07f3e656a
kent
  Sat May 24 21:09:34 2014 -0700
Adding Copyright NNNN Regents of the University of California to all files I believe with reasonable certainty were developed under UCSC employ or as part of Genome Browser copyright assignment.
diff --git src/lib/dnaMotif.c src/lib/dnaMotif.c
index 042e7b9..0e2f3f7 100644
--- src/lib/dnaMotif.c
+++ src/lib/dnaMotif.c
@@ -1,628 +1,631 @@
 /* dnaMotif.c was originally generated by the autoSql program, which also 
  * generated dnaMotif.h and dnaMotif.sql.  This module links the database and
  * the RAM representation of objects. */
 
+/* Copyright (C) 2011 The Regents of the University of California 
+ * See README in this or parent directory for licensing information. */
+
 #include "common.h"
 #include "linefile.h"
 #include "sqlList.h"
 #include "dystring.h"
 #include "dnaMotif.h"
 #include "portable.h"
 
 
 struct dnaMotif *dnaMotifCommaIn(char **pS, struct dnaMotif *ret)
 /* Create a dnaMotif out of a comma separated string. 
  * This will fill in ret if non-null, otherwise will
  * return a new dnaMotif */
 {
 char *s = *pS;
 int i;
 
 if (ret == NULL)
     AllocVar(ret);
 ret->name = sqlStringComma(&s);
 ret->columnCount = sqlSignedComma(&s);
 s = sqlEatChar(s, '{');
 AllocArray(ret->aProb, ret->columnCount);
 for (i=0; i<ret->columnCount; ++i)
     {
     ret->aProb[i] = sqlSignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 s = sqlEatChar(s, '{');
 AllocArray(ret->cProb, ret->columnCount);
 for (i=0; i<ret->columnCount; ++i)
     {
     ret->cProb[i] = sqlSignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 s = sqlEatChar(s, '{');
 AllocArray(ret->gProb, ret->columnCount);
 for (i=0; i<ret->columnCount; ++i)
     {
     ret->gProb[i] = sqlSignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 s = sqlEatChar(s, '{');
 AllocArray(ret->tProb, ret->columnCount);
 for (i=0; i<ret->columnCount; ++i)
     {
     ret->tProb[i] = sqlSignedComma(&s);
     }
 s = sqlEatChar(s, '}');
 s = sqlEatChar(s, ',');
 *pS = s;
 return ret;
 }
 
 void dnaMotifFree(struct dnaMotif **pEl)
 /* Free a single dynamically allocated dnaMotif such as created
  * with dnaMotifLoad(). */
 {
 struct dnaMotif *el;
 
 if ((el = *pEl) == NULL) return;
 freeMem(el->name);
 freeMem(el->aProb);
 freeMem(el->cProb);
 freeMem(el->gProb);
 freeMem(el->tProb);
 freez(pEl);
 }
 
 void dnaMotifFreeList(struct dnaMotif **pList)
 /* Free a list of dynamically allocated dnaMotif's */
 {
 struct dnaMotif *el, *next;
 
 for (el = *pList; el != NULL; el = next)
     {
     next = el->next;
     dnaMotifFree(&el);
     }
 *pList = NULL;
 }
 
 void dnaMotifOutput(struct dnaMotif *el, FILE *f, char sep, char lastSep) 
 /* Print out dnaMotif.  Separate fields with sep. Follow last field with lastSep. */
 {
 int i;
 if (sep == ',') fputc('"',f);
 fprintf(f, "%s", el->name);
 if (sep == ',') fputc('"',f);
 fputc(sep,f);
 fprintf(f, "%d", el->columnCount);
 fputc(sep,f);
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->columnCount; ++i)
     {
     fprintf(f, "%f", el->aProb[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 fputc(sep,f);
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->columnCount; ++i)
     {
     fprintf(f, "%f", el->cProb[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 fputc(sep,f);
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->columnCount; ++i)
     {
     fprintf(f, "%f", el->gProb[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 fputc(sep,f);
 if (sep == ',') fputc('{',f);
 for (i=0; i<el->columnCount; ++i)
     {
     fprintf(f, "%f", el->tProb[i]);
     fputc(',', f);
     }
 if (sep == ',') fputc('}',f);
 fputc(lastSep,f);
 }
 
 float dnaMotifSequenceProb(struct dnaMotif *motif, DNA *dna)
 /* Return probability of dna according to motif.  Make sure
  * motif is probabalistic (with call to dnaMotifMakeProbabalistic
  * if you're not sure) before calling this. */
 {
 float p = 1.0;
 int i;
 for (i=0; i<motif->columnCount; ++i)
     {
     switch (dna[i])
         {
 	case 'a':
 	case 'A':
 	    p *= motif->aProb[i];
 	    break;
 	case 'c':
 	case 'C':
 	    p *= motif->cProb[i];
 	    break;
 	case 'g':
 	case 'G':
 	    p *= motif->gProb[i];
 	    break;
 	case 't':
 	case 'T':
 	    p *= motif->tProb[i];
 	    break;
 	case 0:
 	    warn("dna shorter than motif");
 	    internalErr();
 	    break;
 	default:
 	    p *= 0.25;
 	    break;
 	}
     }
 return p;
 }
 
 static float dnaMotifSequenceProbWithMark0(struct dnaMotif *motif, DNA *dna, double mark0[5])
 {
 float p = 1.0;
 int i;
 for (i=0; i<motif->columnCount; ++i)
     {
     int val = ntVal[(int) dna[i]] + 1;
     p *= (mark0[val]/mark0[0]);
     }
 return p;
 }
 
 static float dnaMotifSequenceProbLog(struct dnaMotif *motif, DNA *dna)
 {
 float p = 0;
 int i;
 for (i=0; i<motif->columnCount; ++i)
     {
     switch (dna[i])
         {
 	case 'a':
 	case 'A':
 	    p += motif->aProb[i];
 	    break;
 	case 'c':
 	case 'C':
 	    p += motif->cProb[i];
 	    break;
 	case 'g':
 	case 'G':
 	    p += motif->gProb[i];
 	    break;
 	case 't':
 	case 'T':
 	    p += motif->tProb[i];
 	    break;
 	case 0:
 	    warn("dna shorter than motif");
 	    internalErr();
 	    break;
 	default:
 	    p += logBase2(0.25);
 	    break;
 	}
     }
 return p;
 }
 
 char dnaMotifBestStrand(struct dnaMotif *motif, DNA *dna)
 /* Figure out which strand of DNA is better for probabalistic motif. */
 {
 float fScore, rScore;
 fScore = dnaMotifSequenceProb(motif, dna);
 reverseComplement(dna, motif->columnCount);
 rScore = dnaMotifSequenceProb(motif, dna);
 reverseComplement(dna, motif->columnCount);
 if (fScore >= rScore)
     return '+';
 else
     return '-';
 }
 
 double dnaMotifBitScore(struct dnaMotif *motif, DNA *dna)
 /* Return logBase2-odds score of dna given a probabalistic motif. */
 {
 double p = dnaMotifSequenceProb(motif, dna);
 double q = pow(0.25, motif->columnCount);
 double odds = p/q;
 return logBase2(odds);
 }
 
 double dnaMotifBitScoreWithMark0Bg(struct dnaMotif *motif, DNA *dna, double mark0[5])
 /* Return logBase2-odds score of dna given a probabalistic motif and using a 0-order markov model for the background. */
 {
 double p = dnaMotifSequenceProb(motif, dna);
 double q = dnaMotifSequenceProbWithMark0(motif, dna, mark0);
 double odds = p/q;
 return logBase2(odds);
 }
 
 double dnaMotifBitScoreWithMarkovBg(struct dnaMotif *motif, DNA *dna, double mark2[5][5][5])
 /* Return logBase2-odds score of dna given a probabalistic motif using a 2nd-order markov model for the background.
    motif and markd2 must be in log2 format.
    Seq must contain an extra two bases at the front (i.e. we start scoring from dna + 2). */
 {
 double p = dnaMotifSequenceProbLog(motif, dna + 2);
 double q = 0;
 int i, index;
 // XXXX assert somehow that length(dna) == motif->columnCount + 2?
 dnaUtilOpen();
 for(i = 0, index = 2; i < motif->columnCount; i++, index++)
     {
     double tmp = mark2[ntVal5[(int) dna[index - 2]] + 1][ntVal5[(int) dna[index - 1]] + 1][ntVal5[(int) dna[index]] + 1];
 #if 0
     char buf[4];
     safencpy(buf, sizeof(buf), dna + index - 2, 3);
     fprintf(stderr, "%s: tmp: %.6f\n", buf, tmp);
 #endif
     q += tmp;
     }
 return p - q;
 }
 
 
 void dnaMotifMakeLog2(struct dnaMotif *motif)
 {
 int i;
 for (i = 0; i < motif->columnCount; i++)
     {
     motif->aProb[i] = logBase2(motif->aProb[i]);
     motif->cProb[i] = logBase2(motif->cProb[i]);
     motif->gProb[i] = logBase2(motif->gProb[i]);
     motif->tProb[i] = logBase2(motif->tProb[i]);
     }
 }
 
 void dnaMotifNormalize(struct dnaMotif *motif)
 /* Make all columns of motif sum to one. */
 {
 int i;
 for (i=0; i<motif->columnCount; ++i)
     {
     float sum = motif->aProb[i] + motif->cProb[i] + motif->gProb[i] + motif->tProb[i];
     if (sum < 0)
         errAbort("%s has negative numbers, perhaps it's score not probability based", 
 		motif->name);
     if (sum == 0)
          motif->aProb[i] = motif->cProb[i] = motif->gProb[i] = motif->tProb[i] = 0.25;
     motif->aProb[i] /= sum;
     motif->cProb[i] /= sum;
     motif->gProb[i] /= sum;
     motif->tProb[i] /= sum;
     }
 }
 
 boolean dnaMotifIsScoreBased(struct dnaMotif *motif)
 /* Return TRUE if dnaMotif is score-based (which we decide by
  * the presense of negative values. */
 {
 int i;
 for (i=0; i<motif->columnCount; ++i)
     {
     if (motif->aProb[i] < 0) return TRUE;
     if (motif->cProb[i] < 0) return TRUE;
     if (motif->gProb[i] < 0) return TRUE;
     if (motif->tProb[i] < 0) return TRUE;
     }
 return FALSE;
 }
 
 void dnaMotifScoreToProb(struct dnaMotif *motif)
 /* Convert motif that is log-odds score based to motif
  * that is probability based.  This assumes that the
  * background distribution is simple: 25% for each base */
 {
 int i;
 for (i=0; i<motif->columnCount; ++i)
     {
     motif->aProb[i] = exp(motif->aProb[i]);
     motif->cProb[i] = exp(motif->cProb[i]);
     motif->gProb[i] = exp(motif->gProb[i]);
     motif->tProb[i] = exp(motif->tProb[i]);
     }
 dnaMotifNormalize(motif);
 }
 
 void dnaMotifMakeProbabalistic(struct dnaMotif *motif)
 /* Change motif, which may be score or count based, to 
  * probabalistic one, where each column adds to 1.0 */
 {
 if (dnaMotifIsScoreBased(motif))
     dnaMotifScoreToProb(motif);
 else
     dnaMotifNormalize(motif);
 }
 
 static void printProbRow(FILE *f, char *label, float *p, int pCount)
 /* Print one row of a probability profile. */
 {
 int i;
 fprintf(f, "%s ", label);
 for (i=0; i < pCount; ++i)
     fprintf(f, "%5.2f ", p[i]);
 printf("\n");
 }
 
 void dnaMotifPrintProb(struct dnaMotif *motif, FILE *f)
 /* Print DNA motif probabilities. */
 {
 printProbRow(f, "A", motif->aProb, motif->columnCount);
 printProbRow(f, "C", motif->cProb, motif->columnCount);
 printProbRow(f, "G", motif->gProb, motif->columnCount);
 printProbRow(f, "T", motif->tProb, motif->columnCount);
 }
 
 
 static double u1(double prob)
 /* Calculate partial uncertainty for one base. */
 {
 if (prob == 0)
     return 0;
 return prob * logBase2(prob);
 }
 
 static double uncertainty(struct dnaMotif *motif, int pos)
 /* Return the uncertainty at pos of motif.  This corresponds
  * to the H function in logo.pm */
 {
 return -( u1(motif->aProb[pos]) + u1(motif->cProb[pos])
 	+ u1(motif->gProb[pos]) +u1(motif->tProb[pos]) );
 }
 
 double dnaMotifBitsOfInfo(struct dnaMotif *motif, int pos)
 /* Return bits of information at position. */
 {
 if (pos > motif->columnCount || pos < 0)
     internalErr();
 return 2 - uncertainty(motif, pos);
 }
 
 struct letterProb
 /* A letter tied to a probability. */
     {
     struct letterProb *next;
     double prob;	/* Probability for this letter. */
     char letter;	/* The letter (upper case) */
     };
 
 static struct letterProb *letterProbNew(char letter, double prob)
 /* Make a new letterProb. */
 {
 struct letterProb *lp;
 AllocVar(lp);
 lp->letter = letter;
 lp->prob = prob;
 return lp;
 }
 
 static int letterProbCmp(const void *va, const void *vb)
 /* Compare to sort highest probability first. */
 {
 const struct letterProb *a = *((struct letterProb **)va);
 const struct letterProb *b = *((struct letterProb **)vb);
 double dif = a->prob - b->prob;
 if (dif < 0)
    return -1;
 else if (dif > 0)
    return 1;
 else
    return 0;
 }
 
 static void addBaseProb(struct letterProb **pList, char letter, double prob)
 /* If prob > 0 add letterProb to list. */
 {
 if (prob > 0)
     {
     struct letterProb *lp = letterProbNew(letter, prob);
     slAddHead(pList, lp);
     }
 }
 
 static struct letterProb *letterProbFromMotifColumn(struct dnaMotif *motif, int pos)
 /* Return letterProb list corresponding to column of motif. */
 {
 struct letterProb *lpList = NULL;
 addBaseProb(&lpList, 'A', motif->aProb[pos]);
 addBaseProb(&lpList, 'C', motif->cProb[pos]);
 addBaseProb(&lpList, 'G', motif->gProb[pos]);
 addBaseProb(&lpList, 'T', motif->tProb[pos]);
 slSort(&lpList, letterProbCmp);
 return lpList;
 }
 
 static void psOneColumn(struct dnaMotif *motif, int pos,
     double xStart, double yStart, double width, double totalHeight,
     FILE *f)
 /* Write one column of logo to postScript. */
 {
 struct letterProb *lp, *lpList = letterProbFromMotifColumn(motif, pos);
 double x = xStart, y = yStart, w = width, h;
 for (lp = lpList; lp != NULL; lp = lp->next)
     {
     h = totalHeight * lp->prob;
     if (h >= 1.0)
 	{
 	fprintf(f, "%cColor ", tolower(lp->letter));
 	fprintf(f, "%3.2f ", x);
 	fprintf(f, "%3.2f ", y);
 	fprintf(f, "%3.2f ", x + w);
 	fprintf(f, "%3.2f ", y + h);
 	fprintf(f, "(%c) textInBox\n", lp->letter);
 	}
     y += h;
     }
 fprintf(f, "\n");
 slFreeList(&lpList);
 }
 
 static void dnaMotifDims(struct dnaMotif *motif, double widthPerBase, double height, 
 	int *retWidth, int *retHeight)
 /* Calculate dimensions of motif when rendered. */
 {
 static int widthFudgeFactor = 2, heightFudgeFactor = 2;
 *retWidth = ceil(widthPerBase * motif->columnCount) + widthFudgeFactor;
 *retHeight = ceil(height) + heightFudgeFactor;
 }
 
 void dnaMotifToLogoPs2(struct dnaMotif *motif, double widthPerBase, double height, 
                        double minHeight, char *fileName)
 /* Write logo corresponding to motif to postScript file, with extended options. minHeight
  * is the minimum height that is excluded from information content scaling.  This allows
  * something to show up in columns with very little information content.  Setting this
  * to be the same as height creates an frequency-based logo.
  */
 {
 FILE *f = mustOpen(fileName, "w");
 int i;
 int xStart = 0;
 int w, h;
 char *s = 
 #include "dnaMotif.pss"
 ;
 
 dnaMotifDims(motif, widthPerBase, height, &w, &h);
 fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
 fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
 fprintf(f, "%s", s);
 
 fprintf(f, "%s", "% Start of code for this specific logo\n");
 
 for (i=0; i<motif->columnCount; ++i)
     {
     double infoScale = dnaMotifBitsOfInfo(motif, i)/2.0;
     // only scale part beyond minHeight
     double useHeight = minHeight + (infoScale * (height - minHeight));
     if (useHeight > height)
         useHeight = height;
     psOneColumn(motif, i, xStart, 0, widthPerBase, useHeight, f);
     xStart += widthPerBase;
     }
 fprintf(f, "showpage\n");
 carefulClose(&f);
 }
 
 void dnaMotifToLogoPs(struct dnaMotif *motif, double widthPerBase, double height, char *fileName)
 /* Write logo corresponding to motif to postScript file. */
 {
 dnaMotifToLogoPs2(motif, widthPerBase, height, 0.0, fileName);
 }
 
 void dnaMotifToLogoPng(
 	struct dnaMotif *motif,	/* Motif to draw. */
 	double widthPerBase, 	/* Width of each base. */
 	double height, 		/* Max height. */
 	char *gsExe, 		/* ghostscript executable, NULL for default */
 	char *tempDir,          /* temp dir , NULL for default */
 	char *fileName)		/* output png file name. */
 /* Write logo corresponding to motif to png file. */
 {
 char *psName = rTempName(tempDir, "dnaMotif", ".ps");
 struct dyString *dy = dyStringNew(0);
 int w, h;
 int sysRet;
 
 if (gsExe == NULL) gsExe = "gs";
 if (tempDir == NULL) tempDir = "/tmp";
 dnaMotifToLogoPs(motif, widthPerBase, height, psName);
 dnaMotifDims(motif, widthPerBase, height, &w, &h);
 dyStringAppend(dy, gsExe);
 dyStringAppend(dy, " -sDEVICE=png16m -sOutputFile=");
 dyStringAppend(dy, fileName);
 dyStringAppend(dy, " -dBATCH -dNOPAUSE -q ");
 dyStringPrintf(dy, "-g%dx%d ", w, h);
 dyStringAppend(dy, psName);
 sysRet = system(dy->string);
 if (sysRet != 0)
     errAbort("System call returned %d for:\n  %s", sysRet, dy->string);
 
 /* Clean up. */
 dyStringFree(&dy);
 
 /* change permisssions so the webserver can access the file */
 dy = newDyString(0);
 dyStringPrintf(dy, "chmod 666 %s ", fileName);
 sysRet = system(dy->string);
 
 remove(psName);
 }
 
 void dnaMotifToLogoPsW(struct dnaMotif *motif, double widthPerBase, double width, double height, char *fileName)
 /* Write logo corresponding to motif to postScript file. */
 {
 FILE *f = mustOpen(fileName, "w");
 int i;
 int xStart = 0;
 int w, h;
 char *s = 
 #include "dnaMotif.pss"
 ;
 
 dnaMotifDims(motif, widthPerBase, height, &w, &h);
 fprintf(f, "%%!PS-Adobe-3.1 EPSF-3.0\n");
 fprintf(f, "%%%%BoundingBox: 0 0 %d %d\n\n", w, h);
 fprintf(f, "%s", s);
 
 fprintf(f, "%s", "% Start of code for this specific logo\n");
 
 for (i=0; i<motif->columnCount; ++i)
     {
     double infoScale = 0.9 ; 
     xStart = i * width / motif->columnCount;
     psOneColumn(motif, i, xStart, 0, widthPerBase, infoScale * height, f);
     }
 fprintf(f, "showpage\n");
 carefulClose(&f);
 }
 
 void dnaMotifToLogoPGM(
 	struct dnaMotif *motif,	/* Motif to draw. */
 	double widthPerBase, 	/* Width of each base. */
 	double width, 		/* Max width. */
 	double height, 		/* Max height. */
 	char *gsExe, 		/* ghostscript executable, NULL for default */
 	char *tempDir,          /* temp dir , NULL for default */
 	char *fileName)		/* output png file name. */
 /* Write logo corresponding to motif to pgm file. */
 {
 char *psName = rTempName(tempDir, "dnaMotif", ".ps");
 struct dyString *dy = dyStringNew(0);
 int w, h;
 int sysRet;
 
 if (gsExe == NULL) gsExe = "gs";
 if (tempDir == NULL) tempDir = "/tmp";
 dnaMotifToLogoPsW(motif, widthPerBase, width, height, psName);
 dnaMotifDims(motif, widthPerBase, height, &w, &h);
 dyStringAppend(dy, gsExe);
 dyStringAppend(dy, " -sDEVICE=pgmraw -sOutputFile=");
 dyStringAppend(dy, fileName);
 dyStringAppend(dy, " -dBATCH -dNOPAUSE -q ");
 dyStringPrintf(dy, "-g%dx%d ", (int) ceil(width), h);
 dyStringAppend(dy, psName);
 sysRet = system(dy->string);
 if (sysRet != 0)
     errAbort("System call returned %d for:\n  %s", sysRet, dy->string);
 
 /* Clean up. */
 dyStringFree(&dy);
 
 /* change permisssions so the webserver can access the file */
 dy = newDyString(0);
 dyStringPrintf(dy, "chmod 666 %s ", fileName);
 sysRet = system(dy->string);
 
 remove(psName);
 }