ab558bd30d088b3490661c98ce6fb665fd00e076 braney Fri Jul 2 07:48:40 2010 -0700 switch to using PNG diff --git src/ameme/ameme.c src/ameme/ameme.c index ce9a7c9..835ea29 100644 --- src/ameme/ameme.c +++ src/ameme/ameme.c @@ -1,3255 +1,3264 @@ /* ameme - The central part of AmphetaMeme, which finds patterns in DNA */ #include "common.h" #include "memalloc.h" #include "errabort.h" #include "obscure.h" #include "hash.h" #include "dnautil.h" #include "dnaseq.h" #include "localmem.h" #include "sig.h" #include "fa.h" #include "portable.h" #include "codebias.h" #include "memgfx.h" #include "htmshell.h" #include "cheapcgi.h" #include "slog.h" #include "dnaMarkov.h" #include "ameme.h" #include "fragFind.h" #include "linefile.h" #include "dystring.h" #include "dnaMotif.h" boolean isFromWeb; /* True if run as CGI. */ boolean isMotifMatcher; /* True if run from motifMatcher.html. */ boolean outputLogo; /* True if want sequence logo output. */ FILE *htmlOut; /* Where to send output. */ static void vaProgress(char *format, va_list args) /* Print message to indicate progress - to web page if in * interactive mode, to console if not. */ { FILE *f = (isFromWeb ? stdout : stderr); if (format != NULL) { vfprintf(f, format, args); fflush(f); } } static void progress(char *format, ...) /* Print message to indicate progress - to web page if in * interactive mode, to console if not. */ { va_list args; va_start(args, format); vaProgress(format, args); va_end(args); } void horizontalLine() /* Make horizontal line across html. */ { fprintf(htmlOut, "
"); dnaMotifMakeProbabalistic(motif); makeTempName(&pngTn, "logo", ".png"); dnaMotifToLogoPng(motif, 47, 140, gs, "../trash", pngTn.forCgi); fprintf(f," "); fprintf(f,""); } } void printProfile(FILE *f, struct profile *prof) /* Display salient facts about a profile. */ { struct profileColumn *col; int i; char *consSeq = consensusSeq(prof); int orderTranslater[4] = {A_BASE_VAL, C_BASE_VAL, G_BASE_VAL, T_BASE_VAL}; int baseVal; struct dnaMotif *motif; fprintf(f, "%5.4f @ %4.2f sd %4.2f ", invSlogScale*prof->score, prof->locale.mean, prof->locale.standardDeviation); touppers(consSeq); AllocVar(motif); motif->name = NULL; motif->columnCount = 0; fprintf(f, "%s\n", consSeq); for (col = prof->columns; col != NULL; col = col->next) motif->columnCount++; AllocArray(motif->aProb, motif->columnCount); AllocArray(motif->cProb, motif->columnCount); AllocArray(motif->gProb, motif->columnCount); AllocArray(motif->tProb, motif->columnCount); for (i = 0; i<4; ++i) { int j = 0; baseVal = orderTranslater[i]; fprintf(f, "\t%c ", valToNt[baseVal]); for (col = prof->columns; col != NULL; col = col->next) { fprintf(f, "%4.3f ", invSlog(col->slogProb[baseVal]) ); switch (valToNt[baseVal]) { case 'a': motif->aProb[j] = (float)invSlog(col->slogProb[baseVal]); break; case 'c': motif->cProb[j] = (float)invSlog(col->slogProb[baseVal]); break; case 'g': motif->gProb[j] = (float)invSlog(col->slogProb[baseVal]); break; case 't': motif->tProb[j] = (float)invSlog(col->slogProb[baseVal]); break; } j++; } fprintf(f, "\n"); } if (f == htmlOut && outputLogo) motifHitSection(NULL, motif, f); } void makeNullModels(struct seqList *bgSeq) /* Create null models from background sequence. */ { int frame; static DNA *stopCodons[3] = {"tag", "tga", "taa"}; int i; makeFreqTable(bgSeq); makeMark1(bgSeq, mark0, slogMark0, mark1, slogMark1); makeMark2(bgSeq, mark0, slogMark0, mark1, slogMark1, mark2, slogMark2); for (frame=0; frame<3; frame += 1) { int readFrame = (frame+2)%3; makeTripleTable(bgSeq, mark0, slogMark0, mark1, slogMark1, codingMark2[readFrame], slogCodingMark2[readFrame], frame, 3, 3); } /* Usually there's a little noise in our data that makes stop codons appear at a small * frequency in coding regions. We won't eliminate this completely, but reduce it * by a factor of 100.... */ for (i=0; i", pngTn.forHtml); fprintf(f,"\n"); fprintf(f,"motif consensus\n"); dnaMotifPrintProb(motif, f); fprintf(f,"
Looking for %d motifs in %d sequences. Longest sequence is %d bases.
\n", numMotifs, goodSeqListSize, goodSeqElSize); fprintf(htmlOut, "Settings are %s location; %sinclude reverse complement; %d occurrences per sequence; %s align; ", (useLocation ? "use" : "ignore"), (considerRc ? "" : "don't "), maxOcc, (leftAlign ? "left" : "right") ); fprintf(htmlOut, "restrain expansionist tendencies %f; number of sequences in initial scan %d; ", constrainer, startScanLimit); backgroundName = bgSource; if (backgroundName == NULL) backgroundName = badName; if (backgroundName == NULL) backgroundName = "same as foreground"; fprintf(htmlOut, "background model %s; background data %s;
", (nullModelCgiName == NULL ? "Markov 0" : nullModelCgiName), backgroundName); approxTime = calcApproximateTime(considerRc); progress("This run would take about %2.2f minutes on a lightly loaded vintage 2003 web server.", approxTime); fprintf(htmlOut, "\n"); horizontalLine(); doTopTiles(numMotifs, tn.forCgi, logFile, considerRc); loadAndColorProfiles(tn.forCgi, goodSeq, goodSeqElSize, considerRc); remove(tn.forCgi); fprintf(htmlOut, "\n"); freeSeqList(&goodSeq); } void doRandomTest(boolean premade, boolean considerRc) /* Generate tables for scores on random sequences. */ { int seqLen, seqCount; struct tempName randTn; FILE *logFile = mustOpen("\\temp\\random.txt", "w"); int reps = 2; int i; makeTempName(&randTn, "rand", ".fa"); fprintf(htmlOut, "
\n"); for (seqLen = 100; seqLen <= 500; seqLen += 100) for (seqCount = 100; seqCount <= 100; seqCount += 10) { goodName = randTn.forCgi; for (i=1; i<=reps; ++i) { fprintf(logFile, "----------------- %d Random sequence of %d nucleotides take %d----------------\n", seqCount, seqLen, i); generate(goodName, seqCount, seqLen); oneSearchSet(premade, logFile, considerRc); } } chmod(randTn.forCgi, 0666); fclose(logFile); } void impDoMiddle() /* Do main work of Improbizer. Find motifs and display them. */ { char *nullModelCgi = "background"; boolean premade = FALSE; boolean isRandomTest = cgiBoolean("randomTest"); boolean isControlRun = cgiBoolean("controlRun"); boolean considerRc = cgiBoolean("rcToo"); long startTime, endTime; boolean codingWeirdness; /* True when we need to worry about coding background. */ FILE *motifOutFile = NULL; char *motifOutName; startTime = clock1000(); fprintf(htmlOut, "Improbizer Results
\n"); leftAlign = cgiBoolean("leftAlign"); if (cgiVarExists(nullModelCgi)) { nullModelCgiName = cgiEncode(cgiString(nullModelCgi)); nullModel = cgiOneChoice(nullModelCgi, nullModelChoices, ArraySize(nullModelChoices)); } if (cgiVarExists("maxOcc")) maxOcc = cgiInt("maxOcc"); if (cgiVarExists("tileSize")) defaultTileSize = cgiInt("tileSize"); if (cgiVarExists("startScanLimit")) startScanLimit = cgiInt("startScanLimit"); if (cgiVarExists("numMotifs")) numMotifs = cgiInt("numMotifs"); if (cgiVarExists("constrainer")) constrainer = cgiDouble("constrainer"); if ((motifOutName = cgiOptionalString("motifOutput")) != NULL) motifOutFile = mustOpen(motifOutName, "w"); useLocation = !cgiBoolean("ignoreLocation"); if (!isRandomTest) { if (cgiVarExists("goodText")) { pasteToFa("goodText", &goodName, &goodSeqListSize, &goodSeqElSize); if (goodSeqListSize <= 0) errAbort("You need to paste in something. Go back and try again!"); } if (goodName == NULL) goodName = cgiString("good"); } if (cgiVarExists("badText")) { int numSeq, elSize; pasteToFa("badText", &badName, &numSeq, &elSize); if (numSeq <= 0) badName = NULL; } codingWeirdness = (nullModel == nmCoding); if (badName == NULL) { badName = cgiOptionalString("bad"); if (badName != NULL) codingWeirdness = FALSE; } /* If they selected a premade background, figure out file that goes with it, * then look up directory to find file in. */ if ((bgSource = cgiOptionalString("backgroundDataSource")) != NULL) { char *premadeBg = NULL; if (sameString(bgSource, "Worm Intron 3'")) premadeBg = "wormInt3"; else if (sameString(bgSource, "Worm Intron 5'")) premadeBg = "wormInt5"; else if (sameString(bgSource, "Worm Coding")) { premadeBg = "cecoding"; codingWeirdness = FALSE; } else if (sameString(bgSource, "Yeast Promoter")) premadeBg = "yeastPromo"; else if (sameString(bgSource, "Same as Foreground")) ; else if (sameString(bgSource, "From Data Pasted Below")) codingWeirdness = FALSE; else errAbort("Unknown backgroundDataSource"); if (premadeBg != NULL) { makePremadeBgPathName(premadeBg, badNameBuf, sizeof(badNameBuf)); badName = badNameBuf; premade = TRUE; } } if (codingWeirdness) { errAbort("For coding background you must either paste in-frame coding regions\n" "into the background, from the command line include a 'bad' file,\n" "or select the premade background 'Worm Coding'\n"); } getNullModel(goodName, badName, premade); if (isControlRun) goodName = randomSpoof(goodName); if (isRandomTest ) { fprintf(htmlOut, "Random test mode - this will take a good long time. Be sure to kill " "the process if you get impatient.
\n\n"); doRandomTest(premade, considerRc); } else { double approxTime = calcApproximateTime(considerRc); double maxTime = 5.0; if (isFromWeb && approxTime > maxTime) { errAbort("Sorry, this job is too big for our web server - it would use about " "%2.1f minutes of CPU time. Out of fairness to the other users of this " "machine we limit jobs to %2.1f minutes of CPU time or less. Please reduce " "the size of your data (now %d sequences of %d bases each), " "the number of motifs you're looking " "for (now %d), or the number of sequences in the initial scan (now %d). " "The most important influence on run time is the maximum size of an individual sequence. " " If you really need " "to run the program on a data set this large contact Jim Kent (kent@biology.ucsc.edu) " "to get a batch version of this program to run on your own machine.", approxTime, maxTime, goodSeqListSize, goodSeqElSize, numMotifs, startScanLimit ); } fprintf(htmlOut, "Improbizer will display the results in parts. First it will " "display the profiles (consensus sequences with the probability of " "each base at each position) individually as they are calculated. The " "position of a profile in a sequence is indicated by upper case. The " "strength of the profile match is indicated by the score on the left. " "There will be a delay during this phase as each profile is calculated. " "Second Improbizer will " "display all profiles at once over each sequence. Each profile " "has it's own color and the stronger the profile matches the darker " "it will appear in the sequence. Finally there will be a graphic " "summary of all the profiles at the end, using the same color " "conventions.
\n"); oneSearchSet(premade, motifOutFile, considerRc); endTime = clock1000(); horizontalLine(); fprintf(htmlOut, "Calculation time was %4.3f minutes\n", 0.001*(endTime-startTime)/60); } if (isControlRun) remove(goodName); } void explainMotif() /* Explain to user what a motif should look like and exit. */ { errAbort("" "Not all of the motifs are correct. A motif should look something like:\n" " a 0.208 0.239 0.018 0.990 0.003 0.003 0.990 0.990 0.003 0.650 0.506\n" " c 0.068 0.044 0.003 0.003 0.990 0.003 0.003 0.003 0.990 0.064 0.042\n" " g 0.074 0.061 0.070 0.003 0.003 0.003 0.003 0.003 0.003 0.181 0.050\n" " t 0.650 0.656 0.908 0.003 0.003 0.990 0.003 0.003 0.003 0.104 0.401\n" "Each row needs to start out with a nucleotide, and be followed with numbers\n" "which represent the probability of the nucleotide occurring at that point\n" "in the sequence. You can also submit motifs like:\n" " a 3 1 4 3 3 2\n" " c 2 5 2 2 1 0\n" " g 4 2 4 3 5 7\n" " t 1 2 0 2 1 1\n" "where the numbers represent counts rather than probabilities\n" "If you are including location information in the motif please add an\n" "additional line so that your motif looks something like:\n" " 3.1654 @ 18.40 sd 7.95 GGGAAC\n" " t 0.058 0.101 0.003 0.003 0.216 0.269 \n" " c 0.024 0.054 0.019 0.314 0.071 0.369 \n" " a 0.308 0.378 0.003 0.679 0.709 0.215 \n" " g 0.611 0.467 0.974 0.003 0.003 0.148 \n" "Where the top line is of the format:\n" " score @ mean sd variance consensus\n" "You can leave out the score and consensus, they are ignored (but\n" "if your motif comes from Improbizer it's easier to leave them in)." ); } boolean parseLocation(char *words[], int wordCount, double *retMean, double *retVariance) /* Get mean and variance out of what's left of line after the @ in a line like: * 7.5603 @ 17.15 sd 10.41 TTTACTAACAAT */ { if (wordCount < 3 || !isdigit(words[0][0]) || !isdigit(words[2][0])) explainMotif(); *retMean = atof(words[0]); *retVariance = atof(words[2]); return TRUE; } boolean andFour(boolean b[4]) /* Return TRUE if all four b[i] are TRUE. */ { return (b[0] && b[1] && b[2] && b[3]); } struct profile *parseMotifs(char *text, boolean useLocation) /* Convert motif from text description (something like the following) 7.5603 @ 17.15 sd 10.41 TTTACTAACAAT t 0.650 0.656 0.908 0.003 0.003 0.990 0.003 0.003 0.003 0.104 0.401 c 0.068 0.044 0.003 0.003 0.990 0.003 0.003 0.003 0.990 0.064 0.042 a 0.208 0.239 0.018 0.990 0.003 0.003 0.990 0.990 0.003 0.650 0.506 g 0.074 0.061 0.070 0.003 0.003 0.003 0.003 0.003 0.003 0.181 0.050 * to a profile structure. */ { #define maxLen 100 struct profile *profList = NULL; char *lines[128]; int lineCount; int lineIx = 0; lineCount = chopString(text, "\n", lines, ArraySize(lines)); for (;;) { char *words[maxLen+1]; double probs[4][maxLen]; double mean = 0.0, variance = 0.0; double oneProb; char *word; int baseVal; boolean gotLocation = FALSE; boolean gotNt[4]; int wordCount; int profLen = 0; int i=0,j=0; struct profile *prof; struct profileColumn *col; /* Skip over any blank lines or lines that start with #*/ for (;lineIx < lineCount; ++lineIx) { char *s = skipLeadingSpaces(lines[lineIx]); if (s != 0 && s[0] != '#') break; } zeroBytes(gotNt, sizeof(gotNt)); if (lineIx >= lineCount) break; /* Empty motif is not an error. */ if (lineCount < 4) explainMotif(); while (lineIxmaxLen+1) errAbort("Sorry, this program can only handle motifs with up to %d bases", maxLen); if (wordCount > 0) /* Tolerate blank lines. */ { /* First word should be the name of a base. Check this, * make sure each base only used once, and that all * lines of motif have the same number of elements. * Along with all this checking, store the probability * values in each row. */ if (sameString(words[1], "@") ) gotLocation = parseLocation(words+2, wordCount-2, &mean, &variance); else if (sameString(words[0], "@") ) gotLocation = parseLocation(words+1, wordCount-1, &mean, &variance); else /* Parse nucleotide probability line. */ { word = words[0]; if (strlen(word) != 1) explainMotif(); if ((baseVal = ntVal[(int)word[0]]) < 0) explainMotif(); if (gotNt[baseVal]) errAbort("Got more than one row for %s", word); gotNt[baseVal] = TRUE; if (profLen > 0 && profLen != wordCount-1) errAbort("Not all lines in motif have the same number of numbers. Aborting."); profLen = wordCount-1; for (j=0; j columns; i next) { double total = 0; double x; for (j=0; j<4; ++j) total += probs[j][i]; if (total <= 0.00001) errAbort("Column too close to zero"); for (j=0; j<4; ++j) { x = probs[j][i]/total; if (x <= 0.000000001) x = 0.000000001; col->slogProb[j] = slog(x); } } prof->locale.mean = mean; prof->locale.standardDeviation = variance; slAddTail(&profList, prof); } return profList; #undef maxLen } void mmDoMiddle() /* Do main work of Motif Matcher - read in motifs user has supplied and * display them. */ { char motifVarName[24]; char *rawMotifs[6]; char *rm; int motifCount = 0; int i; struct profile *profList = NULL, *prof; char *faName; int seqCount; int seqElSize; struct seqList *seqList; int nameSize; boolean considerRc = cgiBoolean("rcToo"); char *motifOutName; FILE *motifOutFile = NULL; char *motifInName; char *seqFileName, *backFileName; char *hitFileName = cgiOptionalString("hits"); FILE *hitFile = NULL; boolean hitTrombaFormat = cgiBoolean("hitTrombaFormat"); int profIx = 0; if ((motifOutName = cgiOptionalString("motifOutput")) != NULL) motifOutFile = mustOpen(motifOutName, "w"); fprintf(htmlOut, " "); /* Get some CGI variables. */ if (cgiVarExists("maxOcc")) maxOcc = cgiInt("maxOcc"); useLocation = !cgiBoolean("ignoreLocation"); /* Get input sequence from file. If necessary copy pasted * in sequence to file first. */ seqFileName = cgiOptionalString("seqFile"); if (seqFileName == NULL) seqFileName = cgiOptionalString("good"); if (seqFileName == NULL) { pasteToFa("seq", &faName, &seqCount, &seqElSize); seqFileName = faName; } seqList = readSeqList(seqFileName); nameSize = maxSeqNameSize(seqList); /* Set up background model. */ backFileName = cgiOptionalString("bad"); if (backFileName == NULL) backFileName = seqFileName; nullModel = nmMark0; if (cgiVarExists("background")) nullModel = cgiOneChoice("background", nullModelChoices, ArraySize(nullModelChoices)); getNullModel(seqFileName, backFileName, FALSE); /* Make all sequences the same size and fill in * soft mask table. */ seqElSize = uniformSeqSize(seqList, TRUE, nullModel); /* Read in motifs. */ if ((motifInName = cgiOptionalString("motifs")) != NULL) { char *buf; size_t size; readInGulp(motifInName, &buf, &size); if ((prof = parseMotifs(buf, useLocation)) != NULL) profList = slCat(profList, prof); freeMem(buf); } else { /* Collect the motifs that are actually present into an array. */ for (i=1; i<=ArraySize(rawMotifs); ++i) { snprintf(motifVarName, sizeof(motifVarName), "motif%d", i); rawMotifs[motifCount] = rm = cgiString(motifVarName); if ((prof = parseMotifs(rm, useLocation)) != NULL) { ++motifCount; profList = slCat(profList, prof); } } } motifCount = slCount(profList); if (motifCount == 0) errAbort("No motifs entered."); /* Possibly open hit file. */ if (hitFileName != NULL) hitFile = mustOpen(hitFileName, "w"); /* Run the motifs once to figure out where they land and * what the score is. */ for (prof = profList; prof != NULL; prof = prof->next) { struct profile *rcProf = NULL; if (considerRc) rcProf = rcProfileCopy(prof); prof->score = iterateProfile(prof, rcProf, seqList, seqElSize, NULL, NULL, FALSE, hitFile, ++profIx, hitTrombaFormat); horizontalLine(); printProfile(htmlOut, prof); if (motifOutFile) printProfile(motifOutFile, prof); showProfHits(prof, rcProf, seqList, seqElSize, nameSize, &prof->bestIndividualScore); freeProfile(&rcProf); } horizontalLine(); colorProfiles(profList, seqList, seqElSize, considerRc); } void doMiddle() /* Print out body of HTML. Decide whether to * run as improbizer or as motif matcher. */ { if (isMotifMatcher) mmDoMiddle(); else impDoMiddle(); } int main(int argc, char *argv[]) { char *programName; //pushCarefulMemHandler(); initProfileMemory(); dnaUtilOpen(); statUtilOpen(); isFromWeb = cgiIsOnWeb(); if (!isFromWeb && !cgiSpoof(&argc, argv)) { errAbort("ameme - find common patterns in DNA\n" "usage\n" " ameme good=goodIn.fa [bad=badIn.fa] [numMotifs=2] [background=m1] [maxOcc=2] [motifOutput=fileName] [html=output.html] [gif=output.gif] [rcToo=on] [controlRun=on] [startScanLimit=20] [outputLogo] [constrainer=1]\n" "where goodIn.fa is a multi-sequence fa file containing instances\n" "of the motif you want to find, badIn.fa is a file containing similar\n" "sequences but lacking the motif, numMotifs is the number of motifs\n" "to scan for, background is m0,m1, or m2 for various levels of Markov\n" "models, maxOcc is the maximum occurrences of the motif you \n" "expect to find in a single sequence and motifOutput is the name \n" "of a file to store just the motifs in. rcToo=on searches both strands.\n" "If you include controlRun=on in the command line, a random set of \n" "sequences will be generated that match your foreground data set in size, \n" "and your background data set in nucleotide probabilities. The program \n" "will then look for motifs in this random set. If the scores you get in a \n" "real run are about the same as those you get in a control run, then the motifs\n" "Improbizer has found are probably not significant.\n"); } outputLogo = cgiVarExists("outputLogo"); /* Figure out where to put html output. */ if (cgiVarExists("html")) { char *fileName = cgiString("html"); htmlOut = mustOpen(fileName, "w"); } else htmlOut = stdout; initRandom(); /* This one needs to be after htmlOut set up. */ /* Figure out if we're going to find a pattern in sequences, or just display * where a pre-computed pattern occurs in sequences. */ isMotifMatcher = cgiVarExists("motifMatcher"); if (isMotifMatcher) programName = "Motif Matcher"; else programName = "Improbizer"; /* Print out html header. Make background color brilliant white. */ if (isFromWeb) puts("Content-Type:text/html\n"); fprintf(htmlOut, "\n%s Results \n\n\n", programName); fprintf(htmlOut, "\n\n"); /* Wrap error handling et. around doMiddle. */ if (isFromWeb) htmEmptyShell(doMiddle, NULL); else doMiddle(); //carefulCheckHeap(); /* Write end of html. */ htmEnd(htmlOut); return 0; }