src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c 1.1
1.1 2010/04/29 20:52:47 kent
Making a clusterer just for Chromia.
Index: src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c
===================================================================
RCS file: src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c
diff -N src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c 29 Apr 2010 20:52:47 -0000 1.1
@@ -0,0 +1,163 @@
+/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated by
+ * Chromia.. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+
+static char const rcsid[] = "$Id$";
+
+int scoreColumn = 4;
+int labelColumn = 3;
+int maxGap = 1200;
+double minSumScore = 10.0;
+double scoreNormFactor = 10000.0;
+double inNoiseThreshold = 0.0;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file \n"
+ "generated by Chromia.\n"
+ "usage:\n"
+ " regChromiaMergeWindows chromiaInput output.bed\n"
+ "where chromiaInput is tab-separated with the following columns:\n"
+ " <chrom> <start> <end> <label> <score>\n"
+ "options:\n"
+ " -maxGap=N - maximum number of bases between merged windows. Default %d\n"
+ " -minSumScore=N.N - minimum sum of scores in region to output. Default %g\n"
+ " -scoreNormFactor=N.N - normalization factor to get scores in 0-1000. Default %g\n"
+ " -inNoiseThreshold=N.N - input lines scoring below this threshold are ignored\n"
+ , maxGap, minSumScore, scoreNormFactor
+ );
+}
+
+static struct optionSpec options[] = {
+ {"maxGap", OPTION_INT},
+ {"minSumScore", OPTION_DOUBLE},
+ {"scoreNormFactor", OPTION_DOUBLE},
+ {"inNoiseThreshold", OPTION_INT},
+ {NULL, 0},
+};
+
+
+void outputRegion(FILE *f, char *chrom, int start, int end, char *label, double sumOfScores)
+/* Output region to file. */
+{
+if (sumOfScores > minSumScore)
+ fprintf(f, "%s\t%d\t%d\t%s\t%d\n", chrom, start, end, label,
+ round(scoreNormFactor*sumOfScores/(end-start)));
+}
+
+void regChromiaMergeWindows(char *input, char *output)
+/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated
+ * by Chromia.. */
+{
+struct lineFile *lf = lineFileOpen(input, TRUE);
+char *row[32];
+int rowSize = 0;
+FILE *f = mustOpen(output, "w");
+char lastLabel[128];
+lastLabel[0] = 0;
+char lastChrom[128];
+lastChrom[0] = 0;
+int lastChromStart = 0, lastChromEnd = 0;
+int regionStart = 0, regionEnd = 0;
+double sumOfScores = 0.0;
+
+for (;;)
+ {
+ /* Get next line chopped into words. Break at end of file. Check to make sure
+ * all lines have same number of words. */
+ int thisRowSize = lineFileChop(lf, row);
+ if (thisRowSize == 0)
+ break;
+ if (rowSize == 0)
+ rowSize = thisRowSize;
+ else if (rowSize != thisRowSize)
+ {
+ errAbort("First line of %s has %d words, but there are %d words in line %d",
+ lf->fileName, rowSize, thisRowSize, lf->lineIx);
+ }
+
+ /* Convert row into local variables. */
+ char *chrom = row[0];
+ int chromStart = lineFileNeedNum(lf, row, 1);
+ int chromEnd = lineFileNeedNum(lf, row, 2);
+ char *label = row[labelColumn];
+ double score = lineFileNeedDouble(lf, row, scoreColumn);
+
+ /* Make sure file is sorted with no overlap.*/
+ if (sameString(chrom, lastChrom))
+ {
+ int gapSize = chromStart - lastChromEnd;
+ if (gapSize < 0)
+ {
+ if (chromStart < lastChromStart)
+ errAbort("%s is not sorted. %s %d %d followed by %s %d %d line %d",
+ lf->fileName, lastChrom, lastChromStart, lastChromEnd,
+ chrom, chromStart, chromEnd, lf->lineIx);
+ else
+ errAbort("%s has overlaps. %s %d %d followed by %s %d %d line %d",
+ lf->fileName, lastChrom, lastChromStart, lastChromEnd,
+ chrom, chromStart, chromEnd, lf->lineIx);
+ }
+ }
+
+ /* Subtract noise threshold from score, and if not still positive just ignore line. */
+ score -= inNoiseThreshold;
+ if (score > 0)
+ {
+ /* See if we have entered a new region. */
+ boolean newRegion = FALSE;
+ if (sameString(chrom, lastChrom))
+ {
+ int gapSize = chromStart - lastChromEnd;
+ if (gapSize > maxGap)
+ newRegion = TRUE;
+ }
+ else
+ newRegion = TRUE;
+ if (!sameString(label, lastLabel))
+ newRegion = TRUE;
+
+ /* Got new region. Output old region if any, and initialize new region. */
+ if (newRegion)
+ {
+ if (regionStart != regionEnd)
+ outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
+ regionStart = chromStart;
+ sumOfScores = 0;
+ }
+
+ /* Update region. */
+ regionEnd = chromEnd;
+ sumOfScores += score;
+
+ /* Keep track of this row so can compare it to next row. */
+ safecpy(lastChrom, sizeof(lastChrom), chrom);
+ safecpy(lastLabel, sizeof(lastLabel), label);
+ lastChromStart = chromStart;
+ lastChromEnd = chromEnd;
+ }
+
+ }
+outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
+carefulClose(&f);
+lineFileClose(&lf);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+maxGap = optionInt("maxGap", maxGap);
+minSumScore = optionDouble("minSumScore", minSumScore);
+scoreNormFactor = optionDouble("scoreNormFactor", scoreNormFactor);
+inNoiseThreshold = optionDouble("inNoiseThreshold", inNoiseThreshold);
+if (argc != 3)
+ usage();
+regChromiaMergeWindows(argv[1], argv[2]);
+return 0;
+}