src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c 1.1

1.1 2010/04/29 20:52:47 kent
Making a clusterer just for Chromia.
Index: src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c
===================================================================
RCS file: src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c
diff -N src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/regulate/regChromiaMergeWindows/regChromiaMergeWindows.c	29 Apr 2010 20:52:47 -0000	1.1
@@ -0,0 +1,163 @@
+/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated by 
+ * Chromia.. */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+
+static char const rcsid[] = "$Id$";
+
+int scoreColumn = 4;
+int labelColumn = 3;
+int maxGap = 1200;
+double minSumScore = 10.0;
+double scoreNormFactor = 10000.0;
+double inNoiseThreshold = 0.0;
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file \n"
+  "generated by Chromia.\n"
+  "usage:\n"
+  "   regChromiaMergeWindows chromiaInput output.bed\n"
+  "where chromiaInput is tab-separated with the following columns:\n"
+  "   <chrom> <start> <end> <label> <score>\n"
+  "options:\n"
+  "   -maxGap=N - maximum number of bases between merged windows.  Default %d\n"
+  "   -minSumScore=N.N - minimum sum of scores in region to output. Default %g\n"
+  "   -scoreNormFactor=N.N - normalization factor to get scores in 0-1000. Default %g\n"
+  "   -inNoiseThreshold=N.N - input lines scoring below this threshold are ignored\n"
+  , maxGap, minSumScore, scoreNormFactor
+  );
+}
+
+static struct optionSpec options[] = {
+   {"maxGap", OPTION_INT},
+   {"minSumScore", OPTION_DOUBLE},
+   {"scoreNormFactor", OPTION_DOUBLE},
+   {"inNoiseThreshold", OPTION_INT},
+   {NULL, 0},
+};
+
+
+void outputRegion(FILE *f, char *chrom, int start, int end, char *label, double sumOfScores)
+/* Output region to file. */
+{
+if (sumOfScores > minSumScore)
+    fprintf(f, "%s\t%d\t%d\t%s\t%d\n", chrom, start, end, label, 
+    	round(scoreNormFactor*sumOfScores/(end-start)));
+}
+
+void regChromiaMergeWindows(char *input, char *output)
+/* regChromiaMergeWindows - Merge adjacent identically labeled windows in BED file generated 
+ * by Chromia.. */
+{
+struct lineFile *lf = lineFileOpen(input, TRUE);
+char *row[32];
+int rowSize = 0;
+FILE *f = mustOpen(output, "w");
+char lastLabel[128];
+lastLabel[0] = 0;
+char lastChrom[128];
+lastChrom[0] = 0;
+int lastChromStart = 0, lastChromEnd = 0;
+int regionStart = 0, regionEnd = 0;
+double sumOfScores = 0.0;
+
+for (;;)
+    {
+    /* Get next line chopped into words.  Break at end of file. Check to make sure
+     * all lines have same number of words. */
+    int thisRowSize = lineFileChop(lf, row);
+    if (thisRowSize == 0)
+        break;
+    if (rowSize == 0)
+        rowSize = thisRowSize;
+    else if (rowSize != thisRowSize)
+        {
+	errAbort("First line of %s has %d words, but there are %d words in line %d",
+		lf->fileName, rowSize, thisRowSize, lf->lineIx);
+	}
+
+    /* Convert row into local variables. */
+    char *chrom = row[0];
+    int chromStart = lineFileNeedNum(lf, row, 1);
+    int chromEnd = lineFileNeedNum(lf, row, 2);
+    char *label = row[labelColumn];
+    double score = lineFileNeedDouble(lf, row, scoreColumn);
+    
+    /* Make sure file is sorted with no overlap.*/
+    if (sameString(chrom, lastChrom))
+        {
+	int gapSize = chromStart - lastChromEnd;
+	if (gapSize < 0)
+	    {
+	    if (chromStart < lastChromStart)
+		errAbort("%s is not sorted. %s %d %d followed by %s %d %d line %d",
+		    lf->fileName, lastChrom, lastChromStart, lastChromEnd,
+		    chrom, chromStart, chromEnd, lf->lineIx);
+	    else  
+		errAbort("%s has overlaps. %s %d %d followed by %s %d %d line %d",
+		    lf->fileName, lastChrom, lastChromStart, lastChromEnd,
+		    chrom, chromStart, chromEnd, lf->lineIx);
+	    }
+	}
+
+    /* Subtract noise threshold from score, and if not still positive just ignore line. */
+    score -= inNoiseThreshold;
+    if (score > 0)
+	{
+	/* See if we have entered a new region. */
+	boolean newRegion = FALSE;
+	if (sameString(chrom, lastChrom))
+	    {
+	    int gapSize = chromStart - lastChromEnd;
+	    if (gapSize > maxGap)
+		 newRegion = TRUE;
+	    }
+	else
+	    newRegion = TRUE;
+	if (!sameString(label, lastLabel))
+	    newRegion = TRUE;
+	    
+	/* Got new region.  Output old region if any, and initialize new region. */
+	if (newRegion)
+	    {
+	    if (regionStart != regionEnd)
+		outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
+	    regionStart = chromStart;
+	    sumOfScores = 0;
+	    }
+
+	/* Update region. */
+	regionEnd = chromEnd;
+	sumOfScores += score;
+
+	/* Keep track of this row so can compare it to next row. */
+	safecpy(lastChrom, sizeof(lastChrom), chrom);
+	safecpy(lastLabel, sizeof(lastLabel), label);
+	lastChromStart = chromStart;
+	lastChromEnd = chromEnd;
+	}
+
+    }
+outputRegion(f, lastChrom, regionStart, regionEnd, lastLabel, sumOfScores);
+carefulClose(&f);
+lineFileClose(&lf);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+maxGap = optionInt("maxGap", maxGap);
+minSumScore = optionDouble("minSumScore", minSumScore);
+scoreNormFactor = optionDouble("scoreNormFactor", scoreNormFactor);
+inNoiseThreshold = optionDouble("inNoiseThreshold", inNoiseThreshold);
+if (argc != 3)
+    usage();
+regChromiaMergeWindows(argv[1], argv[2]);
+return 0;
+}