c8174ac2c2237b3f8f24766ca50348487ba94fa7
galt
  Wed Aug 15 14:48:47 2012 -0700
bedPileUps is a new utility for finding where how many pileups there are in input including the locations and the sizes, with a summary maximum and its location, plus average pileup size.  Matching is on chrom/start/end with name as an additional option for uniqueness.
diff --git src/utils/bedPileUps/bedPileUps.c src/utils/bedPileUps/bedPileUps.c
new file mode 100644
index 0000000..02c4431
--- /dev/null
+++ src/utils/bedPileUps/bedPileUps.c
@@ -0,0 +1,182 @@
+/* bedPileUps - Find (exact) pile-ups (overlaps) if any in bed input */
+#include "common.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "bedPileUps - Find (exact) overlaps if any in bed input\n"
+  "usage:\n"
+  "   bedPileUps in.bed\n"
+  "Where in.bed is in one of the ascii bed formats.\n"
+  "The in.bed file must be sorted by chromosome,start,\n"
+  "  to sort a bed file, use the unix sort command:\n"
+  "     sort -k1,1 -k2,2n unsorted.bed > sorted.bed\n"
+  "\n"
+  "Options:\n"
+  "  -name - include BED name field 4 when evaluating uniqueness\n"
+  "  -tab  - use tabs to parse fields\n"
+  "  -verbose=2 - show the location and size of each pileUp\n"
+  );
+}
+
+static struct optionSpec options[] = {
+   {"name", OPTION_BOOLEAN},
+   {"tab", OPTION_BOOLEAN},
+   {NULL, 0},
+};
+
+
+boolean nameOpt = FALSE;
+boolean tabSep = FALSE;
+
+void bedPileUps(char *inName)
+/* bedPileUps - Find (exact) overlaps if any in bed input (which must be sorted by chrom, start) */
+{
+char *line, *row[256];  // limit of 256 columns is arbitrary, but useful to catch pathological input
+struct lineFile *lf = lineFileOpen(inName, TRUE);
+boolean atEnd = FALSE, sameChrom = FALSE, newPile = FALSE;
+char *chrom = NULL;
+char *name = NULL;
+char *lastName = cloneString("");
+char *lastChrom = cloneString("");
+int lineNumber = 0;
+int fieldCount = 0;
+int start = 0, end = 0, lastStart = -1, lastEnd = -1;
+struct hash *chromHash = newHash(8);
+int pileUpCount = 0;
+int maxPileUpCount = 0;
+char maxPileUpLocation[1024];
+int numberOfPileUps = 0;
+long long sumOfPileUpCounts = 0;
+
+
+
+for (;;)
+    {
+    /* Get next line of input if any. */
+    if (lineFileNextReal(lf, &line))
+	{
+	++lineNumber;
+
+	/* Chop up line and make sure the word count is right. */
+	int wordCount;
+        if (tabSep)
+	    wordCount = chopTabs(line, row);
+	else
+	    wordCount = chopLine(line, row);
+	if (fieldCount == 0)
+	    {
+	    fieldCount = wordCount;
+	    if (fieldCount < 3)
+		errAbort("expecting at least 3 BED columns, found %d in line %d\n", wordCount, lineNumber);
+	    if (fieldCount < 4)
+		warn("BED 3 input does not support name column.\n");
+	    }
+	if (wordCount != fieldCount)
+	    errAbort("Expecting at least %d columns in line %d.\n", fieldCount, lineNumber);
+
+	chrom = row[0];
+	lineFileAllInts(lf, row, 1, &start, TRUE, 4, "integer", TRUE);
+	lineFileAllInts(lf, row, 2, &end  , TRUE, 4, "integer", TRUE);
+	if (fieldCount >= 4)
+	    {
+    	    name = row[3];
+	    verbose(3, "%s %d %d %s\n", chrom, start, end, name);
+	    }
+	else
+	    {
+	    verbose(3, "%s %d %d\n", chrom, start, end);
+	    }
+
+	sameChrom = sameString(chrom, lastChrom);
+	}
+    else  /* No next line */
+	{
+	atEnd = TRUE;
+	}
+
+    newPile = (!(start == lastStart && end == lastEnd && (!nameOpt || (nameOpt && sameString(name, lastName)))));
+
+    /* Check conditions to finish current pile. */
+    if (atEnd || !sameChrom || newPile)
+	{
+	// finish up any stats for the last pile we are leaving.
+	if (pileUpCount > 1)
+	    {
+	    ++numberOfPileUps;
+	    sumOfPileUpCounts += pileUpCount;
+	    if (pileUpCount > maxPileUpCount)
+		{
+		maxPileUpCount = pileUpCount;
+		safef(maxPileUpLocation, sizeof maxPileUpLocation, "%s %d %d", lastChrom, lastStart, lastEnd);
+		}
+	    if (fieldCount >=4)
+		verbose(2, "pileUp of size %d at %s %d %d %s\n", pileUpCount, lastChrom, lastStart, lastEnd, lastName);
+	    else
+		verbose(2, "pileUp of size %d at %s %d %d\n", pileUpCount, lastChrom, lastStart, lastEnd);
+	    
+	    }
+
+	if (atEnd)
+	    break;
+	}
+
+    /* Advance to next chromosome. Make sure we have never seen it before. */
+    if (!sameChrom)
+        {
+	if (hashLookup(chromHash, chrom))
+	    errAbort("Input is not sorted - have seen this chromosome %s before in line %d.\n", chrom, lineNumber);
+	hashAdd(chromHash, chrom, NULL);
+	freeMem(lastChrom);
+	lastChrom = cloneString(chrom);
+	lastStart = -1;	
+	lastEnd = -1;
+	}
+
+    /* Make sure that within a chrom, chromStart is non-decreasing */
+    if (start < lastStart)
+	errAbort("Input is not sorted - chromosome %s start %d < lastStart %d in line %d.\n", chrom, start, lastStart, lineNumber);
+
+    if (start == lastStart && end == lastEnd  && (!nameOpt || (nameOpt && sameString(name, lastName))))
+	{
+	++pileUpCount;
+	}
+    else
+	{
+    	lastStart = start;
+	lastEnd = end;
+	freeMem(lastName);
+	lastName = cloneString(name);
+	pileUpCount = 1;
+	}
+
+    }
+
+lineFileClose(&lf);
+
+/* print totals */
+if (numberOfPileUps > 0)
+    {
+    printf("Largest PileUp Size: %d is located at: %s\n", maxPileUpCount, maxPileUpLocation);
+    printf("Average PileUp Size: %3.1f\n", (double)sumOfPileUpCounts / numberOfPileUps);
+    }
+printf("Total Number of PileUps found: %d\n", numberOfPileUps);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+tabSep = optionExists("tab");
+nameOpt = optionExists("name");
+if (argc != 2)
+    usage();
+
+bedPileUps(argv[1]);
+optionFree();
+return 0;
+}