src/hg/bedItemOverlapCount/bedItemOverlapCount.c 1.15

1.15 2009/09/25 23:08:41 braney
added -bed12, -max, and -zero options. Made default counting unit size and unsigned long
Index: src/hg/bedItemOverlapCount/bedItemOverlapCount.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/bedItemOverlapCount/bedItemOverlapCount.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/hg/bedItemOverlapCount/bedItemOverlapCount.c	1 May 2009 00:25:57 -0000	1.14
+++ src/hg/bedItemOverlapCount/bedItemOverlapCount.c	25 Sep 2009 23:08:41 -0000	1.15
@@ -13,25 +13,35 @@
 #include "hdb.h"
 
 static char const rcsid[] = "$Id$";
 
-/* Command line switches. */
-//static char *strand = (char *)NULL;	/* strand to process, default +	*/
-		/* this is not implemented, the user can filter + and - */
+/* define unitSize to be a larger storage class if your counts
+ * are overflowing. */
+typedef unsigned long unitSize;
+
+#define MAXCOUNT (unitSize)~0
+#define MAXMESSAGE "Overflow of overlap counts. Max is %lu.  Recompile with bigger unitSize or use -max option"
+#define INCWOVERFLOW(x) if(counts[x] == MAXCOUNT) {if(!doMax) errAbort(MAXMESSAGE,(unsigned long)MAXCOUNT);} else counts[x]++
 
+/* Command line switches. */
 static struct hash *chromHash = NULL;
 static char *host = NULL;
 static char *user = NULL;
 static char *password = NULL;
 char *chromSizes = NULL;		/* read chrom sizes from file instead of database . */
+boolean doMax;   /* if overlap count will overflow, just keep max */
+boolean doZero;  /* add blocks with 0 counts */
+boolean doBed12;  /* expect bed12 and process block by block */
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"chromSize", OPTION_STRING},
     {"host", OPTION_STRING},
     {"user", OPTION_STRING},
     {"password", OPTION_STRING},
-    {"strand", OPTION_STRING},
+    {"max", OPTION_BOOLEAN},
+    {"zero", OPTION_BOOLEAN},
+    {"bed12", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 void usage()
@@ -40,27 +50,31 @@
 errAbort(
   "bedItemOverlapCount - count number of times a base is overlapped by the\n"
   "\titems in a bed file.  Output is bedGraph 4 to stdout.\n"
   "usage:\n"
-  " sort -k1,1 -k2,2n bedFile.bed | bedItemOverlapCount [options] <database> stdin\n"
+  " sort bedFile.bed | bedItemOverlapCount [options] <database> stdin\n"
   "or all the way to wiggle track data:\n"
-  " sort -k1,1 -k2,2n bedFile.bed \\\n"
-  "     | bedItemOverlapCount [options] <database> stdin \\\n"
+  " sort bedFile.bed | bedItemOverlapCount [options] <database> stdin \\\n"
   "         | wigEncode stdin data.wig data.wib\n"
   "options:\n"
+  "   -zero  add blocks with zero count, normally these are ommitted\n"
+  "   -bed12 expect bed12 and count based on blocks\n"
+  "          normally only first three fields are used.\n"
+  "   -max   if counts per base overflows set to max (%lu) instead of exiting\n"
   "   -chromSize=sizefile\tRead chrom sizes from file instead of database\n"
+  "             sizefile contains two white space separated fields per line:\n"
+  "		chrom name and size\n"
   "   -host=hostname\tmysql host used to get chrom sizes\n"
   "   -user=username\tmysql user\n"
   "   -password=password\tmysql password\n\n"
-  "\tchromSize file contains two white space separated fields per line: chrom name and size\n"
-  "\tYou will want to separate your + and - strand\n"
-  "\titems before sending into this program as it only looks at\n"
-  "\tthe chrom, start and end columns of the bed file.\n"
-  "\tProgram requires a <database> connection to lookup chrom sizes for a sanity\n"
-  "\tcheck of the incoming data (unless you use -chromSize argument).\n\n"
-  "The bed file must be sorted at least by chrom since the processing is\n"
-  "\tgoing to be chrom by chrom with no going back.\n"
-  " *** AND *** this is only for simple bed files without multiple blocks. ***"
+  "Notes:\n"
+  " * You may want to separate your + and - strand\n"
+  "   items before sending into this program as it only looks at\n"
+  "   the chrom, start and end columns of the bed file.\n"
+  " * Program requires a <database> connection to lookup chrom sizes for a sanity\n"
+  "   check of the incoming data (unless you use -chromSize argument).\n\n"
+  " * The bed file *must* be sorted by chrom\n"
+  " * Maximum count per base is %lu. Recompile with new unitSize to increase this\n",(unsigned long)MAXCOUNT, (unsigned long)MAXCOUNT
   );
 }
 
 static struct hash *loadAllChromInfo(char *database, unsigned *largest)
@@ -88,9 +102,9 @@
 while ((row = sqlNextRow(sr)) != NULL)
     {
     el = chromInfoLoad(row);
     if (el->size > max) max = el->size;
-    verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)&el->size);
+    verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)el->size);
     hashAdd(ret, el->chrom, (void *)(& el->size));
     }
 sqlFreeResult(&sr);
 sqlDisconnect(&conn);
@@ -107,51 +121,35 @@
     errAbort("Couldn't find size of chromosome %s", chromosome);
 return *(unsigned *)el->val;
 }
 
-static void outputCounts(unsigned short *counts, char *chrom, unsigned size)
+static void outputCounts(unitSize *counts, char *chrom, unsigned size)
 {
-int start = 0;
-int end = 0;
-int prevCount = 0;
-unsigned long i;
-for (i = 0; i < size; ++i)
-    {
-    if (counts[i])
-	{
-	if (0 == prevCount)	/*	starting an element	*/
-	    {
-	    start = i;
-	    end = start+1;
-	    }
-	else if (prevCount == counts[i])	/* continuing an element */
-	    {
-	    ++end;
-	    }
-	else		/*	prevCount != counts[i] -> finished an element */
+unitSize *start = counts;
+unitSize *lastCount = &counts[size];
+unitSize *ptr;
+
+for(ptr=counts; ptr < lastCount - 1; ptr++)
 	    {
-	    printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
-	    start = i;
-	    end = start+1;
-	    }
-	}
-    else if (prevCount)		/*	finished an element	*/
+    if (ptr[0] != ptr[1])
 	{
-	printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
-	start = i;
-	end = start+1;
+	if (doZero || (ptr[0] != 0))
+	    printf("%s\t%lu\t%lu\t%lu\n", 
+		chrom, start - counts, ptr - counts + 1, (unsigned long)ptr[0]);
+	start = ptr + 1;
 	}
-    prevCount = counts[i];
     }
-if (prevCount)		/*	last element at end of chrom	*/
-    printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
+
+if (doZero || (ptr[0] != 0))
+    printf("%s\t%lu\t%lu\t%lu\n", 
+	chrom, start - counts, ptr - counts + 1, (unsigned long)ptr[0]);
 }
 
 static void bedItemOverlapCount(char *database, int fileCount, char *fileList[])
 {
 unsigned maxChromSize = 0;
 int i;
-unsigned short *counts = (unsigned short *)NULL;
+unitSize *counts = (unitSize)NULL;
 
 if (chromSizes != NULL)
     {
     chromHash = newHash(0);
@@ -176,27 +174,32 @@
 if (maxChromSize < 1)
     errAbort("maxChromSize is zero ?");
 
 /*	Allocate just once for the largest chrom and reuse this array */
-counts = needHugeMem(sizeof(unsigned short) * maxChromSize);
+counts = needHugeMem(sizeof(unitSize) * maxChromSize);
 
 /*	Reset the array to be zero to be reused */
-memset((void *)counts, 0, sizeof(unsigned short)*(size_t)maxChromSize);
+memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize);
 
 unsigned chromSize = 0;
 char *prevChrom = (char *)NULL;
 boolean outputToDo = FALSE;
+struct hash *seenHash = newHash(5);
+
 for (i=0; i<fileCount; ++i)
-{
+    {
     struct lineFile *bf = lineFileOpen(fileList[i] , TRUE);
     struct bed *bed = (struct bed *)NULL;
-    char *row[3];
+    char *row[12];
+    int numFields = doBed12 ? 12 : 3;
 
-    while (lineFileNextRow(bf,row,ArraySize(row)))
+    while (lineFileNextRow(bf,row, numFields))
 	{
 	int i;
-	bed = bedLoadN(row, 3);
+	bed = bedLoadN(row, numFields);
+
 	verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);
+
     if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
         {
         verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
         if (outputToDo)
@@ -202,44 +205,74 @@
         if (outputToDo)
             outputCounts(counts, prevChrom, chromSize);
         outputToDo = FALSE;
         memset((void *)counts, 0,
-        sizeof(unsigned short)*(size_t)maxChromSize); /* zero counts */
-        freez(&prevChrom); // pointer is now NULL so it will be caught by next if!
+		sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */
+	    freez(&prevChrom); 
+	    // prevChrom is now NULL so it will be caught by next if!
         }
     if ((char *)NULL == prevChrom)  // begin a chr
         {
+	    if (hashLookup(seenHash, bed->chrom))
+		errAbort("ERROR:input file not sorted. %s seen before on line %d\n",
+		    bed->chrom, bf->lineIx);
+
+	    hashAdd(seenHash, bed->chrom, NULL);
         prevChrom = cloneString(bed->chrom);
         chromSize = chromosomeSize(prevChrom);
         verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize);
         }
     if (bed->chromEnd > chromSize)
         {
-        if (bed->chromStart >= chromSize || differentWord(bed->chrom,"chrM")) // circular chrom
+	    // check for circular chrM
+	    if (doBed12 || bed->chromStart>=chromSize 
+		|| differentWord(bed->chrom,"chrM")) 
             {
             warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
             bed->chromEnd);
-            errAbort("chromEnd > chromSize ?  %d > %d", bed->chromEnd,chromSize);
+		errAbort("chromEnd > chromSize ?  %d > %d", 
+		    bed->chromEnd,chromSize);
             }
+
         for (i = bed->chromStart; i < chromSize; ++i)
-            counts[i]++;
+		INCWOVERFLOW(i);
         for (i = 0; i < (bed->chromEnd - chromSize); ++i)
-            counts[i]++;
+		INCWOVERFLOW(i);
+	    }
+	else if (doBed12)
+	    {
+	    int *starts = bed->chromStarts;
+	    int *sizes = bed->blockSizes;
+	    int *endStarts = &bed->chromStarts[bed->blockCount];
+
+	    for(; starts < endStarts; starts++, sizes++)
+		{
+		unsigned int end = *starts + *sizes + bed->chromStart;
+		for (i = *starts + bed->chromStart; i < end; ++i)
+		    INCWOVERFLOW(i);
+		}
         }
     else
         {
         for (i = bed->chromStart; i < bed->chromEnd; ++i)
-            counts[i]++;
+		INCWOVERFLOW(i);
         }
     outputToDo = TRUE;
     bedFree(&bed); // plug the memory leak
     }
-    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
+
     lineFileClose(&bf);
     // Note, next file could be on same chr!
-}
+    }
+
 if (outputToDo)
     outputCounts(counts, prevChrom, chromSize);
+
+verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
+freeMem(counts);
+freez(&prevChrom);
+hashFreeWithVals(&chromHash, freez);
+freeHash(&seenHash);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
@@ -251,8 +284,12 @@
 host = optionVal("host", NULL);
 user = optionVal("user", NULL);
 password = optionVal("password", NULL);
 chromSizes = optionVal("chromSize", NULL);
+doMax = optionExists("max");
+doBed12 = optionExists("bed12");
+doZero = optionExists("zero");
 verbose(2, "#\tworking on database: %s\n", argv[1]);
 bedItemOverlapCount(argv[1], argc-2, argv+2);
+optionFree();
 return 0;
 }