src/hg/bedItemOverlapCount/bedItemOverlapCount.c 1.14

1.14 2009/05/01 00:25:57 tdreszer
Fixed bug where last chr could be truncated if the previous one was smaller
Index: src/hg/bedItemOverlapCount/bedItemOverlapCount.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/bedItemOverlapCount/bedItemOverlapCount.c,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 1000000 -r1.13 -r1.14
--- src/hg/bedItemOverlapCount/bedItemOverlapCount.c	23 Apr 2009 00:31:12 -0000	1.13
+++ src/hg/bedItemOverlapCount/bedItemOverlapCount.c	1 May 2009 00:25:57 -0000	1.14
@@ -1,266 +1,258 @@
 /* bedItemOverlapCount - count how many times a base is overlapped in a
 	bed file */
 #include "common.h"
 #include "options.h"
 #include "linefile.h"
 #include "obscure.h"
 #include "hash.h"
 #include "cheapcgi.h"
 #include "jksql.h"
 #include "dystring.h"
 #include "chromInfo.h"
 #include "wiggle.h"
 #include "hdb.h"
 
 static char const rcsid[] = "$Id$";
 
 /* Command line switches. */
 //static char *strand = (char *)NULL;	/* strand to process, default +	*/
 		/* this is not implemented, the user can filter + and - */
 
 static struct hash *chromHash = NULL;
 static char *host = NULL;
 static char *user = NULL;
 static char *password = NULL;
 char *chromSizes = NULL;		/* read chrom sizes from file instead of database . */
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"chromSize", OPTION_STRING},
     {"host", OPTION_STRING},
     {"user", OPTION_STRING},
     {"password", OPTION_STRING},
     {"strand", OPTION_STRING},
     {NULL, 0}
 };
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "bedItemOverlapCount - count number of times a base is overlapped by the\n"
   "\titems in a bed file.  Output is bedGraph 4 to stdout.\n"
   "usage:\n"
   " sort -k1,1 -k2,2n bedFile.bed | bedItemOverlapCount [options] <database> stdin\n"
   "or all the way to wiggle track data:\n"
   " sort -k1,1 -k2,2n bedFile.bed \\\n"
   "     | bedItemOverlapCount [options] <database> stdin \\\n"
   "         | wigEncode stdin data.wig data.wib\n"
   "options:\n"
   "   -chromSize=sizefile\tRead chrom sizes from file instead of database\n"
   "   -host=hostname\tmysql host used to get chrom sizes\n"
   "   -user=username\tmysql user\n"
   "   -password=password\tmysql password\n\n"
   "\tchromSize file contains two white space separated fields per line: chrom name and size\n"
   "\tYou will want to separate your + and - strand\n"
   "\titems before sending into this program as it only looks at\n"
   "\tthe chrom, start and end columns of the bed file.\n"
   "\tProgram requires a <database> connection to lookup chrom sizes for a sanity\n"
   "\tcheck of the incoming data (unless you use -chromSize argument).\n\n"
   "The bed file must be sorted at least by chrom since the processing is\n"
   "\tgoing to be chrom by chrom with no going back.\n"
   " *** AND *** this is only for simple bed files without multiple blocks. ***"
   );
 }
 
 static struct hash *loadAllChromInfo(char *database, unsigned *largest)
 /* Load up all chromosome infos, return largest one if asked to do so. */
 {
 struct chromInfo *el;
 struct sqlConnection *conn = NULL;
 struct sqlResult *sr = NULL;
 struct hash *ret;
 char **row;
 unsigned max = 0;
 
 if(host)
     {
     conn = sqlConnectRemote(host, user, password, database);
     }
 else
     {
     conn = sqlConnect(database);
     }
 
 ret = newHash(0);
 
 sr = sqlGetResult(conn, "select * from chromInfo");
 while ((row = sqlNextRow(sr)) != NULL)
     {
     el = chromInfoLoad(row);
     if (el->size > max) max = el->size;
     verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)&el->size);
     hashAdd(ret, el->chrom, (void *)(& el->size));
     }
 sqlFreeResult(&sr);
 sqlDisconnect(&conn);
 if (largest) *largest = max;
 return ret;
 }
 
 static unsigned chromosomeSize(char *chromosome)
 /* Return full extents of chromosome.  Warn and fill in if none. */
 {
 struct hashEl *el = hashLookup(chromHash,chromosome);
 
 if (el == NULL)
     errAbort("Couldn't find size of chromosome %s", chromosome);
 return *(unsigned *)el->val;
 }
 
 static void outputCounts(unsigned short *counts, char *chrom, unsigned size)
 {
 int start = 0;
 int end = 0;
 int prevCount = 0;
 unsigned long i;
 for (i = 0; i < size; ++i)
     {
     if (counts[i])
 	{
 	if (0 == prevCount)	/*	starting an element	*/
 	    {
 	    start = i;
 	    end = start+1;
 	    }
 	else if (prevCount == counts[i])	/* continuing an element */
 	    {
 	    ++end;
 	    }
 	else		/*	prevCount != counts[i] -> finished an element */
 	    {
 	    printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
 	    start = i;
 	    end = start+1;
 	    }
 	}
     else if (prevCount)		/*	finished an element	*/
 	{
 	printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
 	start = i;
 	end = start+1;
 	}
     prevCount = counts[i];
     }
 if (prevCount)		/*	last element at end of chrom	*/
     printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
 }
 
 static void bedItemOverlapCount(char *database, int fileCount, char *fileList[])
 {
 unsigned maxChromSize = 0;
 int i;
 unsigned short *counts = (unsigned short *)NULL;
-char *prevChrom = (char *)NULL;
-boolean outputToDo = FALSE;
-unsigned chromSize = 0;
 
 if (chromSizes != NULL)
     {
     chromHash = newHash(0);
     struct lineFile *lf = lineFileOpen(chromSizes, TRUE);
     char *row[2];
     while (lineFileRow(lf, row))
         {
         unsigned *ptr;
         AllocVar(ptr);
         *ptr = sqlUnsigned(row[1]);
         maxChromSize = max(*ptr, maxChromSize);
         hashAdd(chromHash, row[0], ptr);
         }
     lineFileClose(&lf);
     }
 else
     {
     chromHash = loadAllChromInfo(database, &maxChromSize);
     }
 
 verbose(2,"#\tmaxChromSize: %u\n", maxChromSize);
 if (maxChromSize < 1)
     errAbort("maxChromSize is zero ?");
 
 /*	Allocate just once for the largest chrom and reuse this array */
 counts = needHugeMem(sizeof(unsigned short) * maxChromSize);
 
 /*	Reset the array to be zero to be reused */
 memset((void *)counts, 0, sizeof(unsigned short)*(size_t)maxChromSize);
 
+unsigned chromSize = 0;
+char *prevChrom = (char *)NULL;
+boolean outputToDo = FALSE;
 for (i=0; i<fileCount; ++i)
 {
     struct lineFile *bf = lineFileOpen(fileList[i] , TRUE);
-    unsigned thisChromSize = 0;
     struct bed *bed = (struct bed *)NULL;
     char *row[3];
 
     while (lineFileNextRow(bf,row,ArraySize(row)))
 	{
 	int i;
 	bed = bedLoadN(row, 3);
 	verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);
-	if (prevChrom && differentWord(bed->chrom,prevChrom))
+    if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
 	    {
-	    chromSize = chromosomeSize(prevChrom);
 	    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
 	    if (outputToDo)
 		outputCounts(counts, prevChrom, chromSize);
 	    outputToDo = FALSE;
 	    memset((void *)counts, 0,
 		sizeof(unsigned short)*(size_t)maxChromSize); /* zero counts */
-	    freeMem(prevChrom);
-	    prevChrom = cloneString(bed->chrom);
-	    thisChromSize = chromosomeSize(prevChrom);
-	    verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,
-		thisChromSize);
+        freez(&prevChrom); // pointer is now NULL so it will be caught by next if!
 	    }
-	else if ((char *)NULL == prevChrom)
+    if ((char *)NULL == prevChrom)  // begin a chr
 	    {
 	    prevChrom = cloneString(bed->chrom);
 	    chromSize = chromosomeSize(prevChrom);
-	    thisChromSize = chromosomeSize(prevChrom);
-	    verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,
-		thisChromSize);
+        verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize);
 	    }
-    if (bed->chromEnd > thisChromSize)
+    if (bed->chromEnd > chromSize)
         {
-        if (bed->chromStart >= thisChromSize || differentWord(bed->chrom,"chrM")) // circular chrom
+        if (bed->chromStart >= chromSize || differentWord(bed->chrom,"chrM")) // circular chrom
             {
             warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
             bed->chromEnd);
-            errAbort("chromEnd > chromSize ?  %d > %d", bed->chromEnd,
-                thisChromSize);
+            errAbort("chromEnd > chromSize ?  %d > %d", bed->chromEnd,chromSize);
             }
-        for (i = bed->chromStart; i < thisChromSize; ++i)
+        for (i = bed->chromStart; i < chromSize; ++i)
             counts[i]++;
-        for (i = 0; i < (bed->chromEnd - thisChromSize); ++i)
+        for (i = 0; i < (bed->chromEnd - chromSize); ++i)
             counts[i]++;
         }
     else
         {
         for (i = bed->chromStart; i < bed->chromEnd; ++i)
             counts[i]++;
         }
 	outputToDo = TRUE;
 	bedFree(&bed); // plug the memory leak
 	}
-    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, thisChromSize);
+    verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
     lineFileClose(&bf);
+    // Note, next file could be on same chr!
 }
 if (outputToDo)
     outputCounts(counts, prevChrom, chromSize);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, optionSpecs);
 
 if (argc < 2)
     usage();
 host = optionVal("host", NULL);
 user = optionVal("user", NULL);
 password = optionVal("password", NULL);
 chromSizes = optionVal("chromSize", NULL);
 verbose(2, "#\tworking on database: %s\n", argv[1]);
 bedItemOverlapCount(argv[1], argc-2, argv+2);
 return 0;
 }