src/hg/bedItemOverlapCount/bedItemOverlapCount.c 1.15
1.15 2009/09/25 23:08:41 braney
added -bed12, -max, and -zero options. Made default counting unit size and unsigned long
Index: src/hg/bedItemOverlapCount/bedItemOverlapCount.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/bedItemOverlapCount/bedItemOverlapCount.c,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/hg/bedItemOverlapCount/bedItemOverlapCount.c 1 May 2009 00:25:57 -0000 1.14
+++ src/hg/bedItemOverlapCount/bedItemOverlapCount.c 25 Sep 2009 23:08:41 -0000 1.15
@@ -13,25 +13,35 @@
#include "hdb.h"
static char const rcsid[] = "$Id$";
-/* Command line switches. */
-//static char *strand = (char *)NULL; /* strand to process, default + */
- /* this is not implemented, the user can filter + and - */
+/* define unitSize to be a larger storage class if your counts
+ * are overflowing. */
+typedef unsigned long unitSize;
+
+#define MAXCOUNT (unitSize)~0
+#define MAXMESSAGE "Overflow of overlap counts. Max is %lu. Recompile with bigger unitSize or use -max option"
+#define INCWOVERFLOW(x) if(counts[x] == MAXCOUNT) {if(!doMax) errAbort(MAXMESSAGE,(unsigned long)MAXCOUNT);} else counts[x]++
+/* Command line switches. */
static struct hash *chromHash = NULL;
static char *host = NULL;
static char *user = NULL;
static char *password = NULL;
char *chromSizes = NULL; /* read chrom sizes from file instead of database . */
+boolean doMax; /* if overlap count will overflow, just keep max */
+boolean doZero; /* add blocks with 0 counts */
+boolean doBed12; /* expect bed12 and process block by block */
/* command line option specifications */
static struct optionSpec optionSpecs[] = {
{"chromSize", OPTION_STRING},
{"host", OPTION_STRING},
{"user", OPTION_STRING},
{"password", OPTION_STRING},
- {"strand", OPTION_STRING},
+ {"max", OPTION_BOOLEAN},
+ {"zero", OPTION_BOOLEAN},
+ {"bed12", OPTION_BOOLEAN},
{NULL, 0}
};
void usage()
@@ -40,27 +50,31 @@
errAbort(
"bedItemOverlapCount - count number of times a base is overlapped by the\n"
"\titems in a bed file. Output is bedGraph 4 to stdout.\n"
"usage:\n"
- " sort -k1,1 -k2,2n bedFile.bed | bedItemOverlapCount [options] <database> stdin\n"
+ " sort bedFile.bed | bedItemOverlapCount [options] <database> stdin\n"
"or all the way to wiggle track data:\n"
- " sort -k1,1 -k2,2n bedFile.bed \\\n"
- " | bedItemOverlapCount [options] <database> stdin \\\n"
+ " sort bedFile.bed | bedItemOverlapCount [options] <database> stdin \\\n"
" | wigEncode stdin data.wig data.wib\n"
"options:\n"
+ " -zero add blocks with zero count, normally these are ommitted\n"
+ " -bed12 expect bed12 and count based on blocks\n"
+ " normally only first three fields are used.\n"
+ " -max if counts per base overflows set to max (%lu) instead of exiting\n"
" -chromSize=sizefile\tRead chrom sizes from file instead of database\n"
+ " sizefile contains two white space separated fields per line:\n"
+ " chrom name and size\n"
" -host=hostname\tmysql host used to get chrom sizes\n"
" -user=username\tmysql user\n"
" -password=password\tmysql password\n\n"
- "\tchromSize file contains two white space separated fields per line: chrom name and size\n"
- "\tYou will want to separate your + and - strand\n"
- "\titems before sending into this program as it only looks at\n"
- "\tthe chrom, start and end columns of the bed file.\n"
- "\tProgram requires a <database> connection to lookup chrom sizes for a sanity\n"
- "\tcheck of the incoming data (unless you use -chromSize argument).\n\n"
- "The bed file must be sorted at least by chrom since the processing is\n"
- "\tgoing to be chrom by chrom with no going back.\n"
- " *** AND *** this is only for simple bed files without multiple blocks. ***"
+ "Notes:\n"
+ " * You may want to separate your + and - strand\n"
+ " items before sending into this program as it only looks at\n"
+ " the chrom, start and end columns of the bed file.\n"
+ " * Program requires a <database> connection to lookup chrom sizes for a sanity\n"
+ " check of the incoming data (unless you use -chromSize argument).\n\n"
+ " * The bed file *must* be sorted by chrom\n"
+ " * Maximum count per base is %lu. Recompile with new unitSize to increase this\n",(unsigned long)MAXCOUNT, (unsigned long)MAXCOUNT
);
}
static struct hash *loadAllChromInfo(char *database, unsigned *largest)
@@ -88,9 +102,9 @@
while ((row = sqlNextRow(sr)) != NULL)
{
el = chromInfoLoad(row);
if (el->size > max) max = el->size;
- verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)&el->size);
+ verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)el->size);
hashAdd(ret, el->chrom, (void *)(& el->size));
}
sqlFreeResult(&sr);
sqlDisconnect(&conn);
@@ -107,51 +121,35 @@
errAbort("Couldn't find size of chromosome %s", chromosome);
return *(unsigned *)el->val;
}
-static void outputCounts(unsigned short *counts, char *chrom, unsigned size)
+static void outputCounts(unitSize *counts, char *chrom, unsigned size)
{
-int start = 0;
-int end = 0;
-int prevCount = 0;
-unsigned long i;
-for (i = 0; i < size; ++i)
- {
- if (counts[i])
- {
- if (0 == prevCount) /* starting an element */
- {
- start = i;
- end = start+1;
- }
- else if (prevCount == counts[i]) /* continuing an element */
- {
- ++end;
- }
- else /* prevCount != counts[i] -> finished an element */
+unitSize *start = counts;
+unitSize *lastCount = &counts[size];
+unitSize *ptr;
+
+for(ptr=counts; ptr < lastCount - 1; ptr++)
{
- printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
- start = i;
- end = start+1;
- }
- }
- else if (prevCount) /* finished an element */
+ if (ptr[0] != ptr[1])
{
- printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
- start = i;
- end = start+1;
+ if (doZero || (ptr[0] != 0))
+ printf("%s\t%lu\t%lu\t%lu\n",
+ chrom, start - counts, ptr - counts + 1, (unsigned long)ptr[0]);
+ start = ptr + 1;
}
- prevCount = counts[i];
}
-if (prevCount) /* last element at end of chrom */
- printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
+
+if (doZero || (ptr[0] != 0))
+ printf("%s\t%lu\t%lu\t%lu\n",
+ chrom, start - counts, ptr - counts + 1, (unsigned long)ptr[0]);
}
static void bedItemOverlapCount(char *database, int fileCount, char *fileList[])
{
unsigned maxChromSize = 0;
int i;
-unsigned short *counts = (unsigned short *)NULL;
+unitSize *counts = (unitSize)NULL;
if (chromSizes != NULL)
{
chromHash = newHash(0);
@@ -176,27 +174,32 @@
if (maxChromSize < 1)
errAbort("maxChromSize is zero ?");
/* Allocate just once for the largest chrom and reuse this array */
-counts = needHugeMem(sizeof(unsigned short) * maxChromSize);
+counts = needHugeMem(sizeof(unitSize) * maxChromSize);
/* Reset the array to be zero to be reused */
-memset((void *)counts, 0, sizeof(unsigned short)*(size_t)maxChromSize);
+memset((void *)counts, 0, sizeof(unitSize)*(size_t)maxChromSize);
unsigned chromSize = 0;
char *prevChrom = (char *)NULL;
boolean outputToDo = FALSE;
+struct hash *seenHash = newHash(5);
+
for (i=0; i<fileCount; ++i)
-{
+ {
struct lineFile *bf = lineFileOpen(fileList[i] , TRUE);
struct bed *bed = (struct bed *)NULL;
- char *row[3];
+ char *row[12];
+ int numFields = doBed12 ? 12 : 3;
- while (lineFileNextRow(bf,row,ArraySize(row)))
+ while (lineFileNextRow(bf,row, numFields))
{
int i;
- bed = bedLoadN(row, 3);
+ bed = bedLoadN(row, numFields);
+
verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);
+
if (prevChrom && differentWord(bed->chrom,prevChrom)) // End a chr
{
verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
if (outputToDo)
@@ -202,44 +205,74 @@
if (outputToDo)
outputCounts(counts, prevChrom, chromSize);
outputToDo = FALSE;
memset((void *)counts, 0,
- sizeof(unsigned short)*(size_t)maxChromSize); /* zero counts */
- freez(&prevChrom); // pointer is now NULL so it will be caught by next if!
+ sizeof(unitSize)*(size_t)maxChromSize); /* zero counts */
+ freez(&prevChrom);
+ // prevChrom is now NULL so it will be caught by next if!
}
if ((char *)NULL == prevChrom) // begin a chr
{
+ if (hashLookup(seenHash, bed->chrom))
+ errAbort("ERROR:input file not sorted. %s seen before on line %d\n",
+ bed->chrom, bf->lineIx);
+
+ hashAdd(seenHash, bed->chrom, NULL);
prevChrom = cloneString(bed->chrom);
chromSize = chromosomeSize(prevChrom);
verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,chromSize);
}
if (bed->chromEnd > chromSize)
{
- if (bed->chromStart >= chromSize || differentWord(bed->chrom,"chrM")) // circular chrom
+ // check for circular chrM
+ if (doBed12 || bed->chromStart>=chromSize
+ || differentWord(bed->chrom,"chrM"))
{
warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
bed->chromEnd);
- errAbort("chromEnd > chromSize ? %d > %d", bed->chromEnd,chromSize);
+ errAbort("chromEnd > chromSize ? %d > %d",
+ bed->chromEnd,chromSize);
}
+
for (i = bed->chromStart; i < chromSize; ++i)
- counts[i]++;
+ INCWOVERFLOW(i);
for (i = 0; i < (bed->chromEnd - chromSize); ++i)
- counts[i]++;
+ INCWOVERFLOW(i);
+ }
+ else if (doBed12)
+ {
+ int *starts = bed->chromStarts;
+ int *sizes = bed->blockSizes;
+ int *endStarts = &bed->chromStarts[bed->blockCount];
+
+ for(; starts < endStarts; starts++, sizes++)
+ {
+ unsigned int end = *starts + *sizes + bed->chromStart;
+ for (i = *starts + bed->chromStart; i < end; ++i)
+ INCWOVERFLOW(i);
+ }
}
else
{
for (i = bed->chromStart; i < bed->chromEnd; ++i)
- counts[i]++;
+ INCWOVERFLOW(i);
}
outputToDo = TRUE;
bedFree(&bed); // plug the memory leak
}
- verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
+
lineFileClose(&bf);
// Note, next file could be on same chr!
-}
+ }
+
if (outputToDo)
outputCounts(counts, prevChrom, chromSize);
+
+verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
+freeMem(counts);
+freez(&prevChrom);
+hashFreeWithVals(&chromHash, freez);
+freeHash(&seenHash);
}
int main(int argc, char *argv[])
/* Process command line. */
@@ -251,8 +284,12 @@
host = optionVal("host", NULL);
user = optionVal("user", NULL);
password = optionVal("password", NULL);
chromSizes = optionVal("chromSize", NULL);
+doMax = optionExists("max");
+doBed12 = optionExists("bed12");
+doZero = optionExists("zero");
verbose(2, "#\tworking on database: %s\n", argv[1]);
bedItemOverlapCount(argv[1], argc-2, argv+2);
+optionFree();
return 0;
}