src/hg/bedItemOverlapCount/bedItemOverlapCount.c 1.13
1.13 2009/04/23 00:31:12 larrym
modify code to support the normal two column chromSize file format
Index: src/hg/bedItemOverlapCount/bedItemOverlapCount.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/bedItemOverlapCount/bedItemOverlapCount.c,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 1000000 -r1.12 -r1.13
--- src/hg/bedItemOverlapCount/bedItemOverlapCount.c 21 Apr 2009 23:02:25 -0000 1.12
+++ src/hg/bedItemOverlapCount/bedItemOverlapCount.c 23 Apr 2009 00:31:12 -0000 1.13
@@ -1,264 +1,266 @@
/* bedItemOverlapCount - count how many times a base is overlapped in a
bed file */
#include "common.h"
#include "options.h"
#include "linefile.h"
#include "obscure.h"
#include "hash.h"
#include "cheapcgi.h"
#include "jksql.h"
#include "dystring.h"
#include "chromInfo.h"
#include "wiggle.h"
#include "hdb.h"
static char const rcsid[] = "$Id$";
/* Command line switches. */
//static char *strand = (char *)NULL; /* strand to process, default + */
/* this is not implemented, the user can filter + and - */
static struct hash *chromHash = NULL;
static char *host = NULL;
static char *user = NULL;
static char *password = NULL;
char *chromSizes = NULL; /* read chrom sizes from file instead of database . */
/* command line option specifications */
static struct optionSpec optionSpecs[] = {
{"chromSize", OPTION_STRING},
{"host", OPTION_STRING},
{"user", OPTION_STRING},
{"password", OPTION_STRING},
{"strand", OPTION_STRING},
{NULL, 0}
};
void usage()
/* Explain usage and exit. */
{
errAbort(
"bedItemOverlapCount - count number of times a base is overlapped by the\n"
"\titems in a bed file. Output is bedGraph 4 to stdout.\n"
"usage:\n"
" sort -k1,1 -k2,2n bedFile.bed | bedItemOverlapCount [options] <database> stdin\n"
"or all the way to wiggle track data:\n"
" sort -k1,1 -k2,2n bedFile.bed \\\n"
" | bedItemOverlapCount [options] <database> stdin \\\n"
" | wigEncode stdin data.wig data.wib\n"
"options:\n"
" -chromSize=sizefile\tRead chrom sizes from file instead of database\n"
" -host=hostname\tmysql host used to get chrom sizes\n"
" -user=username\tmysql user\n"
" -password=password\tmysql password\n\n"
- "\tchromSize file is three white space separated fields per line: chrom name, size, and dummy value\n"
+ "\tchromSize file contains two white space separated fields per line: chrom name and size\n"
"\tYou will want to separate your + and - strand\n"
"\titems before sending into this program as it only looks at\n"
"\tthe chrom, start and end columns of the bed file.\n"
- "\tIt requires a <database> connection to lookup chrom sizes for a sanity\n"
+ "\tProgram requires a <database> connection to lookup chrom sizes for a sanity\n"
"\tcheck of the incoming data (unless you use -chromSize argument).\n\n"
"The bed file must be sorted at least by chrom since the processing is\n"
"\tgoing to be chrom by chrom with no going back.\n"
" *** AND *** this is only for simple bed files without multiple blocks. ***"
);
}
static struct hash *loadAllChromInfo(char *database, unsigned *largest)
/* Load up all chromosome infos, return largest one if asked to do so. */
{
struct chromInfo *el;
struct sqlConnection *conn = NULL;
struct sqlResult *sr = NULL;
struct hash *ret;
char **row;
unsigned max = 0;
if(host)
{
conn = sqlConnectRemote(host, user, password, database);
}
else
{
conn = sqlConnect(database);
}
ret = newHash(0);
sr = sqlGetResult(conn, "select * from chromInfo");
while ((row = sqlNextRow(sr)) != NULL)
{
el = chromInfoLoad(row);
if (el->size > max) max = el->size;
verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)&el->size);
hashAdd(ret, el->chrom, (void *)(& el->size));
}
sqlFreeResult(&sr);
sqlDisconnect(&conn);
if (largest) *largest = max;
return ret;
}
static unsigned chromosomeSize(char *chromosome)
/* Return full extents of chromosome. Warn and fill in if none. */
{
struct hashEl *el = hashLookup(chromHash,chromosome);
if (el == NULL)
errAbort("Couldn't find size of chromosome %s", chromosome);
return *(unsigned *)el->val;
}
static void outputCounts(unsigned short *counts, char *chrom, unsigned size)
{
int start = 0;
int end = 0;
int prevCount = 0;
unsigned long i;
for (i = 0; i < size; ++i)
{
if (counts[i])
{
if (0 == prevCount) /* starting an element */
{
start = i;
end = start+1;
}
else if (prevCount == counts[i]) /* continuing an element */
{
++end;
}
else /* prevCount != counts[i] -> finished an element */
{
printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
start = i;
end = start+1;
}
}
else if (prevCount) /* finished an element */
{
printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
start = i;
end = start+1;
}
prevCount = counts[i];
}
if (prevCount) /* last element at end of chrom */
printf("%s\t%d\t%d\t%d\n", chrom, start, end, prevCount);
}
static void bedItemOverlapCount(char *database, int fileCount, char *fileList[])
{
unsigned maxChromSize = 0;
int i;
unsigned short *counts = (unsigned short *)NULL;
char *prevChrom = (char *)NULL;
boolean outputToDo = FALSE;
unsigned chromSize = 0;
if (chromSizes != NULL)
{
chromHash = newHash(0);
- // unfortunately, chromInfoLoadAll requires that the file have three fields (I don't know why),
- // so that's why we require a dummy third column in the chromInfo file.
- struct chromInfo *el = chromInfoLoadAll(chromSizes);
- for(;el != NULL;el=el->next)
- {
- if (el->size > maxChromSize) maxChromSize = el->size;
- verbose(4, "Add hash %s value %u (%#lx)\n", el->chrom, el->size, (unsigned long)&el->size);
- hashAdd(chromHash, el->chrom, (void *)(& el->size));
+ struct lineFile *lf = lineFileOpen(chromSizes, TRUE);
+ char *row[2];
+ while (lineFileRow(lf, row))
+ {
+ unsigned *ptr;
+ AllocVar(ptr);
+ *ptr = sqlUnsigned(row[1]);
+ maxChromSize = max(*ptr, maxChromSize);
+ hashAdd(chromHash, row[0], ptr);
}
+ lineFileClose(&lf);
}
else
{
chromHash = loadAllChromInfo(database, &maxChromSize);
}
verbose(2,"#\tmaxChromSize: %u\n", maxChromSize);
if (maxChromSize < 1)
errAbort("maxChromSize is zero ?");
/* Allocate just once for the largest chrom and reuse this array */
counts = needHugeMem(sizeof(unsigned short) * maxChromSize);
/* Reset the array to be zero to be reused */
memset((void *)counts, 0, sizeof(unsigned short)*(size_t)maxChromSize);
for (i=0; i<fileCount; ++i)
{
struct lineFile *bf = lineFileOpen(fileList[i] , TRUE);
unsigned thisChromSize = 0;
struct bed *bed = (struct bed *)NULL;
char *row[3];
while (lineFileNextRow(bf,row,ArraySize(row)))
{
int i;
bed = bedLoadN(row, 3);
verbose(3,"#\t%s\t%d\t%d\n",bed->chrom,bed->chromStart, bed->chromEnd);
if (prevChrom && differentWord(bed->chrom,prevChrom))
{
chromSize = chromosomeSize(prevChrom);
verbose(2,"#\tchrom %s done, size %d\n", prevChrom, chromSize);
if (outputToDo)
outputCounts(counts, prevChrom, chromSize);
outputToDo = FALSE;
memset((void *)counts, 0,
sizeof(unsigned short)*(size_t)maxChromSize); /* zero counts */
freeMem(prevChrom);
prevChrom = cloneString(bed->chrom);
thisChromSize = chromosomeSize(prevChrom);
verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,
thisChromSize);
}
else if ((char *)NULL == prevChrom)
{
prevChrom = cloneString(bed->chrom);
chromSize = chromosomeSize(prevChrom);
thisChromSize = chromosomeSize(prevChrom);
verbose(2,"#\tchrom %s starting, size %d\n", prevChrom,
thisChromSize);
}
if (bed->chromEnd > thisChromSize)
{
if (bed->chromStart >= thisChromSize || differentWord(bed->chrom,"chrM")) // circular chrom
{
warn("ERROR: %s\t%d\t%d", bed->chrom, bed->chromStart,
bed->chromEnd);
errAbort("chromEnd > chromSize ? %d > %d", bed->chromEnd,
thisChromSize);
}
for (i = bed->chromStart; i < thisChromSize; ++i)
counts[i]++;
for (i = 0; i < (bed->chromEnd - thisChromSize); ++i)
counts[i]++;
}
else
{
for (i = bed->chromStart; i < bed->chromEnd; ++i)
counts[i]++;
}
outputToDo = TRUE;
bedFree(&bed); // plug the memory leak
}
verbose(2,"#\tchrom %s done, size %d\n", prevChrom, thisChromSize);
lineFileClose(&bf);
}
if (outputToDo)
outputCounts(counts, prevChrom, chromSize);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, optionSpecs);
if (argc < 2)
usage();
host = optionVal("host", NULL);
user = optionVal("user", NULL);
password = optionVal("password", NULL);
chromSizes = optionVal("chromSize", NULL);
verbose(2, "#\tworking on database: %s\n", argv[1]);
bedItemOverlapCount(argv[1], argc-2, argv+2);
return 0;
}