478ab9fb83f3dc56b5eba870627fe40374a76529 kent Mon Jan 24 10:28:29 2011 -0800 Adding new bedRemoveOverlap utility. diff --git src/utils/bedRemoveOverlap/bedRemoveOverlap.c src/utils/bedRemoveOverlap/bedRemoveOverlap.c new file mode 100644 index 0000000..fa2292b --- /dev/null +++ src/utils/bedRemoveOverlap/bedRemoveOverlap.c @@ -0,0 +1,151 @@ +/* bedRemoveOverlap - Remove overlapping records from a (sorted) bed file. Gets rid of the + * smaller of overlapping records.. */ +#include "common.h" +#include "linefile.h" +#include "hash.h" +#include "options.h" + +static char const rcsid[] = "$Id: newProg.c,v 1.30 2010/03/24 21:18:33 hiram Exp $"; + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "bedRemoveOverlap - Remove overlapping records from a (sorted) bed file. Gets rid of\n" + "`the smaller of overlapping records.\n" + "usage:\n" + " bedRemoveOverlap in.bed out.bed\n" + "options:\n" + " -xxx=XXX\n" + ); +} + +static struct optionSpec options[] = { + {NULL, 0}, +}; + +boolean isCommentLine(char *line) +/* Return TRUE if line is a BED comment - blank or starting with '#' */ +{ +char *s = skipLeadingSpaces(line); +char c = s[0]; +return c == '#' || c == 0; +} + +void bedRemoveOverlap(char *input, char *output) +/* bedRemoveOverlap - Remove overlapping records from a (sorted) bed file. Gets rid of + * the smaller of overlapping records.. */ +{ +struct lineFile *lf = lineFileOpen(input, TRUE); +FILE *f = mustOpen(output, "w"); +char *prevChrom = NULL; +unsigned int prevStart = 0, prevEnd = 0; +char *row[1024]; +int wordCount; +int maxLineSize = 1024*1024; +char *prevLine = needMem(maxLineSize); +char *curLine = needMem(maxLineSize); +int lineSize; +char *line; +boolean firstTime = TRUE; +while (lineFileNext(lf, &line, &lineSize)) + { + /* Skip comments and make sure other lines not too long. */ + if (isCommentLine(line)) + continue; + if (lineSize >= maxLineSize) + errAbort("Line too long (%d chars, max is %d) line %d of %s", lineSize, + maxLineSize, lf->lineIx, lf->fileName); + + /* Swap existing prevLine and curLine buffers. */ + char *tmp = prevLine; + prevLine = curLine; + curLine = tmp; + + /* Save current line for use next time through loop. */ + strcpy(curLine, line); + + /* Parse current line. */ + wordCount = chopLine(line, row); + if (wordCount == ArraySize(row)) + errAbort("Too many fields (%d max is %lu) line %d of %s", wordCount, ArraySize(row), + lf->lineIx, lf->fileName); + char *chrom = row[0]; + unsigned int start = lineFileNeedNum(lf, row, 1); + unsigned int end = lineFileNeedNum(lf, row, 2); + + if (firstTime) + { + /* First time through the loop don't output anything. */ + firstTime = FALSE; + prevChrom = cloneString(chrom); + } + else + { + /* Check to see if we are on a new chromosome. */ + int cmp = strcmp(chrom, prevChrom); + if (cmp != 0) + { + if (cmp < 0) + errAbort("%s not sorted line %d, %s before %s", + lf->fileName, lf->lineIx, prevChrom, chrom); + fprintf(f, "%s\n", prevLine); + freeMem(prevChrom); + prevChrom = cloneString(chrom); + } + else + { + if (prevStart > start) + errAbort("File not sorted line %d of %s", lf->lineIx, lf->fileName); + if (rangeIntersection(start, end, prevStart, prevEnd) > 0) + { + /* Detected overlap. Don't overlap anything this time through. + * instead dummy things up so bigger piece of overlap gets put + * out next time. */ + int curSize = end - start; + int prevSize = prevEnd - prevStart; + if (prevSize > curSize) + { + verbose(2, "%s:%d-%d overlaps %s:%d-%d. Skipping %s:%d-%d line %d of %s\n", + prevChrom, prevStart, prevEnd, chrom, start, end, chrom, start, end, + lf->lineIx, lf->fileName); + start = prevStart; + end = prevEnd; + strcpy(curLine, prevLine); + } + else + { + verbose(2, "%s:%d-%d overlaps %s:%d-%d. Skipping %s:%d-%d line %d of %s\n", + prevChrom, prevStart, prevEnd, chrom, start, end, + prevChrom, prevStart, prevEnd, lf->lineIx, lf->fileName); + /* This last bit is only needed for last line of file... */ + prevStart = start; + prevEnd = end; + strcpy(prevLine, curLine); + } + } + else + fprintf(f, "%s\n", prevLine); + } + } + prevStart = start; + prevEnd = end; + } + +/* Print last line if any. */ +if (!firstTime) + fprintf(f, "%s\n", curLine); + + +carefulClose(&f); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 3) + usage(); +bedRemoveOverlap(argv[1], argv[2]); +return 0; +}