d83e58012a5e77fe65f840071d230e171bc6b2a6 angie Tue Oct 5 15:39:39 2010 -0700 Redmine Track #1261 (Construct track of "bad apple" coverage anomalies in human genome): Added -allowStartEqualEnd (as in hgLoadBed) so that 0-length items like insertions aren't lost -- Mary-Claire King pointed out the lack of insertions in the first version of the track, oops. Majorly refactored bedIntersect.c and added test cases. diff --git src/hg/bedIntersect/bedIntersect.c src/hg/bedIntersect/bedIntersect.c index a716395..c077021 100644 --- src/hg/bedIntersect/bedIntersect.c +++ src/hg/bedIntersect/bedIntersect.c @@ -9,14 +9,16 @@ static boolean aHitAny = FALSE; static boolean bScore = FALSE; -static float aCoverage = 0.00001; +static float minCoverage = 0.00001; static boolean strictTab = FALSE; +static boolean allowStartEqualEnd = FALSE; static struct optionSpec optionSpecs[] = { {"aHitAny", OPTION_BOOLEAN}, {"bScore", OPTION_BOOLEAN}, - {"aCoverage", OPTION_FLOAT}, + {"minCoverage", OPTION_FLOAT}, {"tab", OPTION_BOOLEAN}, + {"allowStartEqualEnd", OPTION_BOOLEAN}, {NULL, 0} }; @@ -30,21 +32,16 @@ " bedIntersect a.bed b.bed output.bed\n" "options:\n" " -aHitAny output all of a if any of it is hit by b\n" - " -aCoverage=0.N min coverage of b to output match. Default .00001\n" + " -minCoverage=0.N min coverage of b to output match (or if -aHitAny, of a).\n" + " Not applied to 0-length items. Default %f\n" " -bScore output score from b.bed (must be at least 5 field bed)\n" " -tab chop input at tabs not spaces\n" + " -allowStartEqualEnd Don't discard 0-length items of a or b\n" + " (e.g. point insertions)\n", + minCoverage ); } -struct bed3 -/* A three field bed. */ - { - struct bed3 *next; - char *chrom; /* Allocated in hash. */ - int start; /* Start (0 based) */ - int end; /* End (non-inclusive) */ - }; - struct bed5 /* A five field bed. */ { @@ -60,16 +57,16 @@ /* Read bed and return it as a hash keyed by chromName * with binKeeper values. */ { -char *row[3]; +char *row[5]; struct lineFile *lf = lineFileOpen(fileName, TRUE); -struct binKeeper *bk; struct hash *hash = newHash(0); -struct hashEl *hel; -struct bed3 *bed; +int expectedCols = bScore ? 5 : 3; -while (lineFileRow(lf, row)) +while (lineFileNextRow(lf, row, expectedCols)) { - hel = hashLookup(hash, row[0]); + struct binKeeper *bk; + struct bed5 *bed; + struct hashEl *hel = hashLookup(hash, row[0]); if (hel == NULL) { bk = binKeeperNew(0, 1024*1024*1024); @@ -80,102 +77,92 @@ bed->chrom = hel->name; bed->start = lineFileNeedNum(lf, row, 1); bed->end = lineFileNeedNum(lf, row, 2); + if (bScore) + bed->score = lineFileNeedNum(lf, row, 4); if (bed->start > bed->end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); + if (allowStartEqualEnd && bed->start == bed->end) + // Note we are tweaking binKeeper coords here, so use bed->start and bed->end. + binKeeperAdd(bk, max(0, bed->start-1), bed->end+1, bed); + else binKeeperAdd(bk, bed->start, bed->end, bed); } lineFileClose(&lf); return hash; } -struct hash *readBed5(char *fileName) -/* Read bed and return it as a hash keyed by chromName - * with binKeeper values. */ -{ -char *row[5]; -struct lineFile *lf = lineFileOpen(fileName, TRUE); -struct binKeeper *bk; -struct hash *hash = newHash(0); -struct hashEl *hel; -struct bed5 *bed; - -while (lineFileRow(lf, row)) - { - hel = hashLookup(hash, row[0]); - if (hel == NULL) +void outputBed(FILE *f, char **row, int wordCount, int aStart, int aEnd, struct bed5 *b) +/* Print a row of bed, possibly substituting in a different start, end and score. */ +{ +int s = aHitAny ? aStart : max(aStart, b->start); +int e = aHitAny ? aEnd : min(aEnd, b->end); +if (s == e && !allowStartEqualEnd) + return; +int i; +fprintf(f, "%s\t%d\t%d", row[0], s, e); +for (i = 3 ; i<wordCount; i++) { - bk = binKeeperNew(0, 511*1024*1024); - hel = hashAdd(hash, row[0], bk); + if (bScore && i==4) + fprintf(f, "\t%d", b->score); + else + fprintf(f, "\t%s", row[i]); } - bk = hel->val; - AllocVar(bed); - bed->chrom = hel->name; - bed->start = lineFileNeedNum(lf, row, 1); - bed->end = lineFileNeedNum(lf, row, 2); - bed->score = lineFileNeedNum(lf, row, 4); - if (bed->start > bed->end) - errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); - binKeeperAdd(bk, bed->start, bed->end, bed); +fprintf(f, "\n"); } -lineFileClose(&lf); -return hash; + +float getCov(int aStart, int aEnd, struct bed5 *b) +/* Compute coverage of a's start and end vs. b's, taking aHitAny and + * allowStartEqualEnd into account. */ +{ +float overlap = positiveRangeIntersection(aStart, aEnd, b->start, b->end); +float denominator = aHitAny ? (aEnd - aStart) : (b->end - b->start); +float cov = (denominator == 0) ? 0.0 : overlap/denominator; +if (allowStartEqualEnd && (aStart == aEnd || b->start == b->end)) + cov = 1.0; +return cov; } + void bedIntersect(char *aFile, char *bFile, char *outFile) /* bedIntersect - Intersect two bed files. */ { -struct hash *bHash = NULL; struct lineFile *lf = lineFileOpen(aFile, TRUE); +struct hash *bHash = readBed(bFile); FILE *f = mustOpen(outFile, "w"); char *row[40]; -char *name; -int start, end, i, wordCount; -struct binElement *hitList = NULL, *hit; -struct binKeeper *bk; -struct bed5 *bed; +int wordCount; -if (bScore) - bHash = readBed5(bFile); -else - bHash = readBed(bFile); while ((wordCount = (strictTab ? lineFileChopTab(lf, row) : lineFileChop(lf, row))) != 0) { - name = row[0]; - start = lineFileNeedNum(lf, row, 1); - end = lineFileNeedNum(lf, row, 2); + char *chrom = row[0]; + int start = lineFileNeedNum(lf, row, 1); + int end = lineFileNeedNum(lf, row, 2); if (start > end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); - bk = hashFindVal(bHash, name); + struct binKeeper *bk = hashFindVal(bHash, chrom); if (bk != NULL) { + struct binElement *hitList = NULL, *hit; + if (allowStartEqualEnd && start == end) + hitList = binKeeperFind(bk, start-1, end+1); + else hitList = binKeeperFind(bk, start, end); if (aHitAny) { - if (hitList != NULL) - { for (hit = hitList; hit != NULL; hit = hit->next) { - float overlap = (float)positiveRangeIntersection(start, end, hit->start, hit->end); - if (overlap/(float)(end-start) > aCoverage) - { - fprintf(f, "%s\t%d\t%d", name, start, end); - for (i = 3 ; i<wordCount; i++) - { - if (bScore && i==4) + float cov = getCov(start, end, hit->val); + if (cov >= minCoverage) { - bed = hit->val; - fprintf(f, "\t%d",bed->score); - } - else - fprintf(f, "\t%s",row[i]); - } - fprintf(f, "\n"); + outputBed(f, row, wordCount, start, end, hit->val); break; } else { - printf("filter out %s %d %d %d %d overlap %.1f %d %d %.3f\n", - name,start,end, hit->start,hit->end, overlap, end-start, hit->end-hit->start,overlap/(float)(end-start)); - } + struct bed5 *b = hit->val; + verbose(1, "filter out %s %d %d %d %d overlap %d %d %d %.3f\n", + chrom, start, end, b->start, b->end, + positiveRangeIntersection(start, end, b->start, b->end), + end-start, b->end-b->start, cov); } } } @@ -183,24 +170,8 @@ { for (hit = hitList; hit != NULL; hit = hit->next) { - int s = max(start, hit->start); - int e = min(end, hit->end); - float overlap = (float)positiveRangeIntersection(start, end, hit->start, hit->end); - if ((s < e) && (overlap/(float)(hit->end-hit->start) > aCoverage)) - { - fprintf(f, "%s\t%d\t%d", name, s, e); - for (i = 3 ; i<wordCount; i++) - { - if (bScore && i==4) - { - bed = hit->val; - fprintf(f, "\t%d",bed->score); - } - else - fprintf(f, "\t%s",row[i]); - } - fprintf(f, "\n"); - } + if (getCov(start, end, hit->val) >= minCoverage) + outputBed(f, row, wordCount, start, end, hit->val); } } slFreeList(&hitList); @@ -215,8 +186,9 @@ aHitAny = optionExists("aHitAny"); bScore = optionExists("bScore"); -aCoverage = optionFloat("aCoverage", aCoverage); +minCoverage = optionFloat("minCoverage", minCoverage); strictTab = optionExists("tab"); +allowStartEqualEnd = optionExists("allowStartEqualEnd"); if (argc != 4) usage(); bedIntersect(argv[1], argv[2], argv[3]);