e95b4385c1d7c9e464fb438be4dce684c030e19e angie Fri Oct 1 09:37:19 2010 -0700 Added -tab option to bedIntersect (chopping by spaces, printing out w/tabs caused incorrect column counts). diff --git src/hg/bedIntersect/bedIntersect.c src/hg/bedIntersect/bedIntersect.c index fb1162d..a716395 100644 --- src/hg/bedIntersect/bedIntersect.c +++ src/hg/bedIntersect/bedIntersect.c @@ -1,220 +1,224 @@ /* bedIntersect - Intersect two bed files. */ #include "common.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "binRange.h" static char const rcsid[] = "$Id: bedIntersect.c,v 1.7 2007/03/07 01:46:42 angie Exp $"; static boolean aHitAny = FALSE; static boolean bScore = FALSE; static float aCoverage = 0.00001; +static boolean strictTab = FALSE; static struct optionSpec optionSpecs[] = { {"aHitAny", OPTION_BOOLEAN}, {"bScore", OPTION_BOOLEAN}, {"aCoverage", OPTION_FLOAT}, + {"tab", OPTION_BOOLEAN}, {NULL, 0} }; void usage() /* Explain usage and exit. */ { errAbort( "bedIntersect - Intersect two bed files\n" "usage:\n" " bedIntersect a.bed b.bed output.bed\n" "options:\n" " -aHitAny output all of a if any of it is hit by b\n" " -aCoverage=0.N min coverage of b to output match. Default .00001\n" " -bScore output score from b.bed (must be at least 5 field bed)\n" + " -tab chop input at tabs not spaces\n" ); } struct bed3 /* A three field bed. */ { struct bed3 *next; char *chrom; /* Allocated in hash. */ int start; /* Start (0 based) */ int end; /* End (non-inclusive) */ }; struct bed5 /* A five field bed. */ { struct bed5 *next; char *chrom; /* Allocated in hash. */ int start; /* Start (0 based) */ int end; /* End (non-inclusive) */ char *name; /* Name of item */ int score; /* Score - 0-1000 */ }; struct hash *readBed(char *fileName) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. */ { char *row[3]; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct binKeeper *bk; struct hash *hash = newHash(0); struct hashEl *hel; struct bed3 *bed; while (lineFileRow(lf, row)) { hel = hashLookup(hash, row[0]); if (hel == NULL) { bk = binKeeperNew(0, 1024*1024*1024); hel = hashAdd(hash, row[0], bk); } bk = hel->val; AllocVar(bed); bed->chrom = hel->name; bed->start = lineFileNeedNum(lf, row, 1); bed->end = lineFileNeedNum(lf, row, 2); if (bed->start > bed->end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); binKeeperAdd(bk, bed->start, bed->end, bed); } lineFileClose(&lf); return hash; } struct hash *readBed5(char *fileName) /* Read bed and return it as a hash keyed by chromName * with binKeeper values. */ { char *row[5]; struct lineFile *lf = lineFileOpen(fileName, TRUE); struct binKeeper *bk; struct hash *hash = newHash(0); struct hashEl *hel; struct bed5 *bed; while (lineFileRow(lf, row)) { hel = hashLookup(hash, row[0]); if (hel == NULL) { bk = binKeeperNew(0, 511*1024*1024); hel = hashAdd(hash, row[0], bk); } bk = hel->val; AllocVar(bed); bed->chrom = hel->name; bed->start = lineFileNeedNum(lf, row, 1); bed->end = lineFileNeedNum(lf, row, 2); bed->score = lineFileNeedNum(lf, row, 4); if (bed->start > bed->end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); binKeeperAdd(bk, bed->start, bed->end, bed); } lineFileClose(&lf); return hash; } void bedIntersect(char *aFile, char *bFile, char *outFile) /* bedIntersect - Intersect two bed files. */ { struct hash *bHash = NULL; struct lineFile *lf = lineFileOpen(aFile, TRUE); FILE *f = mustOpen(outFile, "w"); char *row[40]; char *name; int start, end, i, wordCount; struct binElement *hitList = NULL, *hit; struct binKeeper *bk; struct bed5 *bed; if (bScore) bHash = readBed5(bFile); else bHash = readBed(bFile); -while ((wordCount = lineFileChop(lf, row)) != 0) +while ((wordCount = (strictTab ? lineFileChopTab(lf, row) : lineFileChop(lf, row))) != 0) { name = row[0]; start = lineFileNeedNum(lf, row, 1); end = lineFileNeedNum(lf, row, 2); if (start > end) errAbort("start after end line %d of %s", lf->lineIx, lf->fileName); bk = hashFindVal(bHash, name); if (bk != NULL) { hitList = binKeeperFind(bk, start, end); if (aHitAny) { if (hitList != NULL) { for (hit = hitList; hit != NULL; hit = hit->next) { float overlap = (float)positiveRangeIntersection(start, end, hit->start, hit->end); if (overlap/(float)(end-start) > aCoverage) { fprintf(f, "%s\t%d\t%d", name, start, end); for (i = 3 ; ival; fprintf(f, "\t%d",bed->score); } else fprintf(f, "\t%s",row[i]); } fprintf(f, "\n"); break; } else { printf("filter out %s %d %d %d %d overlap %.1f %d %d %.3f\n", name,start,end, hit->start,hit->end, overlap, end-start, hit->end-hit->start,overlap/(float)(end-start)); } } } } else { for (hit = hitList; hit != NULL; hit = hit->next) { int s = max(start, hit->start); int e = min(end, hit->end); float overlap = (float)positiveRangeIntersection(start, end, hit->start, hit->end); if ((s < e) && (overlap/(float)(hit->end-hit->start) > aCoverage)) { fprintf(f, "%s\t%d\t%d", name, s, e); for (i = 3 ; ival; fprintf(f, "\t%d",bed->score); } else fprintf(f, "\t%s",row[i]); } fprintf(f, "\n"); } } } slFreeList(&hitList); } } } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); aHitAny = optionExists("aHitAny"); bScore = optionExists("bScore"); aCoverage = optionFloat("aCoverage", aCoverage); +strictTab = optionExists("tab"); if (argc != 4) usage(); bedIntersect(argv[1], argv[2], argv[3]); return 0; }